Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] thermal control enhancement for dynamic minimum fan speed and PSU fan speed policy #4403

Merged
merged 7 commits into from
Apr 21, 2020
27 changes: 24 additions & 3 deletions device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"thermal_control_algorithm": {
"run_at_boot_up": "false",
"run_at_boot_up": "true",
"fan_speed_when_suspend": "60"
},
"info_types": [
Expand Down Expand Up @@ -51,6 +51,24 @@
}
]
},
{
"name": "any fan broken",
"conditions": [
{
"type": "fan.any.fault"
}
],
"actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
},
{
"name": "all fan and psu presence",
"conditions": [
Expand All @@ -59,12 +77,15 @@
},
{
"type": "psu.all.presence"
},
{
"type": "fan.all.good"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "60"
"type": "thermal_control.control",
"status": "true"
}
]
}
Expand Down
3 changes: 2 additions & 1 deletion dockers/docker-platform-monitor/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ RUN apt-get update && \
rrdtool \
python-smbus \
ethtool \
dmidecode && \
dmidecode \
i2c-tools && \
pip install enum34

{% if docker_platform_monitor_debs.strip() -%}
Expand Down
12 changes: 10 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
MLNX_NUM_PSU = 2

GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku"
GET_PLATFORM_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.platform"

EEPROM_CACHE_ROOT = '/var/cache/sonic/decode-syseeprom'
EEPROM_CACHE_FILE = 'syseeprom_cache'
Expand Down Expand Up @@ -60,6 +61,7 @@ def __init__(self):

# Initialize SKU name
self.sku_name = self._get_sku_name()
self.platform_name = self._get_platform_name()
mi = get_machine_info()
if mi is not None:
self.name = mi['onie_platform']
Expand Down Expand Up @@ -110,9 +112,9 @@ def initialize_fan(self):

for index in range(num_of_fan):
if multi_rotor_in_drawer:
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
fan = Fan(has_fan_dir, index, index/2, False, self.platform_name)
else:
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
fan = Fan(has_fan_dir, index, index, False, self.platform_name)
self._fan_list.append(fan)


Expand Down Expand Up @@ -245,6 +247,12 @@ def _get_sku_name(self):
return out.rstrip('\n')


def _get_platform_name(self):
p = subprocess.Popen(GET_PLATFORM_CMD, shell=True, stdout=subprocess.PIPE)
out, err = p.communicate()
return out.rstrip('\n')


def _get_port_position_tuple_by_sku_name(self):
position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku_name]]
return position_tuple
Expand Down
101 changes: 101 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
DEVICE_DATA = {
'x86_64-mlnx_msn2700-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:40":13, "41:120":15},
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16},
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}
}
}
},
'x86_64-mlnx_msn2740-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:120":13},
"p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15},
"c2p_trust": {"-127:120":13},
"c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17},
"unk_trust": {"-127:120":13},
"unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17},
}
}
},
'x86_64-mlnx_msn2100-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:120":12},
"p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16},
"c2p_trust": {"-127:40":12, "41:120":13},
"c2p_untrust": {"-127:40":12, "41:120":13},
"unk_trust": {"-127:40":12, "41:120":13},
"unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}
}
}
},
'x86_64-mlnx_msn2410-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:40":13, "41:120":15},
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16},
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}
}
}
},
'x86_64-mlnx_msn2010-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:120":12},
"p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16},
"c2p_trust": {"-127:120":12},
"c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16},
"unk_trust": {"-127:120":12},
"unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16}
}
}
},
'x86_64-mlnx_msn3700-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
"c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
}
}
},
'x86_64-mlnx_msn3700c-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
"c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
}
}
},
'x86_64-mlnx_msn3800-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:35":12, "36:120":13},
"p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17},
"c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14},
"c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16},
"unk_trust": {"-127:30":12, "31:40":13 , "41:120":14},
"unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17},
}
}
},
'x86_64-mlnx_msn4700-r0': {

}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are missing SN4700 which is also supported.

Copy link
Collaborator Author

@Junchao-Mellanox Junchao-Mellanox Apr 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have minimum table for 4700. We set minimum FAN speed to 60% for 4700 or any other platform who has no minimum table.

104 changes: 89 additions & 15 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#############################################################################

import os.path
import subprocess

try:
from sonic_platform_base.fan_base import FanBase
Expand All @@ -22,25 +23,34 @@

FAN_PATH = "/var/run/hw-management/thermal/"
LED_PATH = "/var/run/hw-management/led/"
CONFIG_PATH = "/var/run/hw-management/config"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we have this config path on different files. is this correct to have it this way instead of having it inherited?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is correct. The config path here is a directory and I only need it to find some i2c address related to PSU fan. I don't see difference among differenct platforms.

# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir"
COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state"

# SKUs with unplugable FANs:
# Platforms with unplugable FANs:
# 1. don't have fanX_status and should be treated as always present
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']
platform_with_unplugable_fan = ['x86_64-mlnx_msn2010-r0', 'x86_64-mlnx_msn2100-r0']


class Fan(FanBase):
"""Platform-specific Fan class"""

STATUS_LED_COLOR_ORANGE = "orange"

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
min_cooling_level = 2
MIN_VALID_COOLING_LEVEL = 1
MAX_VALID_COOLING_LEVEL = 10
# PSU fan speed vector
PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c',
'0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64']

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, platform = None):
# API index is starting from 0, Mellanox platform index is starting from 1
self.index = fan_index + 1
self.drawer_index = drawer_index + 1

self.is_psu_fan = psu_fan
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True
self.always_presence = False if platform not in platform_with_unplugable_fan else True

self.fan_min_speed_path = "fan{}_min".format(self.index)
if not self.is_psu_fan:
Expand All @@ -54,6 +64,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index))
self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index))
self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command')

self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index)
Expand Down Expand Up @@ -90,7 +104,7 @@ def get_direction(self):

try:
with open(os.path.join(self.fan_dir), 'r') as fan_dir:
fan_dir_bits = int(fan_dir.read())
fan_dir_bits = int(fan_dir.read().strip())
fan_mask = 1 << self.drawer_index - 1
if fan_dir_bits & fan_mask:
return self.FAN_DIRECTION_INTAKE
Expand All @@ -116,7 +130,7 @@ def get_status(self):
else:
try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read())
status = int(fault_status.read().strip())
except (ValueError, IOError):
status = 1

Expand All @@ -142,7 +156,7 @@ def get_presence(self):
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
status = int(presence_status.read().strip())
except (ValueError, IOError):
status = 0

Expand All @@ -164,7 +178,7 @@ def _get_max_speed_in_rpm(self):
speed = 0
try:
with open(os.path.join(FAN_PATH, self.fan_max_speed_path), 'r') as max_fan_speed:
speed = int(max_fan_speed.read())
speed = int(max_fan_speed.read().strip())
except (ValueError, IOError):
speed = 0

Expand All @@ -181,7 +195,7 @@ def get_speed(self):
speed = 0
try:
with open(os.path.join(FAN_PATH, self.fan_speed_get_path), 'r') as fan_curr_speed:
speed_in_rpm = int(fan_curr_speed.read())
speed_in_rpm = int(fan_curr_speed.read().strip())
except (ValueError, IOError):
speed_in_rpm = 0

Expand Down Expand Up @@ -210,7 +224,7 @@ def get_target_speed(self):

try:
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm:
pwm = int(fan_pwm.read())
pwm = int(fan_pwm.read().strip())
except (ValueError, IOError):
pwm = 0

Expand All @@ -231,13 +245,34 @@ def set_speed(self, speed):
bool: True if set success, False if fail.
"""
status = True
pwm = int(round(PWM_MAX*speed/100.0))

if self.is_psu_fan:
#PSU fan speed is not setable.
return False

from .thermal import logger
try:
with open(self.psu_i2c_bus_path, 'r') as f:
bus = f.read().strip()
with open(self.psu_i2c_addr_path, 'r') as f:
addr = f.read().strip()
with open(self.psu_i2c_command_path, 'r') as f:
command = f.read().strip()
speed = Fan.PSU_FAN_SPEED[int(speed / 10)]
command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed)
subprocess.check_call(command, shell = True)
return True
except subprocess.CalledProcessError as ce:
logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output))
return False
except Exception as e:
logger.log_error('Failed to set PSU FAN speed - {}'.format(e))
return False

try:
cooling_level = int(speed / 10)
if cooling_level < self.min_cooling_level:
cooling_level = self.min_cooling_level
speed = self.min_cooling_level * 10
self.set_cooling_level(cooling_level, cooling_level)
pwm = int(round(PWM_MAX*speed/100.0))
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm:
fan_pwm.write(str(pwm))
except (ValueError, IOError):
Expand Down Expand Up @@ -352,3 +387,42 @@ def get_speed_tolerance(self):
"""
# The tolerance value is fixed as 20% for all the Mellanox platform
return 20

@classmethod
def set_cooling_level(cls, level, cur_state):
"""
Change cooling level. The input level should be an integer value [1, 10].
1 means 10%, 2 means 20%, 10 means 100%.
"""
if not isinstance(level, int):
raise RuntimeError("Failed to set cooling level, input parameter must be integer")

if level < cls.MIN_VALID_COOLING_LEVEL or level > cls.MAX_VALID_COOLING_LEVEL:
raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format(
cls.MIN_VALID_COOLING_LEVEL,
cls.MAX_VALID_COOLING_LEVEL,
level
))

try:
# Reset FAN cooling level vector. According to low level team,
# if we need set cooling level to X, we need first write a (10+X)
# to cooling_cur_state file to reset the cooling level vector.
with open(COOLING_STATE_PATH, 'w') as cooling_state:
cooling_state.write(str(level + 10))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you please clarify? how do you reset with just setting the cooling level?

Copy link
Collaborator Author

@Junchao-Mellanox Junchao-Mellanox Apr 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comment to code to explain this.


# We need set cooling level after resetting the cooling level vector
with open(COOLING_STATE_PATH, 'w') as cooling_state:
cooling_state.write(str(cur_state))
except (ValueError, IOError) as e:
raise RuntimeError("Failed to set cooling level - {}".format(e))

@classmethod
def get_cooling_level(cls):
try:
with open(COOLING_STATE_PATH, 'r') as cooling_state:
cooling_level = int(cooling_state.read().strip())
return cooling_level
except (ValueError, IOError) as e:
raise RuntimeError("Failed to get cooling level - {}".format(e))

Loading