diff --git a/platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd b/platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd index 20bd14ffb4c1..b11403b31514 100644 --- a/platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd +++ b/platform/mellanox/mlnx-sfpd/scripts/mlnx-sfpd @@ -30,6 +30,8 @@ STATUS_UNKNOWN = '2' SFPD_LIVENESS_EXPIRE_SECS = 30 +SDK_DAEMON_READY_FILE = '/tmp/sdk_ready' + sfp_value_status_dict = { SDK_SFP_STATE_IN: STATUS_PLUGIN, SDK_SFP_STATE_OUT: STATUS_PLUGOUT, @@ -64,7 +66,8 @@ def log_error(msg, also_print_to_console=False): class MlnxSfpd: ''' Listen to plugin/plugout cable events ''' - SX_OPEN_RETRIES = 20 + SX_OPEN_RETRIES = 30 + SX_OPEN_TIMEOUT = 5 SELECT_TIMEOUT = 1 def __init__(self): @@ -75,7 +78,6 @@ class MlnxSfpd: # Allocate SDK fd and user channel structures self.rx_fd_p = new_sx_fd_t_p() self.user_channel_p = new_sx_user_channel_t_p() - self.state_db = SonicV2Connector(host=REDIS_HOSTIP) # Register our signal handlers @@ -98,37 +100,78 @@ class MlnxSfpd: def initialize(self): self.state_db.connect("STATE_DB") - # open SDK API handle - # retry at most SX_OPEN_RETRIES times to wait - # until SDK is started during system startup - retry = 1 - while True: - rc, self.handle = sx_api_open(None) - if rc == SX_STATUS_SUCCESS: - break - - log_warning("failed to open SDK API handle... retrying {}".format(retry)) + swid_cnt_p = None - time.sleep(2 ** retry) - retry += 1 - - if retry > self.SX_OPEN_RETRIES: - raise RuntimeError("failed to open SDK API handle after {} retries".format(retry)) - - rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p) - if rc != SX_STATUS_SUCCESS: - raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc)) + try: + # Wait for SDK daemon to be started with detect the sdk_ready file + retry = 0 + while not os.path.exists(SDK_DAEMON_READY_FILE): + if retry >= self.SX_OPEN_RETRIES: + raise RuntimeError("SDK daemon failed to start after {} retries and {} seconds waiting, exiting..." + .format(retry, self.SX_OPEN_TIMEOUT * self.SX_OPEN_RETRIES)) + else: + log_info("SDK daemon not started yet, retry {} times".format(retry)) + retry = retry + 1 + time.sleep(self.SX_OPEN_TIMEOUT) - self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD - self.user_channel_p.channel.fd = self.rx_fd_p + # to make sure SDK daemon has started + time.sleep(self.SX_OPEN_TIMEOUT) - rc = sx_api_host_ifc_trap_id_register_set(self.handle, - SX_ACCESS_CMD_REGISTER, - self.swid, - SX_TRAP_ID_PMPE, - self.user_channel_p) - if rc != SX_STATUS_SUCCESS: - raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c)) + # After SDK daemon started, sx_api_open and sx_api_host_ifc_open is ready for call + rc, self.handle = sx_api_open(None) + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("failed to call sx_api_open with rc {}, exiting...".format(rc)) + + rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p) + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("failed to call sx_api_host_ifc_open with rc {}, exiting...".format(rc)) + + self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD + self.user_channel_p.channel.fd = self.rx_fd_p + + # Wait for switch to be created and initialized inside SDK + retry = 0 + swid_cnt_p = new_uint32_t_p() + uint32_t_p_assign(swid_cnt_p, 0) + swid_cnt = 0 + while True: + if retry >= self.SX_OPEN_RETRIES: + raise RuntimeError("switch not created after {} retries and {} seconds waiting, exiting..." + .format(retry, self.SX_OPEN_RETRIES * self.SX_OPEN_TIMEOUT)) + else: + rc = sx_api_port_swid_list_get(self.handle, None, swid_cnt_p) + if rc == SX_STATUS_SUCCESS: + swid_cnt = uint32_t_p_value(swid_cnt_p) + if swid_cnt > 0: + delete_uint32_t_p(swid_cnt_p) + swid_cnt_p = None + break + else: + log_info("switch not created yet, swid_cnt {}, retry {} times and wait for {} seconds" + .format(swid_cnt, retry, self.SX_OPEN_TIMEOUT * retry)) + else: + raise RuntimeError("sx_api_port_swid_list_get fail with rc {}, retry {} times and wait for {} seconds". + format(rc, retry, self.SX_OPEN_TIMEOUT * retry)) + + retry = retry + 1 + time.sleep(self.SX_OPEN_TIMEOUT) + + # After switch was created inside SDK, sx_api_host_ifc_trap_id_register_set is ready to call + rc = sx_api_host_ifc_trap_id_register_set(self.handle, + SX_ACCESS_CMD_REGISTER, + self.swid, + SX_TRAP_ID_PMPE, + self.user_channel_p) + + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("sx_api_host_ifc_trap_id_register_set failed with rc {}, exiting...".format(rc)) + + self.running = True + except Exception as e: + log_error("mlnx-sfpd initialization failed due to {}, exiting...".format(repr(e))) + if swid_cnt_p is not None: + delete_uint32_t_p(swid_cnt_p) + self.deinitialize() def deinitialize(self): # remove mlnx-sfpd liveness key in DB if not expired yet @@ -156,7 +199,6 @@ class MlnxSfpd: log_error("sx_api_close exited with error, rc {}".format(rc)) def run(self): - self.running = True while self.running: try: