Skip to content

Commit

Permalink
RDMA/hns: Add the detection for CMDQ status in the device initializat…
Browse files Browse the repository at this point in the history
…ion process

CMDQ may fail during HNS ROCEE initialization. The following is the log
when the execution fails:

  hns3 0000:bd:00.2: In reset process RoCE client reinit.
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 839
  hns3 0000:bd:00.2 hns_2: failed to set gid, ret = -11!
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 839
  <...>
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 839
  hns3 0000:bd:00.2: CMDQ move tail from 840 to 0
  hns3 0000:bd:00.2: [cmd]token 14e mailbox 20 timeout.
  hns3 0000:bd:00.2 hns_2: set HEM step 0 failed!
  hns3 0000:bd:00.2 hns_2: set HEM address to HW failed!
  hns3 0000:bd:00.2 hns_2: failed to alloc mtpt, ret = -16.
  infiniband hns_2: Couldn't create ib_mad PD
  infiniband hns_2: Couldn't open port 1
  hns3 0000:bd:00.2: Reset done, RoCE client reinit finished.

However, even if ib_mad client registration fails, ib_register_device()
still returns success to the driver.

During device initialization, CMDQ execution fails because the HW/FW is in
an abnormal state. Therefore, if the CMDQ fails, the initialization function
should set the CMDQ to a fatal error state and return a failure to the caller.

Fixes: 9a44353 ("IB/hns: Add driver files for hns RoCE driver")
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Yangyang Li <[email protected]>
Signed-off-by: Wenpeng Liang <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
  • Loading branch information
l00436852 authored and jgunthorpe committed May 5, 2022
1 parent cc377b9 commit e8ea058
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 0 deletions.
6 changes: 6 additions & 0 deletions drivers/infiniband/hw/hns/hns_roce_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,11 @@ struct hns_roce_cmd_context {
u16 busy;
};

/* Health of the command queue, stored in struct hns_roce_cmdq::state. */
enum hns_roce_cmdq_state {
	/* First enumerator (value 0): commands are issued as usual. */
	HNS_ROCE_CMDQ_STATE_NORMAL = 0,
	/*
	 * The command queue is considered unusable; code that checks this
	 * state bails out early instead of talking to the hardware.
	 */
	HNS_ROCE_CMDQ_STATE_FATAL_ERR = 1,
};

struct hns_roce_cmdq {
struct dma_pool *pool;
struct semaphore poll_sem;
Expand All @@ -544,6 +549,7 @@ struct hns_roce_cmdq {
* close device, switch into poll mode(non event mode)
*/
u8 use_events;
enum hns_roce_cmdq_state state;
};

struct hns_roce_cmd_mailbox {
Expand Down
21 changes: 21 additions & 0 deletions drivers/infiniband/hw/hns/hns_roce_hw_v2.c
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,16 @@ static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev)
return tail == priv->cmq.csq.head;
}

/*
 * Flag the command queue as fatally broken when the RoCE instance is still
 * being brought up.
 *
 * The state is switched to HNS_ROCE_CMDQ_STATE_FATAL_ERR only if the hnae3
 * handle reports reset_state == HNS_ROCE_STATE_RST_INIT or
 * instance_state == HNS_ROCE_STATE_INIT; otherwise it is left untouched.
 * This function never moves the state back to NORMAL.
 */
static void update_cmdq_status(struct hns_roce_dev *hr_dev)
{
	struct hns_roce_v2_priv *v2_priv = hr_dev->priv;
	struct hnae3_handle *hnae = v2_priv->handle;

	/* Neither init phase is in progress: nothing to record. */
	if (hnae->rinfo.reset_state != HNS_ROCE_STATE_RST_INIT &&
	    hnae->rinfo.instance_state != HNS_ROCE_STATE_INIT)
		return;

	hr_dev->cmd.state = HNS_ROCE_CMDQ_STATE_FATAL_ERR;
}

static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc, int num)
{
Expand Down Expand Up @@ -1319,6 +1329,8 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
csq->head, tail);
csq->head = tail;

update_cmdq_status(hr_dev);

ret = -EAGAIN;
}

Expand All @@ -1333,6 +1345,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
bool busy;
int ret;

if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR)
return -EIO;

if (!v2_chk_mbox_is_avail(hr_dev, &busy))
return busy ? -EBUSY : 0;

Expand Down Expand Up @@ -1531,6 +1546,9 @@ static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
int ret;
int i;

if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR)
return;

for (i = hr_dev->func_num - 1; i >= 0; i--) {
__hns_roce_function_clear(hr_dev, i);

Expand Down Expand Up @@ -3010,6 +3028,9 @@ static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout,
mb_st = (struct hns_roce_mbox_status *)desc.data;
end = msecs_to_jiffies(timeout) + jiffies;
while (v2_chk_mbox_is_avail(hr_dev, &busy)) {
if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR)
return -EIO;

status = 0;
hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST,
true);
Expand Down

0 comments on commit e8ea058

Please sign in to comment.