habanalabs: prevent false heartbeat failure during soft-reset
The heartbeat thread is active during soft-reset, and it tries to send messages to the CPU-CP core. Within the soft-reset, in the time window in which the device is marked as disabled, any CPU-CP command is "silently" skipped and a success value is returned. However, in addition to the return value, the heartbeat function also checks the F/W result, but because no command is sent in this time window, the result variable won't hold the expected value and we will have a false heartbeat failure. To avoid it, modify the "silent" skip to be done only in hard-reset. The CPU-CP should be able to handle messages during soft-reset. In addition to the heartbeat problem, this should also solve other issues in other flows that send messages during soft-reset and use the F/W result as is, without being aware of the reset. Signed-off-by: Tomer Tayar <ttayar@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Родитель
7a78d4d481
Коммит
930feb41ef
|
@ -214,7 +214,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
|
|||
dma_addr_t pkt_dma_addr;
|
||||
struct hl_bd *sent_bd;
|
||||
u32 tmp, expected_ack_val, pi;
|
||||
int rc = 0;
|
||||
int rc;
|
||||
|
||||
pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
|
||||
&pkt_dma_addr);
|
||||
|
@ -228,8 +228,11 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
|
|||
|
||||
mutex_lock(&hdev->send_cpu_message_lock);
|
||||
|
||||
if (hdev->disabled)
|
||||
/* CPU-CP messages can be sent during soft-reset */
|
||||
if (hdev->disabled && !hdev->reset_info.is_in_soft_reset) {
|
||||
rc = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (hdev->device_cpu_disabled) {
|
||||
rc = -EIO;
|
||||
|
|
Загрузка…
Ссылка в новой задаче