habanalabs/gaudi: fetch HBM ecc info from FW
Once FW security is enabled there is no access to HBM ecc registers, need to read values from FW using a dedicated interface. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Родитель
d611b9f0b1
Коммит
5a2998f46c
|
@ -6839,10 +6839,41 @@ static int gaudi_soft_reset_late_init(struct hl_device *hdev)
|
|||
return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events));
|
||||
}
|
||||
|
||||
static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device)
|
||||
static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
|
||||
struct hl_eq_hbm_ecc_data *hbm_ecc_data)
|
||||
{
|
||||
int ch, err = 0;
|
||||
u32 base, val, val2;
|
||||
u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch;
|
||||
int err = 0;
|
||||
|
||||
if (!hdev->asic_prop.fw_security_disabled) {
|
||||
if (!hbm_ecc_data) {
|
||||
dev_err(hdev->dev, "No FW ECC data");
|
||||
return 0;
|
||||
}
|
||||
|
||||
wr_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
rd_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
ca_par = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
derr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_DERR_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
serr = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_SERR_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
type = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
ch = FIELD_GET(CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK,
|
||||
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
|
||||
|
||||
dev_err(hdev->dev,
|
||||
"HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
|
||||
device, ch, wr_par, rd_par, ca_par, serr, derr);
|
||||
|
||||
err = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET;
|
||||
for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) {
|
||||
|
@ -6858,7 +6889,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device)
|
|||
|
||||
val2 = RREG32(base + ch * 0x1000 + 0x060);
|
||||
dev_err(hdev->dev,
|
||||
"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DED_CNT=%d\n",
|
||||
"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n",
|
||||
device, ch * 2,
|
||||
RREG32(base + ch * 0x1000 + 0x064),
|
||||
(val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10,
|
||||
|
@ -6878,7 +6909,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device)
|
|||
|
||||
val2 = RREG32(base + ch * 0x1000 + 0x070);
|
||||
dev_err(hdev->dev,
|
||||
"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DED_CNT=%d\n",
|
||||
"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%d, SEC_CNT=%d, DEC_CNT=%d\n",
|
||||
device, ch * 2 + 1,
|
||||
RREG32(base + ch * 0x1000 + 0x074),
|
||||
(val2 & 0x200) >> 9, (val2 & 0xFC00) >> 10,
|
||||
|
@ -7079,7 +7110,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
|
|||
case GAUDI_EVENT_HBM3_SPI_0:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_hbm_read_interrupts(hdev,
|
||||
gaudi_hbm_event_to_dev(event_type));
|
||||
gaudi_hbm_event_to_dev(event_type),
|
||||
&eq_entry->hbm_ecc_data);
|
||||
if (hdev->hard_reset_on_fw_events)
|
||||
hl_device_reset(hdev, true, false);
|
||||
break;
|
||||
|
@ -7090,7 +7122,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
|
|||
case GAUDI_EVENT_HBM3_SPI_1:
|
||||
gaudi_print_irq_info(hdev, event_type, false);
|
||||
gaudi_hbm_read_interrupts(hdev,
|
||||
gaudi_hbm_event_to_dev(event_type));
|
||||
gaudi_hbm_event_to_dev(event_type),
|
||||
&eq_entry->hbm_ecc_data);
|
||||
break;
|
||||
|
||||
case GAUDI_EVENT_TPC0_DEC:
|
||||
|
|
|
@ -11,6 +11,37 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/if_ether.h>
|
||||
|
||||
#define NUM_HBM_PSEUDO_CH 2
|
||||
#define NUM_HBM_CH_PER_DEV 8
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_SHIFT 0
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK 0x00000001
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_SHIFT 1
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK 0x00000002
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_SHIFT 2
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK 0x00000004
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_DERR_SHIFT 3
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_DERR_MASK 0x00000008
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_SERR_SHIFT 4
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_SERR_MASK 0x00000010
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_TYPE_SHIFT 5
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK 0x00000020
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT 6
|
||||
#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK 0x000007C0
|
||||
|
||||
struct hl_eq_hbm_ecc_data {
|
||||
/* SERR counter */
|
||||
__le32 sec_cnt;
|
||||
/* DERR counter */
|
||||
__le32 dec_cnt;
|
||||
/* Supplemental Information according to the mask bits */
|
||||
__le32 hbm_ecc_info;
|
||||
/* Address in hbm where the ecc happened */
|
||||
__le32 first_addr;
|
||||
/* SERR continuous address counter */
|
||||
__le32 sec_cont_cnt;
|
||||
__le32 pad;
|
||||
};
|
||||
|
||||
/*
|
||||
* EVENT QUEUE
|
||||
*/
|
||||
|
@ -31,6 +62,7 @@ struct hl_eq_entry {
|
|||
struct hl_eq_header hdr;
|
||||
union {
|
||||
struct hl_eq_ecc_data ecc_data;
|
||||
struct hl_eq_hbm_ecc_data hbm_ecc_data;
|
||||
__le64 data[7];
|
||||
};
|
||||
};
|
||||
|
|
Загрузка…
Ссылка в новой задаче