habanalabs/gaudi: use HBM_ECC_EN bit for ECC ERR

The driver should use ECC info from FW only if HBM ECC CAP is set.
Otherwise, it should try to fetch the data from the MC regs, and only
if security is disabled.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Ohad Sharabi 2021-01-27 15:42:53 +02:00 коммит произвёл Oded Gabbay
Родитель e52606d2f5
Коммит b520ca5d82
1 изменённых файлов: 15 добавлений и 3 удалений

Просмотреть файл

@ -7105,7 +7105,9 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch;
int err = 0;
if (!hdev->asic_prop.fw_security_disabled) {
if (hdev->asic_prop.fw_security_status_valid &&
(hdev->asic_prop.fw_app_security_map &
CPU_BOOT_DEV_STS0_HBM_ECC_EN)) {
if (!hbm_ecc_data) {
dev_err(hdev->dev, "No FW ECC data");
return 0;
@ -7127,14 +7129,24 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
dev_err(hdev->dev,
"HBM%d pc%d ECC: TYPE=%d, WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
device, ch, type, wr_par, rd_par, ca_par, serr, derr);
"HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
device, ch, wr_par, rd_par, ca_par, serr, derr);
dev_err(hdev->dev,
"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%u, SEC_CNT=%d, DEC_CNT=%d\n",
device, ch, hbm_ecc_data->first_addr, type,
hbm_ecc_data->sec_cont_cnt, hbm_ecc_data->sec_cnt,
hbm_ecc_data->dec_cnt);
err = 1;
return 0;
}
if (!hdev->asic_prop.fw_security_disabled) {
dev_info(hdev->dev, "Cannot access MC regs for ECC data while security is enabled\n");
return 0;
}
base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET;
for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) {
val = RREG32_MASK(base + ch * 0x1000 + 0x06C, 0x0000FFFF);