amd64_edac: Reorganize error reporting path
Rewrite the CE/UE paths so that they use the same code, and drop the additional code duplication in handle_ue. Add a struct err_info which collects the info required for error reporting. This, in turn, helps slim all edac_mc_handle_error() calls down to one. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
This commit is contained in:
Родитель
c8d1adf092
Коммит
33ca0643c9
|
@ -709,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
|
||||||
|
|
||||||
/* Map the Error address to a PAGE and PAGE OFFSET. */
|
/* Map the Error address to a PAGE and PAGE OFFSET. */
|
||||||
static inline void error_address_to_page_and_offset(u64 error_address,
|
static inline void error_address_to_page_and_offset(u64 error_address,
|
||||||
u32 *page, u32 *offset)
|
struct err_info *err)
|
||||||
{
|
{
|
||||||
*page = (u32) (error_address >> PAGE_SHIFT);
|
err->page = (u32) (error_address >> PAGE_SHIFT);
|
||||||
*offset = ((u32) error_address) & ~PAGE_MASK;
|
err->offset = ((u32) error_address) & ~PAGE_MASK;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1023,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
|
static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
|
||||||
u16 syndrome)
|
struct err_info *err)
|
||||||
{
|
{
|
||||||
struct mem_ctl_info *src_mci;
|
|
||||||
struct amd64_pvt *pvt = mci->pvt_info;
|
struct amd64_pvt *pvt = mci->pvt_info;
|
||||||
int channel, csrow;
|
|
||||||
u32 page, offset;
|
|
||||||
|
|
||||||
error_address_to_page_and_offset(sys_addr, &page, &offset);
|
error_address_to_page_and_offset(sys_addr, err);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find out which node the error address belongs to. This may be
|
* Find out which node the error address belongs to. This may be
|
||||||
* different from the node that detected the error.
|
* different from the node that detected the error.
|
||||||
*/
|
*/
|
||||||
src_mci = find_mc_by_sys_addr(mci, sys_addr);
|
err->src_mci = find_mc_by_sys_addr(mci, sys_addr);
|
||||||
if (!src_mci) {
|
if (!err->src_mci) {
|
||||||
amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
|
amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
|
||||||
(unsigned long)sys_addr);
|
(unsigned long)sys_addr);
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
|
err->err_code = ERR_NODE;
|
||||||
page, offset, syndrome,
|
|
||||||
-1, -1, -1,
|
|
||||||
"failed to map error addr to a node",
|
|
||||||
"");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Now map the sys_addr to a CSROW */
|
/* Now map the sys_addr to a CSROW */
|
||||||
csrow = sys_addr_to_csrow(src_mci, sys_addr);
|
err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr);
|
||||||
if (csrow < 0) {
|
if (err->csrow < 0) {
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
|
err->err_code = ERR_CSROW;
|
||||||
page, offset, syndrome,
|
|
||||||
-1, -1, -1,
|
|
||||||
"failed to map error addr to a csrow",
|
|
||||||
"");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* CHIPKILL enabled */
|
/* CHIPKILL enabled */
|
||||||
if (pvt->nbcfg & NBCFG_CHIPKILL) {
|
if (pvt->nbcfg & NBCFG_CHIPKILL) {
|
||||||
channel = get_channel_from_ecc_syndrome(mci, syndrome);
|
err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
|
||||||
if (channel < 0) {
|
if (err->channel < 0) {
|
||||||
/*
|
/*
|
||||||
* Syndrome didn't map, so we don't know which of the
|
* Syndrome didn't map, so we don't know which of the
|
||||||
* 2 DIMMs is in error. So we need to ID 'both' of them
|
* 2 DIMMs is in error. So we need to ID 'both' of them
|
||||||
* as suspect.
|
* as suspect.
|
||||||
*/
|
*/
|
||||||
amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - "
|
amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - "
|
||||||
"possible error reporting race\n",
|
"possible error reporting race\n",
|
||||||
syndrome);
|
err->syndrome);
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
|
err->err_code = ERR_CHANNEL;
|
||||||
page, offset, syndrome,
|
|
||||||
csrow, -1, -1,
|
|
||||||
"unknown syndrome - possible error reporting race",
|
|
||||||
"");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -1087,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
|
||||||
* was obtained from email communication with someone at AMD.
|
* was obtained from email communication with someone at AMD.
|
||||||
* (Wish the email was placed in this comment - norsk)
|
* (Wish the email was placed in this comment - norsk)
|
||||||
*/
|
*/
|
||||||
channel = ((sys_addr & BIT(3)) != 0);
|
err->channel = ((sys_addr & BIT(3)) != 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1,
|
|
||||||
page, offset, syndrome,
|
|
||||||
csrow, channel, -1,
|
|
||||||
"", "");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ddr2_cs_size(unsigned i, bool dct_width)
|
static int ddr2_cs_size(unsigned i, bool dct_width)
|
||||||
|
@ -1479,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr)
|
||||||
|
|
||||||
/* For a given @dram_range, check if @sys_addr falls within it. */
|
/* For a given @dram_range, check if @sys_addr falls within it. */
|
||||||
static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
|
static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
|
||||||
u64 sys_addr, int *nid, int *chan_sel)
|
u64 sys_addr, int *chan_sel)
|
||||||
{
|
{
|
||||||
int cs_found = -EINVAL;
|
int cs_found = -EINVAL;
|
||||||
u64 chan_addr;
|
u64 chan_addr;
|
||||||
|
@ -1552,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
|
||||||
|
|
||||||
cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel);
|
cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel);
|
||||||
|
|
||||||
if (cs_found >= 0) {
|
if (cs_found >= 0)
|
||||||
*nid = node_id;
|
|
||||||
*chan_sel = channel;
|
*chan_sel = channel;
|
||||||
}
|
|
||||||
return cs_found;
|
return cs_found;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
|
static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
|
||||||
int *node, int *chan_sel)
|
int *chan_sel)
|
||||||
{
|
{
|
||||||
int cs_found = -EINVAL;
|
int cs_found = -EINVAL;
|
||||||
unsigned range;
|
unsigned range;
|
||||||
|
@ -1574,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
|
||||||
(get_dram_limit(pvt, range) >= sys_addr)) {
|
(get_dram_limit(pvt, range) >= sys_addr)) {
|
||||||
|
|
||||||
cs_found = f1x_match_to_this_node(pvt, range,
|
cs_found = f1x_match_to_this_node(pvt, range,
|
||||||
sys_addr, node,
|
sys_addr, chan_sel);
|
||||||
chan_sel);
|
|
||||||
if (cs_found >= 0)
|
if (cs_found >= 0)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1591,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
|
||||||
* (MCX_ADDR).
|
* (MCX_ADDR).
|
||||||
*/
|
*/
|
||||||
static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
|
static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
|
||||||
u16 syndrome)
|
struct err_info *err)
|
||||||
{
|
{
|
||||||
struct amd64_pvt *pvt = mci->pvt_info;
|
struct amd64_pvt *pvt = mci->pvt_info;
|
||||||
u32 page, offset;
|
|
||||||
int nid, csrow, chan = 0;
|
|
||||||
|
|
||||||
error_address_to_page_and_offset(sys_addr, &page, &offset);
|
error_address_to_page_and_offset(sys_addr, err);
|
||||||
|
|
||||||
csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
|
err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel);
|
||||||
|
if (err->csrow < 0) {
|
||||||
if (csrow < 0) {
|
err->err_code = ERR_CSROW;
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
|
|
||||||
page, offset, syndrome,
|
|
||||||
-1, -1, -1,
|
|
||||||
"failed to map error addr to a csrow",
|
|
||||||
"");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1616,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
|
||||||
* this point.
|
* this point.
|
||||||
*/
|
*/
|
||||||
if (dct_ganging_enabled(pvt))
|
if (dct_ganging_enabled(pvt))
|
||||||
chan = get_channel_from_ecc_syndrome(mci, syndrome);
|
err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
|
||||||
|
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
|
|
||||||
page, offset, syndrome,
|
|
||||||
csrow, chan, -1,
|
|
||||||
"", "");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1890,78 +1856,54 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
|
||||||
return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
|
return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err,
|
||||||
* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
|
u8 ecc_type)
|
||||||
* ADDRESS and process.
|
|
||||||
*/
|
|
||||||
static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m)
|
|
||||||
{
|
{
|
||||||
struct amd64_pvt *pvt = mci->pvt_info;
|
enum hw_event_mc_err_type err_type;
|
||||||
u64 sys_addr;
|
const char *string;
|
||||||
u16 syndrome;
|
|
||||||
|
|
||||||
sys_addr = get_error_address(m);
|
if (ecc_type == 2)
|
||||||
syndrome = extract_syndrome(m->status);
|
err_type = HW_EVENT_ERR_CORRECTED;
|
||||||
|
else if (ecc_type == 1)
|
||||||
amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
|
err_type = HW_EVENT_ERR_UNCORRECTED;
|
||||||
|
else {
|
||||||
pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome);
|
WARN(1, "Something is rotten in the state of Denmark.\n");
|
||||||
}
|
|
||||||
|
|
||||||
/* Handle any Un-correctable Errors (UEs) */
|
|
||||||
static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
|
|
||||||
{
|
|
||||||
struct mem_ctl_info *log_mci, *src_mci = NULL;
|
|
||||||
int csrow;
|
|
||||||
u64 sys_addr;
|
|
||||||
u32 page, offset;
|
|
||||||
|
|
||||||
log_mci = mci;
|
|
||||||
|
|
||||||
sys_addr = get_error_address(m);
|
|
||||||
error_address_to_page_and_offset(sys_addr, &page, &offset);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Find out which node the error address belongs to. This may be
|
|
||||||
* different from the node that detected the error.
|
|
||||||
*/
|
|
||||||
src_mci = find_mc_by_sys_addr(mci, sys_addr);
|
|
||||||
if (!src_mci) {
|
|
||||||
amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n",
|
|
||||||
(unsigned long)sys_addr);
|
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
|
|
||||||
page, offset, 0,
|
|
||||||
-1, -1, -1,
|
|
||||||
"ERROR ADDRESS NOT mapped to a MC",
|
|
||||||
"");
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
log_mci = src_mci;
|
switch (err->err_code) {
|
||||||
|
case DECODE_OK:
|
||||||
csrow = sys_addr_to_csrow(log_mci, sys_addr);
|
string = "";
|
||||||
if (csrow < 0) {
|
break;
|
||||||
amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n",
|
case ERR_NODE:
|
||||||
(unsigned long)sys_addr);
|
string = "Failed to map error addr to a node";
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
|
break;
|
||||||
page, offset, 0,
|
case ERR_CSROW:
|
||||||
-1, -1, -1,
|
string = "Failed to map error addr to a csrow";
|
||||||
"ERROR ADDRESS NOT mapped to CS",
|
break;
|
||||||
"");
|
case ERR_CHANNEL:
|
||||||
} else {
|
string = "unknown syndrome - possible error reporting race";
|
||||||
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
|
break;
|
||||||
page, offset, 0,
|
default:
|
||||||
csrow, -1, -1,
|
string = "WTF error";
|
||||||
"", "");
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
edac_mc_handle_error(err_type, mci, 1,
|
||||||
|
err->page, err->offset, err->syndrome,
|
||||||
|
err->csrow, err->channel, -1,
|
||||||
|
string, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
|
static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
|
||||||
struct mce *m)
|
struct mce *m)
|
||||||
{
|
{
|
||||||
|
struct amd64_pvt *pvt = mci->pvt_info;
|
||||||
u8 ecc_type = (m->status >> 45) & 0x3;
|
u8 ecc_type = (m->status >> 45) & 0x3;
|
||||||
u8 xec = XEC(m->status, 0x1f);
|
u8 xec = XEC(m->status, 0x1f);
|
||||||
u16 ec = EC(m->status);
|
u16 ec = EC(m->status);
|
||||||
|
u64 sys_addr;
|
||||||
|
struct err_info err;
|
||||||
|
|
||||||
/* Bail out early if this was an 'observed' error */
|
/* Bail out early if this was an 'observed' error */
|
||||||
if (PP(ec) == NBSL_PP_OBS)
|
if (PP(ec) == NBSL_PP_OBS)
|
||||||
|
@ -1971,10 +1913,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
|
||||||
if (xec && xec != F10_NBSL_EXT_ERR_ECC)
|
if (xec && xec != F10_NBSL_EXT_ERR_ECC)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
memset(&err, 0, sizeof(err));
|
||||||
|
|
||||||
|
sys_addr = get_error_address(m);
|
||||||
|
|
||||||
if (ecc_type == 2)
|
if (ecc_type == 2)
|
||||||
amd64_handle_ce(mci, m);
|
err.syndrome = extract_syndrome(m->status);
|
||||||
else if (ecc_type == 1)
|
|
||||||
amd64_handle_ue(mci, m);
|
pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err);
|
||||||
|
|
||||||
|
__log_bus_error(mci, &err, ecc_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
void amd64_decode_bus_error(int node_id, struct mce *m)
|
void amd64_decode_bus_error(int node_id, struct mce *m)
|
||||||
|
|
|
@ -376,6 +376,23 @@ struct amd64_pvt {
|
||||||
struct error_injection injection;
|
struct error_injection injection;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum err_codes {
|
||||||
|
DECODE_OK = 0,
|
||||||
|
ERR_NODE = -1,
|
||||||
|
ERR_CSROW = -2,
|
||||||
|
ERR_CHANNEL = -3,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct err_info {
|
||||||
|
int err_code;
|
||||||
|
struct mem_ctl_info *src_mci;
|
||||||
|
int csrow;
|
||||||
|
int channel;
|
||||||
|
u16 syndrome;
|
||||||
|
u32 page;
|
||||||
|
u32 offset;
|
||||||
|
};
|
||||||
|
|
||||||
static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
|
static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i)
|
||||||
{
|
{
|
||||||
u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
|
u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
|
||||||
|
@ -449,7 +466,7 @@ static inline void amd64_remove_sysfs_inject_files(struct mem_ctl_info *mci)
|
||||||
struct low_ops {
|
struct low_ops {
|
||||||
int (*early_channel_count) (struct amd64_pvt *pvt);
|
int (*early_channel_count) (struct amd64_pvt *pvt);
|
||||||
void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr,
|
void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr,
|
||||||
u16 syndrome);
|
struct err_info *);
|
||||||
int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, unsigned cs_mode);
|
int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, unsigned cs_mode);
|
||||||
int (*read_dct_pci_cfg) (struct amd64_pvt *pvt, int offset,
|
int (*read_dct_pci_cfg) (struct amd64_pvt *pvt, int offset,
|
||||||
u32 *val, const char *func);
|
u32 *val, const char *func);
|
||||||
|
|
Загрузка…
Ссылка в новой задаче