From e44ef4e4516cce783e95d7221936aa9a4f325ad9 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 16 May 2019 09:49:20 +0300 Subject: [PATCH 01/15] devlink: Hang reporter's dump method on a dumpit cb The devlink health reporter provides a dump method on an error. Dump may contain a large amount of data, in this case doit cb isn't sufficient. This is because the user side is blocking and doesn't allow draining of the socket until the socket runs out of buffers. Using dumpit cb is the correct way to go. Please note that thankfully the dump op is not yet implemented in any driver and therefore this change is not breaking userspace. Fixes: 35455e23e6f3 ("devlink: Add health dump {get,clear} commands") Signed-off-by: Aya Levin Acked-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- net/core/devlink.c | 120 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 21 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index fd15a66c1d2f..4baf716e535e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4518,6 +4518,35 @@ nla_put_failure: return err; } +static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, + struct netlink_callback *cb, + enum devlink_command cmd) +{ + int index = cb->args[0]; + int tmp_index = index; + void *hdr; + int err; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if ((err && err != -EMSGSIZE) || tmp_index == index) + goto nla_put_failure; + + cb->args[0] = index; + genlmsg_end(skb, hdr); + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return err; +} + struct devlink_health_reporter { struct list_head list; void *priv; @@ -4750,17 +4779,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter, EXPORT_SYMBOL_GPL(devlink_health_report); static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) +devlink_health_reporter_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) { struct devlink_health_reporter *reporter; char *reporter_name; - if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) + if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) return NULL; - reporter_name = - nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); + reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); mutex_lock(&devlink->reporters_lock); reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); if (reporter) @@ -4769,6 +4797,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, return reporter; } +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_health_reporter_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_cb(struct netlink_callback *cb) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink; + struct nlattr **attrs; + int err; + + attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return NULL; + + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, DEVLINK_ATTR_MAX, + devlink_nl_family.policy, cb->extack); + if (err) + goto free; + + mutex_lock(&devlink_mutex); + devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + goto unlock; + + reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + mutex_unlock(&devlink_mutex); + kfree(attrs); + return reporter; +unlock: + mutex_unlock(&devlink_mutex); +free: + kfree(attrs); + return NULL; +} + static void devlink_health_reporter_put(struct devlink_health_reporter *reporter) { @@ -5004,32 +5074,40 @@ out: return err; } -static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, - struct genl_info *info) +static int +devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) { - struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; + u64 start = cb->args[0]; int err; - reporter = devlink_health_reporter_get_from_info(devlink, info); + reporter = devlink_health_reporter_get_from_cb(cb); if (!reporter) return -EINVAL; if (!reporter->ops->dump) { - devlink_health_reporter_put(reporter); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto out; + } + mutex_lock(&reporter->dump_lock); + if (!start) { + err = devlink_health_do_dump(reporter, NULL); + if (err) + goto unlock; + cb->args[1] = reporter->dump_ts; + } + if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) { + NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry"); + err = -EAGAIN; + goto unlock; } - mutex_lock(&reporter->dump_lock); - err = devlink_health_do_dump(reporter, NULL); - if (err) - goto out; - - err = devlink_fmsg_snd(reporter->dump_fmsg, info, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0); - -out: + err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); +unlock: mutex_unlock(&reporter->dump_lock); +out: devlink_health_reporter_put(reporter); return err; } @@ -5366,7 +5444,7 @@ static const struct genl_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_dump_get_doit, + .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, From 00091c0da136f36feaa974f761fd0fc8905a29c9 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 11 Jun 2019 17:15:13 -0700 Subject: [PATCH 02/15] Documentation: net: mlx5: Add mlx5 initial documentation Add initial documentation for mlx5 driver. Signed-off-by: Saeed Mahameed --- .../networking/device_drivers/index.rst | 1 + .../device_drivers/mellanox/mlx5.rst | 101 ++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 103 insertions(+) create mode 100644 Documentation/networking/device_drivers/mellanox/mlx5.rst diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index 75fa537763a4..24598d5f8ffa 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -21,6 +21,7 @@ Contents: intel/i40e intel/iavf intel/ice + mellanox/mlx5 .. only:: subproject diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst new file mode 100644 index 000000000000..0be802186d12 --- /dev/null +++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst @@ -0,0 +1,101 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + +================================================= +Mellanox ConnectX(R) mlx5 core VPI Network Driver +================================================= + +Copyright (c) 2019, Mellanox Technologies LTD. + +Contents +======== + +- `Enabling the driver and kconfig options`_ + +Enabling the driver and kconfig options +================================================ + +| mlx5 core is modular and most of the major mlx5 core driver features can be selected (compiled in/out) +| at build time via kernel Kconfig flags. +| Basic features, ethernet net device rx/tx offloads and XDP, are available with the most basic flags +| CONFIG_MLX5_CORE=y/m and CONFIG_MLX5_CORE_EN=y. +| For the list of advanced features please see below. + +**CONFIG_MLX5_CORE=(y/m/n)** (module mlx5_core.ko) + +| The driver can be enabled by choosing CONFIG_MLX5_CORE=y/m in kernel config. +| This will provide mlx5 core driver for mlx5 ulps to interface with (mlx5e, mlx5_ib). + + +**CONFIG_MLX5_CORE_EN=(y/n)** + +| Choosing this option will allow basic ethernet netdevice support with all of the standard rx/tx offloads. +| mlx5e is the mlx5 ulp driver which provides netdevice kernel interface, when chosen, mlx5e will be +| built-in into mlx5_core.ko. + + +**CONFIG_MLX5_EN_ARFS=(y/n)** + +| Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering. +| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4 + + +**CONFIG_MLX5_EN_RXNFC=(y/n)** + +| Enables ethtool receive network flow classification, which allows user defined +| flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc API. + + +**CONFIG_MLX5_CORE_EN_DCB=(y/n)**: + +| Enables `Data Center Bridging (DCB) Support `_. + + +**CONFIG_MLX5_MPFS=(y/n)** + +| Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC. +| MPFs is required for when `Multi-Host `_ configuration is enabled to allow passing +| user configured unicast MAC addresses to the requesting PF. + + +**CONFIG_MLX5_ESWITCH=(y/n)** + +| Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering +| and switching for the enabled VFs and PF in two available modes: +| 1) `Legacy SRIOV mode (L2 mac vlan steering based) `_. +| 2) `Switchdev mode (eswitch offloads) `_. + + +**CONFIG_MLX5_CORE_IPOIB=(y/n)** + +| IPoIB offloads & acceleration support. +| Requires CONFIG_MLX5_CORE_EN to provide an accelerated interface for the rdma +| IPoIB ulp netdevice. + + +**CONFIG_MLX5_FPGA=(y/n)** + +| Build support for the Innova family of network cards by Mellanox Technologies. +| Innova network cards are comprised of a ConnectX chip and an FPGA chip on one board. +| If you select this option, the mlx5_core driver will include the Innova FPGA core and allow +| building sandbox-specific client drivers. + + +**CONFIG_MLX5_EN_IPSEC=(y/n)** + +| Enables `IPSec XFRM cryptography-offload accelaration `_. + +**CONFIG_MLX5_EN_TLS=(y/n)** + +| TLS cryptography-offload accelaration. + + +**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko) + +| Provides low-level InfiniBand/RDMA and `RoCE `_ support. + + +**External options** ( Choose if the corresponding mlx5 feature is required ) + +- CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled +- CONFIG_VXLAN: When chosen, mlx5 vxaln support will be enabled. +- CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool). diff --git a/MAINTAINERS b/MAINTAINERS index b70d5d5716a9..863b33cd30e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10108,6 +10108,7 @@ Q: http://patchwork.ozlabs.org/project/netdev/list/ S: Supported F: drivers/net/ethernet/mellanox/mlx5/core/ F: include/linux/mlx5/ +F: Documentation/networking/device_drivers/mellanox/ MELLANOX MLX5 IB driver M: Leon Romanovsky From 1f28d7768f1d181249318a0f61949d4cb412e8c4 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 11 Dec 2018 16:09:51 +0200 Subject: [PATCH 03/15] net/mlx5: Move all devlink related functions calls to devlink.c Centralize all devlink related callbacks in one file. In the downstream patch, some more functionality will be added, this patch is preparing the driver infrastructure for it. Currently, move devlink un/register functions calls into this file. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../net/ethernet/mellanox/mlx5/core/devlink.c | 58 +++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/devlink.h | 14 +++++ .../net/ethernet/mellanox/mlx5/core/main.c | 44 +++----------- 4 files changed, 80 insertions(+), 38 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/devlink.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/devlink.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index d9d363fe5cf7..9006fda6bd11 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -15,7 +15,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o alloc.o qp.o port.o mr.o pd.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ - lib/devcom.o diag/fs_tracepoint.o diag/fw_tracer.o + lib/devcom.o diag/fs_tracepoint.o diag/fw_tracer.o devlink.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c new file mode 100644 index 000000000000..ed4202e883f0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include + +#include "mlx5_core.h" +#include "eswitch.h" + +static int mlx5_devlink_flash_update(struct devlink *devlink, + const char *file_name, + const char *component, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + const struct firmware *fw; + int err; + + if (component) + return -EOPNOTSUPP; + + err = request_firmware_direct(&fw, file_name, &dev->pdev->dev); + if (err) + return err; + + return mlx5_firmware_flash(dev, fw, extack); +} + +static const struct devlink_ops mlx5_devlink_ops = { +#ifdef CONFIG_MLX5_ESWITCH + .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, + .eswitch_mode_get = mlx5_devlink_eswitch_mode_get, + .eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set, + .eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get, + .eswitch_encap_mode_set = mlx5_devlink_eswitch_encap_mode_set, + .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, +#endif + .flash_update = mlx5_devlink_flash_update, +}; + +struct devlink *mlx5_devlink_alloc() +{ + return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev)); +} + +void mlx5_devlink_free(struct devlink *devlink) +{ + devlink_free(devlink); +} + +int mlx5_devlink_register(struct devlink *devlink, struct device *dev) +{ + return devlink_register(devlink, dev); +} + +void mlx5_devlink_unregister(struct devlink *devlink) +{ + devlink_unregister(devlink); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h new file mode 100644 index 000000000000..d0ba03774ddf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019, Mellanox Technologies */ + +#ifndef __MLX5_DEVLINK_H__ +#define __MLX5_DEVLINK_H__ + +#include + +struct devlink *mlx5_devlink_alloc(void); +void mlx5_devlink_free(struct devlink *devlink); +int mlx5_devlink_register(struct devlink *devlink, struct device *dev); +void mlx5_devlink_unregister(struct devlink *devlink); + +#endif /* __MLX5_DEVLINK_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 7ec135eaabc6..5ea141893b99 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -56,6 +56,7 @@ #include "fs_core.h" #include "lib/mpfs.h" #include "eswitch.h" +#include "devlink.h" #include "lib/mlx5.h" #include "fpga/core.h" #include "fpga/ipsec.h" @@ -1214,37 +1215,6 @@ out: return err; } -static int mlx5_devlink_flash_update(struct devlink *devlink, - const char *file_name, - const char *component, - struct netlink_ext_ack *extack) -{ - struct mlx5_core_dev *dev = devlink_priv(devlink); - const struct firmware *fw; - int err; - - if (component) - return -EOPNOTSUPP; - - err = request_firmware_direct(&fw, file_name, &dev->pdev->dev); - if (err) - return err; - - return mlx5_firmware_flash(dev, fw, extack); -} - -static const struct devlink_ops mlx5_devlink_ops = { -#ifdef CONFIG_MLX5_ESWITCH - .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, - .eswitch_mode_get = mlx5_devlink_eswitch_mode_get, - .eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set, - .eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get, - .eswitch_encap_mode_set = mlx5_devlink_eswitch_encap_mode_set, - .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, -#endif - .flash_update = mlx5_devlink_flash_update, -}; - static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) { struct mlx5_priv *priv = &dev->priv; @@ -1306,9 +1276,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) struct devlink *devlink; int err; - devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev)); + devlink = mlx5_devlink_alloc(); if (!devlink) { - dev_err(&pdev->dev, "kzalloc failed\n"); + dev_err(&pdev->dev, "devlink alloc failed\n"); return -ENOMEM; } @@ -1336,7 +1306,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) request_module_nowait(MLX5_IB_MOD); - err = devlink_register(devlink, &pdev->dev); + err = mlx5_devlink_register(devlink, &pdev->dev); if (err) goto clean_load; @@ -1351,7 +1321,7 @@ err_load_one: pci_init_err: mlx5_mdev_uninit(dev); mdev_init_err: - devlink_free(devlink); + mlx5_devlink_free(devlink); return err; } @@ -1361,7 +1331,7 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct devlink *devlink = priv_to_devlink(dev); - devlink_unregister(devlink); + mlx5_devlink_unregister(devlink); mlx5_unregister_device(dev); if (mlx5_unload_one(dev, true)) { @@ -1372,7 +1342,7 @@ static void remove_one(struct pci_dev *pdev) mlx5_pci_close(dev); mlx5_mdev_uninit(dev); - devlink_free(devlink); + mlx5_devlink_free(devlink); } static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, From b25bbc2f24dcab9cd186ef4003c39bf51ad0454c Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 28 Jun 2018 15:05:58 +0300 Subject: [PATCH 04/15] net/mlx5: Add Vendor Specific Capability access gateway The Vendor Specific Capability (VSC) is used to activate a gateway interfacing with the device. The gateway is used to read or write device configurations, which are organized in different domains (spaces). A configuration access may result in multiple actions, reads, writes. Example usages are accessing the Crspace domain to read the crspace or locking a device semaphore using the Semaphore domain. The configuration access use pci_cfg_access to prevent parallel access to the VSC space by the driver and userspace calls. Signed-off-by: Alex Vesker Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 3 +- .../ethernet/mellanox/mlx5/core/lib/pci_vsc.c | 286 ++++++++++++++++++ .../ethernet/mellanox/mlx5/core/lib/pci_vsc.h | 24 ++ .../net/ethernet/mellanox/mlx5/core/main.c | 3 + include/linux/mlx5/driver.h | 1 + 5 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 9006fda6bd11..8e07354faea1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -15,7 +15,8 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o alloc.o qp.o port.o mr.o pd.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ - lib/devcom.o diag/fs_tracepoint.o diag/fw_tracer.o devlink.o + lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \ + diag/fw_tracer.o devlink.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c new file mode 100644 index 000000000000..a27b0119b3d6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include +#include "mlx5_core.h" +#include "pci_vsc.h" + +#define MLX5_EXTRACT_C(source, offset, size) \ + ((((u32)(source)) >> (offset)) & MLX5_ONES32(size)) +#define MLX5_EXTRACT(src, start, len) \ + (((len) == 32) ? (src) : MLX5_EXTRACT_C(src, start, len)) +#define MLX5_ONES32(size) \ + ((size) ? (0xffffffff >> (32 - (size))) : 0) +#define MLX5_MASK32(offset, size) \ + (MLX5_ONES32(size) << (offset)) +#define MLX5_MERGE_C(rsrc1, rsrc2, start, len) \ + ((((rsrc2) << (start)) & (MLX5_MASK32((start), (len)))) | \ + ((rsrc1) & (~MLX5_MASK32((start), (len))))) +#define MLX5_MERGE(rsrc1, rsrc2, start, len) \ + (((len) == 32) ? (rsrc2) : MLX5_MERGE_C(rsrc1, rsrc2, start, len)) +#define vsc_read(dev, offset, val) \ + pci_read_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val)) +#define vsc_write(dev, offset, val) \ + pci_write_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val)) +#define VSC_MAX_RETRIES 2048 + +enum mlx5_vsc_state { + MLX5_VSC_UNLOCK, + MLX5_VSC_LOCK, +}; + +enum { + VSC_CTRL_OFFSET = 0x4, + VSC_COUNTER_OFFSET = 0x8, + VSC_SEMAPHORE_OFFSET = 0xc, + VSC_ADDR_OFFSET = 0x10, + VSC_DATA_OFFSET = 0x14, + + VSC_FLAG_BIT_OFFS = 31, + VSC_FLAG_BIT_LEN = 1, + + VSC_SYND_BIT_OFFS = 30, + VSC_SYND_BIT_LEN = 1, + + VSC_ADDR_BIT_OFFS = 0, + VSC_ADDR_BIT_LEN = 30, + + VSC_SPACE_BIT_OFFS = 0, + VSC_SPACE_BIT_LEN = 16, + + VSC_SIZE_VLD_BIT_OFFS = 28, + VSC_SIZE_VLD_BIT_LEN = 1, + + VSC_STATUS_BIT_OFFS = 29, + VSC_STATUS_BIT_LEN = 3, +}; + +void mlx5_pci_vsc_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_pf(dev)) + return; + + dev->vsc_addr = pci_find_capability(dev->pdev, + PCI_CAP_ID_VNDR); + if (!dev->vsc_addr) + mlx5_core_warn(dev, "Failed to get valid vendor specific ID\n"); +} + +int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev) +{ + u32 counter = 0; + int retries = 0; + u32 lock_val; + int ret; + + pci_cfg_access_lock(dev->pdev); + do { + if (retries > VSC_MAX_RETRIES) { + ret = -EBUSY; + goto pci_unlock; + } + + /* Check if semaphore is already locked */ + ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val); + if (ret) + goto pci_unlock; + + if (lock_val) { + retries++; + usleep_range(1000, 2000); + continue; + } + + /* Read and write counter value, if written value is + * the same, semaphore was acquired successfully. + */ + ret = vsc_read(dev, VSC_COUNTER_OFFSET, &counter); + if (ret) + goto pci_unlock; + + ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, counter); + if (ret) + goto pci_unlock; + + ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val); + if (ret) + goto pci_unlock; + + retries++; + } while (counter != lock_val); + + return 0; + +pci_unlock: + pci_cfg_access_unlock(dev->pdev); + return ret; +} + +int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev) +{ + int ret; + + ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, MLX5_VSC_UNLOCK); + pci_cfg_access_unlock(dev->pdev); + return ret; +} + +int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space, + u32 *ret_space_size) +{ + int ret; + u32 val = 0; + + if (!mlx5_vsc_accessible(dev)) + return -EINVAL; + + if (ret_space_size) + *ret_space_size = 0; + + /* Get a unique val */ + ret = vsc_read(dev, VSC_CTRL_OFFSET, &val); + if (ret) + goto out; + + /* Try to modify the lock */ + val = MLX5_MERGE(val, space, VSC_SPACE_BIT_OFFS, VSC_SPACE_BIT_LEN); + ret = vsc_write(dev, VSC_CTRL_OFFSET, val); + if (ret) + goto out; + + /* Verify lock was modified */ + ret = vsc_read(dev, VSC_CTRL_OFFSET, &val); + if (ret) + goto out; + + if (MLX5_EXTRACT(val, VSC_STATUS_BIT_OFFS, VSC_STATUS_BIT_LEN) == 0) + return -EINVAL; + + /* Get space max address if indicated by size valid bit */ + if (ret_space_size && + MLX5_EXTRACT(val, VSC_SIZE_VLD_BIT_OFFS, VSC_SIZE_VLD_BIT_LEN)) { + ret = vsc_read(dev, VSC_ADDR_OFFSET, &val); + if (ret) { + mlx5_core_warn(dev, "Failed to get max space size\n"); + goto out; + } + *ret_space_size = MLX5_EXTRACT(val, VSC_ADDR_BIT_OFFS, + VSC_ADDR_BIT_LEN); + } + return 0; + +out: + return ret; +} + +static int mlx5_vsc_wait_on_flag(struct mlx5_core_dev *dev, u8 expected_val) +{ + int retries = 0; + u32 flag; + int ret; + + do { + if (retries > VSC_MAX_RETRIES) + return -EBUSY; + + ret = vsc_read(dev, VSC_ADDR_OFFSET, &flag); + if (ret) + return ret; + flag = MLX5_EXTRACT(flag, VSC_FLAG_BIT_OFFS, VSC_FLAG_BIT_LEN); + retries++; + + if ((retries & 0xf) == 0) + usleep_range(1000, 2000); + + } while (flag != expected_val); + + return 0; +} + +static int mlx5_vsc_gw_write(struct mlx5_core_dev *dev, unsigned int address, + u32 data) +{ + int ret; + + if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS, + VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN)) + return -EINVAL; + + /* Set flag to 0x1 */ + address = MLX5_MERGE(address, 1, VSC_FLAG_BIT_OFFS, 1); + ret = vsc_write(dev, VSC_DATA_OFFSET, data); + if (ret) + goto out; + + ret = vsc_write(dev, VSC_ADDR_OFFSET, address); + if (ret) + goto out; + + /* Wait for the flag to be cleared */ + ret = mlx5_vsc_wait_on_flag(dev, 0); + +out: + return ret; +} + +static int mlx5_vsc_gw_read(struct mlx5_core_dev *dev, unsigned int address, + u32 *data) +{ + int ret; + + if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS, + VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN)) + return -EINVAL; + + ret = vsc_write(dev, VSC_ADDR_OFFSET, address); + if (ret) + goto out; + + ret = mlx5_vsc_wait_on_flag(dev, 1); + if (ret) + goto out; + + ret = vsc_read(dev, VSC_DATA_OFFSET, data); +out: + return ret; +} + +static int mlx5_vsc_gw_read_fast(struct mlx5_core_dev *dev, + unsigned int read_addr, + unsigned int *next_read_addr, + u32 *data) +{ + int ret; + + ret = mlx5_vsc_gw_read(dev, read_addr, data); + if (ret) + goto out; + + ret = vsc_read(dev, VSC_ADDR_OFFSET, next_read_addr); + if (ret) + goto out; + + *next_read_addr = MLX5_EXTRACT(*next_read_addr, VSC_ADDR_BIT_OFFS, + VSC_ADDR_BIT_LEN); + + if (*next_read_addr <= read_addr) + ret = -EINVAL; +out: + return ret; +} + +int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data, + int length) +{ + unsigned int next_read_addr = 0; + unsigned int read_addr = 0; + + while (read_addr < length) { + if (mlx5_vsc_gw_read_fast(dev, read_addr, &next_read_addr, + &data[(read_addr >> 2)])) + return read_addr; + + read_addr = next_read_addr; + } + return length; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h new file mode 100644 index 000000000000..28ea6bfa439f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies */ + +#ifndef __MLX5_PCI_VSC_H__ +#define __MLX5_PCI_VSC_H__ + +enum { + MLX5_VSC_SPACE_SCAN_CRSPACE = 0x7, +}; + +void mlx5_pci_vsc_init(struct mlx5_core_dev *dev); +int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev); +int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev); +int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space, + u32 *ret_space_size); +int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data, + int length); + +static inline bool mlx5_vsc_accessible(struct mlx5_core_dev *dev) +{ + return !!dev->vsc_addr; +} + +#endif /* __MLX5_PCI_VSC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 5ea141893b99..3adc09a1a312 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -66,6 +66,7 @@ #include "lib/vxlan.h" #include "lib/geneve.h" #include "lib/devcom.h" +#include "lib/pci_vsc.h" #include "diag/fw_tracer.h" #include "ecpf.h" @@ -763,6 +764,8 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, goto err_clr_master; } + mlx5_pci_vsc_init(dev); + return 0; err_clr_master: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3a810bf043fe..f732445bcbdb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -693,6 +693,7 @@ struct mlx5_core_dev { struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; + u32 vsc_addr; }; struct mlx5_db { From 8b9d8baae1de7400f19058020ee8f0f27d436687 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Tue, 17 Jul 2018 11:18:26 +0300 Subject: [PATCH 05/15] net/mlx5: Add Crdump support Crdump allows the driver to retrieve a dump of the FW PCI crspace. This is useful in case of catastrophic issues which may require FW reset. The crspace dump can be used for later debug. Signed-off-by: Alex Vesker Signed-off-by: Moshe Shemesh Reviewed-by: Feras Daoud Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../ethernet/mellanox/mlx5/core/diag/crdump.c | 106 ++++++++++++++++++ .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 3 + .../net/ethernet/mellanox/mlx5/core/main.c | 5 + include/linux/mlx5/driver.h | 1 + 5 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 8e07354faea1..5fe2bf916c06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \ - diag/fw_tracer.o devlink.o + diag/fw_tracer.o diag/crdump.o devlink.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c new file mode 100644 index 000000000000..dfb34172c69b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include +#include "mlx5_core.h" +#include "lib/pci_vsc.h" +#include "lib/mlx5.h" + +#define BAD_ACCESS 0xBADACCE5 +#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7 + +static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev) +{ + return !!dev->priv.health.crdump_size; +} + +static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data) +{ + u32 crdump_size = dev->priv.health.crdump_size; + int i, ret; + + for (i = 0; i < (crdump_size / 4); i++) + cr_data[i] = BAD_ACCESS; + + ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump_size); + if (ret <= 0) { + if (ret == 0) + return -EIO; + return ret; + } + + if (crdump_size != ret) { + mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n", + ret, crdump_size); + return -EINVAL; + } + + return 0; +} + +int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data) +{ + int ret; + + if (!mlx5_crdump_enabled(dev)) + return -ENODEV; + + ret = mlx5_vsc_gw_lock(dev); + if (ret) { + mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n", + ret); + return ret; + } + + ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL); + if (ret) + goto unlock; + + ret = mlx5_crdump_fill(dev, cr_data); + +unlock: + mlx5_vsc_gw_unlock(dev); + return ret; +} + +int mlx5_crdump_enable(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + u32 space_size; + int ret; + + if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) || + mlx5_crdump_enabled(dev)) + return 0; + + ret = mlx5_vsc_gw_lock(dev); + if (ret) + return ret; + + /* Check if space is supported and get space size */ + ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, + &space_size); + if (ret) { + /* Unlock and mask error since space is not supported */ + mlx5_vsc_gw_unlock(dev); + return 0; + } + + if (!space_size) { + mlx5_core_warn(dev, "Invalid Crspace size, zero\n"); + mlx5_vsc_gw_unlock(dev); + return -EINVAL; + } + + ret = mlx5_vsc_gw_unlock(dev); + if (ret) + return ret; + + priv->health.crdump_size = space_size; + return 0; +} + +void mlx5_crdump_disable(struct mlx5_core_dev *dev) +{ + dev->priv.health.crdump_size = 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index 397a2847867a..d918e44491f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -41,6 +41,9 @@ int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count); void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count); int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index); void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index); +int mlx5_crdump_enable(struct mlx5_core_dev *dev); +void mlx5_crdump_disable(struct mlx5_core_dev *dev); +int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); /* TODO move to lib/events.h */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 3adc09a1a312..c70e97071b87 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1313,6 +1313,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto clean_load; + err = mlx5_crdump_enable(dev); + if (err) + dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); + pci_save_state(pdev); return 0; @@ -1334,6 +1338,7 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct devlink *devlink = priv_to_devlink(dev); + mlx5_crdump_disable(dev); mlx5_devlink_unregister(devlink); mlx5_unregister_device(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f732445bcbdb..4ae533b3da07 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -435,6 +435,7 @@ struct mlx5_core_health { u32 prev; int miss_counter; bool sick; + u32 crdump_size; /* wq spinlock to synchronize draining */ spinlock_t wq_lock; struct workqueue_struct *wq; From 63cbc552eebf08818af2025aef4589a48ef849c0 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 12 Nov 2018 15:23:02 +0200 Subject: [PATCH 06/15] net/mlx5: Handle SW reset of FW in error flow New mlx5 adapters allow the driver to reset the FW in the event of an error, this action called "SW Reset". When an SW reset is issued on any PF all PFs enter reset state which is a recoverable condition. The existing recovery flow was designed to allow the recovery of a VF after a PF driver reload. This patch adds the sw reset to the NIC states as a preparation for sw reset handling. When a software reset is issued the following occurs: 1. The NIC interface mode is set to 7 while the reset is in progress. 2. Once the reset completes the NIC interface mode is set to 1. Signed-off-by: Feras Daoud Signed-off-by: Moshe Shemesh Signed-off-by: Daniel Jurgens Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_selftest.c | 2 +- .../net/ethernet/mellanox/mlx5/core/health.c | 105 ++++++++---------- .../net/ethernet/mellanox/mlx5/core/main.c | 2 +- .../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +- include/linux/mlx5/driver.h | 2 +- 5 files changed, 48 insertions(+), 65 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 4382ef85488c..840ec945ccba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -64,7 +64,7 @@ static int mlx5e_test_health_info(struct mlx5e_priv *priv) { struct mlx5_core_health *health = &priv->mdev->priv.health; - return health->sick ? 1 : 0; + return health->fatal_error ? 1 : 0; } static int mlx5e_test_link_state(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index a2656f4008d9..737e6d550775 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -62,12 +62,18 @@ enum { enum { MLX5_DROP_NEW_HEALTH_WORK, - MLX5_DROP_NEW_RECOVERY_WORK, +}; + +enum { + MLX5_SENSOR_NO_ERR = 0, + MLX5_SENSOR_PCI_COMM_ERR = 1, + MLX5_SENSOR_NIC_DISABLED = 2, + MLX5_SENSOR_NIC_SW_RESET = 3, }; u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) { - return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; } void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) @@ -80,18 +86,25 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) &dev->iseg->cmdq_addr_l_sz); } -static int in_fatal(struct mlx5_core_dev *dev) +static bool sensor_pci_not_working(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; + /* Offline PCI reads return 0xffffffff */ + return (ioread32be(&h->fw_ver) == 0xffffffff); +} + +static u32 check_fatal_sensors(struct mlx5_core_dev *dev) +{ + if (sensor_pci_not_working(dev)) + return MLX5_SENSOR_PCI_COMM_ERR; if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) - return 1; + return MLX5_SENSOR_NIC_DISABLED; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) + return MLX5_SENSOR_NIC_SW_RESET; - if (ioread32be(&h->fw_ver) == 0xffffffff) - return 1; - - return 0; + return MLX5_SENSOR_NO_ERR; } void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) @@ -101,7 +114,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) goto unlock; mlx5_core_err(dev, "start\n"); - if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) { + if (pci_channel_offline(dev->pdev) || + dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) { dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_cmd_flush(dev); } @@ -137,38 +151,14 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) mlx5_disable_device(dev); } -static void health_recover(struct work_struct *work) -{ - struct mlx5_core_health *health; - struct delayed_work *dwork; - struct mlx5_core_dev *dev; - struct mlx5_priv *priv; - u8 nic_state; - - dwork = container_of(work, struct delayed_work, work); - health = container_of(dwork, struct mlx5_core_health, recover_work); - priv = container_of(health, struct mlx5_priv, health); - dev = container_of(priv, struct mlx5_core_dev, priv); - - nic_state = mlx5_get_nic_state(dev); - if (nic_state == MLX5_NIC_IFC_INVALID) { - mlx5_core_err(dev, "health recovery flow aborted since the nic state is invalid\n"); - return; - } - - mlx5_core_err(dev, "starting health recovery flow\n"); - mlx5_recover_device(dev); -} - /* How much time to wait until health resetting the driver (in msecs) */ -#define MLX5_RECOVERY_DELAY_MSECS 60000 +#define MLX5_RECOVERY_WAIT_MSECS 60000 static void health_care(struct work_struct *work) { - unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); struct mlx5_core_health *health; struct mlx5_core_dev *dev; struct mlx5_priv *priv; - unsigned long flags; + unsigned long end; health = container_of(work, struct mlx5_core_health, work); priv = container_of(health, struct mlx5_priv, health); @@ -176,13 +166,18 @@ static void health_care(struct work_struct *work) mlx5_core_warn(dev, "handling bad device here\n"); mlx5_handle_bad_state(dev); - spin_lock_irqsave(&health->wq_lock, flags); - if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) - schedule_delayed_work(&health->recover_work, recover_delay); - else - mlx5_core_err(dev, - "new health works are not permitted at this stage\n"); - spin_unlock_irqrestore(&health->wq_lock, flags); + end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS); + while (sensor_pci_not_working(dev)) { + if (time_after(jiffies, end)) { + mlx5_core_err(dev, + "health recovery flow aborted, PCI reads still not working\n"); + return; + } + msleep(100); + } + + mlx5_core_err(dev, "starting health recovery flow\n"); + mlx5_recover_device(dev); } static const char *hsynd_str(u8 synd) @@ -274,6 +269,7 @@ static void poll_health(struct timer_list *t) { struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); struct mlx5_core_health *health = &dev->priv.health; + u32 fatal_error; u32 count; if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) @@ -291,8 +287,11 @@ static void poll_health(struct timer_list *t) print_health_info(dev); } - if (in_fatal(dev) && !health->sick) { - health->sick = true; + fatal_error = check_fatal_sensors(dev); + + if (fatal_error && !health->fatal_error) { + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); + dev->priv.health.fatal_error = fatal_error; print_health_info(dev); mlx5_trigger_health_work(dev); } @@ -306,9 +305,8 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; timer_setup(&health->timer, poll_health, 0); - health->sick = 0; + health->fatal_error = MLX5_SENSOR_NO_ERR; clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); - clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); health->health = &dev->iseg->health; health->health_counter = &dev->iseg->health_counter; @@ -324,7 +322,6 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) if (disable_health) { spin_lock_irqsave(&health->wq_lock, flags); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); spin_unlock_irqrestore(&health->wq_lock, flags); } @@ -338,23 +335,10 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) spin_lock_irqsave(&health->wq_lock, flags); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); spin_unlock_irqrestore(&health->wq_lock, flags); - cancel_delayed_work_sync(&health->recover_work); cancel_work_sync(&health->work); } -void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) -{ - struct mlx5_core_health *health = &dev->priv.health; - unsigned long flags; - - spin_lock_irqsave(&health->wq_lock, flags); - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); - spin_unlock_irqrestore(&health->wq_lock, flags); - cancel_delayed_work_sync(&dev->priv.health.recover_work); -} - void mlx5_health_flush(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; @@ -387,7 +371,6 @@ int mlx5_health_init(struct mlx5_core_dev *dev) return -ENOMEM; spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); - INIT_DELAYED_WORK(&health->recover_work, health_recover); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c70e97071b87..fd0e2949c4f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1191,7 +1191,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup) int err = 0; if (cleanup) - mlx5_drain_health_recovery(dev); + mlx5_drain_health_wq(dev); mutex_lock(&dev->intf_state_mutex); if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index d4dd8c1ae55c..97f8cf67ced0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -214,7 +214,7 @@ enum { MLX5_NIC_IFC_FULL = 0, MLX5_NIC_IFC_DISABLED = 1, MLX5_NIC_IFC_NO_DRAM_NIC = 2, - MLX5_NIC_IFC_INVALID = 3 + MLX5_NIC_IFC_SW_RESET = 7 }; u8 mlx5_get_nic_state(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 4ae533b3da07..cc7fd8e62844 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -435,6 +435,7 @@ struct mlx5_core_health { u32 prev; int miss_counter; bool sick; + u32 fatal_error; u32 crdump_size; /* wq spinlock to synchronize draining */ spinlock_t wq_lock; @@ -906,7 +907,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); -void mlx5_drain_health_recovery(struct mlx5_core_dev *dev); int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_frag_buf *buf, int node); int mlx5_buf_alloc(struct mlx5_core_dev *dev, From 1ef6f1a17e56f9126472d2c50818f468f1fc43d2 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Sun, 2 Dec 2018 14:41:22 +0200 Subject: [PATCH 07/15] net/mlx5: Control CR-space access by different PFs Since the FW can be shared between different PFs/VFs it is common that more than one health poll will detected a failure, this can lead to multiple resets which are unneeded. The solution is to use a FW locking mechanism using semaphore space to provide a way to allow only one device to collect the cr-dump and to issue a sw-reset. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/lib/pci_vsc.c | 40 ++++++++++++++++--- .../ethernet/mellanox/mlx5/core/lib/pci_vsc.h | 8 ++++ .../ethernet/mellanox/mlx5/core/mlx5_core.h | 4 ++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c index a27b0119b3d6..6b774e0c2766 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c @@ -24,11 +24,6 @@ pci_write_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val)) #define VSC_MAX_RETRIES 2048 -enum mlx5_vsc_state { - MLX5_VSC_UNLOCK, - MLX5_VSC_LOCK, -}; - enum { VSC_CTRL_OFFSET = 0x4, VSC_COUNTER_OFFSET = 0x8, @@ -284,3 +279,38 @@ int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data, } return length; } + +int mlx5_vsc_sem_set_space(struct mlx5_core_dev *dev, u16 space, + enum mlx5_vsc_state state) +{ + u32 data, id = 0; + int ret; + + ret = mlx5_vsc_gw_set_space(dev, MLX5_SEMAPHORE_SPACE_DOMAIN, NULL); + if (ret) { + mlx5_core_warn(dev, "Failed to set gw space %d\n", ret); + return ret; + } + + if (state == MLX5_VSC_LOCK) { + /* Get a unique ID based on the counter */ + ret = vsc_read(dev, VSC_COUNTER_OFFSET, &id); + if (ret) + return ret; + } + + /* Try to modify lock */ + ret = mlx5_vsc_gw_write(dev, space, id); + if (ret) + return ret; + + /* Verify lock was modified */ + ret = mlx5_vsc_gw_read(dev, space, &data); + if (ret) + return -EINVAL; + + if (data != id) + return -EBUSY; + + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h index 28ea6bfa439f..64272a6d7754 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h @@ -4,6 +4,11 @@ #ifndef __MLX5_PCI_VSC_H__ #define __MLX5_PCI_VSC_H__ +enum mlx5_vsc_state { + MLX5_VSC_UNLOCK, + MLX5_VSC_LOCK, +}; + enum { MLX5_VSC_SPACE_SCAN_CRSPACE = 0x7, }; @@ -21,4 +26,7 @@ static inline bool mlx5_vsc_accessible(struct mlx5_core_dev *dev) return !!dev->vsc_addr; } +int mlx5_vsc_sem_set_space(struct mlx5_core_dev *dev, u16 space, + enum mlx5_vsc_state state); + #endif /* __MLX5_PCI_VSC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 97f8cf67ced0..8593c8183d87 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -111,6 +111,10 @@ enum { MLX5_DRIVER_SYND = 0xbadd00de, }; +enum mlx5_semaphore_space_address { + MLX5_SEMAPHORE_SPACE_DOMAIN = 0xA, +}; + int mlx5_query_hca_caps(struct mlx5_core_dev *dev); int mlx5_query_board_id(struct mlx5_core_dev *dev); int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id); From 3e5b72ac2f298423902169db7893fef43365e0a6 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 12 Nov 2018 16:40:17 +0200 Subject: [PATCH 08/15] net/mlx5: Issue SW reset on FW assert If a FW assert is considered fatal, indicated by a new bit in the health buffer, reset the FW. After the reset go through the normal recovery flow. Only one PF needs to issue the reset, so an attempt is made to prevent the 2nd function from also issuing the reset. It's not an error if that happens, it just slows recovery. Signed-off-by: Feras Daoud Signed-off-by: Alex Vesker Signed-off-by: Moshe Shemesh Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/diag/crdump.c | 13 +- .../net/ethernet/mellanox/mlx5/core/health.c | 157 +++++++++++++++++- .../net/ethernet/mellanox/mlx5/core/main.c | 1 + .../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 + include/linux/mlx5/device.h | 10 +- include/linux/mlx5/driver.h | 1 + 6 files changed, 176 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c index dfb34172c69b..28d02749d3c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c @@ -51,14 +51,23 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data) ret); return ret; } + /* Verify no other PF is running cr-dump or sw reset */ + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, + MLX5_VSC_LOCK); + if (ret) { + mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); + goto unlock_gw; + } ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL); if (ret) - goto unlock; + goto unlock_sem; ret = mlx5_crdump_fill(dev, cr_data); -unlock: +unlock_sem: + mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, MLX5_VSC_UNLOCK); +unlock_gw: mlx5_vsc_gw_unlock(dev); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 737e6d550775..caf54bd7d538 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -40,6 +40,7 @@ #include "mlx5_core.h" #include "lib/eq.h" #include "lib/mlx5.h" +#include "lib/pci_vsc.h" enum { MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, @@ -67,8 +68,10 @@ enum { enum { MLX5_SENSOR_NO_ERR = 0, MLX5_SENSOR_PCI_COMM_ERR = 1, - MLX5_SENSOR_NIC_DISABLED = 2, - MLX5_SENSOR_NIC_SW_RESET = 3, + MLX5_SENSOR_PCI_ERR = 2, + MLX5_SENSOR_NIC_DISABLED = 3, + MLX5_SENSOR_NIC_SW_RESET = 4, + MLX5_SENSOR_FW_SYND_RFR = 5, }; u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) @@ -95,32 +98,162 @@ static bool sensor_pci_not_working(struct mlx5_core_dev *dev) return (ioread32be(&h->fw_ver) == 0xffffffff); } +static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; + u8 synd = ioread8(&h->synd); + + if (rfr && synd) + mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); + return rfr && synd; +} + static u32 check_fatal_sensors(struct mlx5_core_dev *dev) { if (sensor_pci_not_working(dev)) return MLX5_SENSOR_PCI_COMM_ERR; + if (pci_channel_offline(dev->pdev)) + return MLX5_SENSOR_PCI_ERR; if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) return MLX5_SENSOR_NIC_DISABLED; if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) return MLX5_SENSOR_NIC_SW_RESET; + if (sensor_fw_synd_rfr(dev)) + return MLX5_SENSOR_FW_SYND_RFR; return MLX5_SENSOR_NO_ERR; } +static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock) +{ + enum mlx5_vsc_state state; + int ret; + + if (!mlx5_core_is_pf(dev)) + return -EBUSY; + + /* Try to lock GW access, this stage doesn't return + * EBUSY because locked GW does not mean that other PF + * already started the reset. + */ + ret = mlx5_vsc_gw_lock(dev); + if (ret == -EBUSY) + return -EINVAL; + if (ret) + return ret; + + state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK; + /* At this stage, if the return status == EBUSY, then we know + * for sure that another PF started the reset, so don't allow + * another reset. + */ + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state); + if (ret) + mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); + + /* Unlock GW access */ + mlx5_vsc_gw_unlock(dev); + + return ret; +} + +static bool reset_fw_if_needed(struct mlx5_core_dev *dev) +{ + bool supported = (ioread32be(&dev->iseg->initializing) >> + MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; + u32 fatal_error; + + if (!supported) + return false; + + /* The reset only needs to be issued by one PF. The health buffer is + * shared between all functions, and will be cleared during a reset. + * Check again to avoid a redundant 2nd reset. If the fatal erros was + * PCI related a reset won't help. + */ + fatal_error = check_fatal_sensors(dev); + if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || + fatal_error == MLX5_SENSOR_NIC_DISABLED || + fatal_error == MLX5_SENSOR_NIC_SW_RESET) { + mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); + return false; + } + + mlx5_core_warn(dev, "Issuing FW Reset\n"); + /* Write the NIC interface field to initiate the reset, the command + * interface address also resides here, don't overwrite it. + */ + mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET); + + return true; +} + void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) { mutex_lock(&dev->intf_state_mutex); if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) goto unlock; + if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) { + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + goto unlock; + } - mlx5_core_err(dev, "start\n"); - if (pci_channel_offline(dev->pdev) || - dev->priv.health.fatal_error != MLX5_SENSOR_NO_ERR || force) { + if (check_fatal_sensors(dev) || force) { dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_cmd_flush(dev); } mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); +unlock: + mutex_unlock(&dev->intf_state_mutex); +} + +#define MLX5_CRDUMP_WAIT_MS 60000 +#define MLX5_FW_RESET_WAIT_MS 1000 +void mlx5_error_sw_reset(struct mlx5_core_dev *dev) +{ + unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS; + int lock = -EBUSY; + + mutex_lock(&dev->intf_state_mutex); + if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto unlock; + + mlx5_core_err(dev, "start\n"); + + if (check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) { + /* Get cr-dump and reset FW semaphore */ + lock = lock_sem_sw_reset(dev, true); + + if (lock == -EBUSY) { + delay_ms = MLX5_CRDUMP_WAIT_MS; + goto recover_from_sw_reset; + } + /* Execute SW reset */ + reset_fw_if_needed(dev); + } + +recover_from_sw_reset: + /* Recover from SW reset */ + end = jiffies + msecs_to_jiffies(delay_ms); + do { + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + break; + + cond_resched(); + } while (!time_after(jiffies, end)); + + if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) { + dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n", + mlx5_get_nic_state(dev), delay_ms); + } + + /* Release FW semaphore if you are the lock owner */ + if (!lock) + lock_sem_sw_reset(dev, false); + mlx5_core_err(dev, "end\n"); unlock: @@ -143,6 +276,20 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) case MLX5_NIC_IFC_NO_DRAM_NIC: mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); break; + + case MLX5_NIC_IFC_SW_RESET: + /* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases: + * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded + * and this is a VF), this is not recoverable by SW reset. + * Logging of this is handled elsewhere. + * 2. FW reset has been issued by another function, driver can + * be reloaded to recover after the mode switches to + * MLX5_NIC_IFC_DISABLED. + */ + if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) + mlx5_core_warn(dev, "NIC SW reset in progress\n"); + break; + default: mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", nic_interface); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fd0e2949c4f2..ec5287c51825 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1361,6 +1361,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_core_info(dev, "%s was called\n", __func__); mlx5_enter_error_state(dev, false); + mlx5_error_sw_reset(dev); mlx5_unload_one(dev, false); /* In case of kernel call drain the health wq */ if (state) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 8593c8183d87..29bb61a10289 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -113,6 +113,7 @@ enum { enum mlx5_semaphore_space_address { MLX5_SEMAPHORE_SPACE_DOMAIN = 0xA, + MLX5_SEMAPHORE_SW_RESET = 0x20, }; int mlx5_query_hca_caps(struct mlx5_core_dev *dev); @@ -122,6 +123,7 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev); int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev); void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force); +void mlx5_error_sw_reset(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev); void mlx5_recover_device(struct mlx5_core_dev *dev); int mlx5_sriov_init(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5e760067ac41..35ed38c2ae6c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -510,6 +510,10 @@ struct mlx5_cmd_layout { u8 status_own; }; +enum mlx5_fatal_assert_bit_offsets { + MLX5_RFR_OFFSET = 31, +}; + struct health_buffer { __be32 assert_var[5]; __be32 rsvd0[3]; @@ -518,12 +522,16 @@ struct health_buffer { __be32 rsvd1[2]; __be32 fw_ver; __be32 hw_id; - __be32 rsvd2; + __be32 rfr; u8 irisc_index; u8 synd; __be16 ext_synd; }; +enum mlx5_initializing_bit_offsets { + MLX5_FW_RESET_SUPPORTED_OFFSET = 30, +}; + enum mlx5_cmd_addr_l_sz_offset { MLX5_NIC_IFC_OFFSET = 8, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index cc7fd8e62844..89205b6cc7ef 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -583,6 +583,7 @@ struct mlx5_priv { }; enum mlx5_device_state { + MLX5_DEVICE_STATE_UNINITIALIZED, MLX5_DEVICE_STATE_UP, MLX5_DEVICE_STATE_INTERNAL_ERROR, }; From 1e34f3efd413a6318c3edd6e8e7e091f1214b2e6 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 11 Dec 2018 16:09:53 +0200 Subject: [PATCH 09/15] net/mlx5: Create FW devlink_health_reporter Create mlx5_devlink_health_reporter for FW reporter. The FW reporter implements devlink_health_reporter diagnose callback. The fw reporter diagnose command can be triggered any time by the user to check current fw status. In healthy status, it will return clear syndrome. Otherwise it will return the syndrome and description of the error type. Command example and output on healthy status: $ devlink health diagnose pci/0000:82:00.0 reporter fw Syndrome: 0 Command example and output on non healthy status: $ devlink health diagnose pci/0000:82:00.0 reporter fw Syndrome: 8 Description: unrecoverable hardware error Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 48 +++++++++++++++++++ include/linux/mlx5/driver.h | 2 + 2 files changed, 50 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index caf54bd7d538..973cc005ae60 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -388,6 +388,51 @@ static void print_health_info(struct mlx5_core_dev *dev) mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw); } +static int +mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u8 synd; + int err; + + synd = ioread8(&h->synd); + err = devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd); + if (err || !synd) + return err; + return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd)); +} + +static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { + .name = "fw", + .diagnose = mlx5_fw_reporter_diagnose, +}; + +static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct devlink *devlink = priv_to_devlink(dev); + + health->fw_reporter = + devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops, + 0, false, dev); + if (IS_ERR(health->fw_reporter)) + mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", + PTR_ERR(health->fw_reporter)); +} + +static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + if (IS_ERR_OR_NULL(health->fw_reporter)) + return; + + devlink_health_reporter_destroy(health->fw_reporter); +} + static unsigned long get_next_poll_jiffies(void) { unsigned long next; @@ -498,6 +543,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; destroy_workqueue(health->wq); + mlx5_fw_reporter_destroy(dev); } int mlx5_health_init(struct mlx5_core_dev *dev) @@ -519,5 +565,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); + mlx5_fw_reporter_create(dev); + return 0; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 89205b6cc7ef..8d5d065d1aa6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -53,6 +53,7 @@ #include #include #include +#include enum { MLX5_BOARD_ID_LEN = 64, @@ -443,6 +444,7 @@ struct mlx5_core_health { unsigned long flags; struct work_struct work; struct delayed_work recover_work; + struct devlink_health_reporter *fw_reporter; }; struct mlx5_qp_table { From fd1483fe1f9fd45fe312adffb0faffa57446690d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 11 Dec 2018 16:09:54 +0200 Subject: [PATCH 10/15] net/mlx5: Add support for FW reporter dump Add support of dump callback for mlx5 FW reporter. Once we trigger FW dump, the FW will write the core dump to its raw data buffer. The tracer translates the raw data to traces and save it to a cyclic array. Once dump is done, the saved traces data is filled into the dump buffer. In case syndrome is not zero the health buffer content will be printed as well. FW dump example: $ devlink health dump show pci/0000:82:00.0 reporter fw dump fw traces: timestamp: 509006640427 lost: false event_id: 185 msg: dump general info GVMI=0x0000 timestamp: 509006645474 lost: false event_id: 185 msg: GVMI management info, gvmi_management context: timestamp: 509006654463 lost: false event_id: 185 msg: [000]: 00000000 00000000 00000000 00000000 timestamp: 509006656127 lost: false event_id: 185 msg: [010]: 00000000 00000000 00000000 00000000 timestamp: 509006656255 lost: false event_id: 185 msg: [020]: 00000000 00000000 00000000 00000000 timestamp: 509006656511 lost: false event_id: 185 msg: [030]: 00000000 00000000 00000000 00000000 timestamp: 509006656639 lost: false event_id: 185 msg: [040]: 00000000 00000000 00000000 00000000 timestamp: 509006656895 lost: false event_id: 185 msg: [050]: 00000000 00000000 00000000 00000000 timestamp: 509006657023 lost: false event_id: 185 msg: [060]: 00000000 00000000 00000000 00000000 timestamp: 509006657180 lost: false event_id: 185 msg: [070]: 00000000 00000000 00000000 00000000 timestamp: 509006659839 lost: false event_id: 185 msg: CMDIF dbase from IRON: active_dbase_slots = 0x00000000 timestamp: 509006667391 lost: false event_id: 185 msg: GVMI=0x0000 hw_toc context: timestamp: 509006667647 lost: false event_id: 185 msg: [000]: 00000000 00000000 00000000 fffff000 timestamp: 509006667775 lost: false event_id: 185 msg: [010]: 00000000 00000000 00000000 80d00000 ... ... Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/diag/fw_tracer.c | 139 ++++++++++++++++++ .../mellanox/mlx5/core/diag/fw_tracer.h | 20 +++ .../net/ethernet/mellanox/mlx5/core/health.c | 111 ++++++++++++++ 3 files changed, 270 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index 6999f4486e9e..8a4930c8bf62 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -243,6 +243,19 @@ free_strings_db: return -ENOMEM; } +static void +mlx5_fw_tracer_init_saved_traces_array(struct mlx5_fw_tracer *tracer) +{ + tracer->st_arr.saved_traces_index = 0; + mutex_init(&tracer->st_arr.lock); +} + +static void +mlx5_fw_tracer_clean_saved_traces_array(struct mlx5_fw_tracer *tracer) +{ + mutex_destroy(&tracer->st_arr.lock); +} + static void mlx5_tracer_read_strings_db(struct work_struct *work) { struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer, @@ -522,6 +535,24 @@ static void mlx5_fw_tracer_clean_ready_list(struct mlx5_fw_tracer *tracer) list_del(&str_frmt->list); } +static void mlx5_fw_tracer_save_trace(struct mlx5_fw_tracer *tracer, + u64 timestamp, bool lost, + u8 event_id, char *msg) +{ + struct mlx5_fw_trace_data *trace_data; + + mutex_lock(&tracer->st_arr.lock); + trace_data = &tracer->st_arr.straces[tracer->st_arr.saved_traces_index]; + trace_data->timestamp = timestamp; + trace_data->lost = lost; + trace_data->event_id = event_id; + strncpy(trace_data->msg, msg, TRACE_STR_MSG); + + tracer->st_arr.saved_traces_index = + (tracer->st_arr.saved_traces_index + 1) & (SAVED_TRACES_NUM - 1); + mutex_unlock(&tracer->st_arr.lock); +} + static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt, struct mlx5_core_dev *dev, u64 trace_timestamp) @@ -540,6 +571,9 @@ static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt, trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost, str_frmt->event_id, tmp); + mlx5_fw_tracer_save_trace(dev->tracer, trace_timestamp, + str_frmt->lost, str_frmt->event_id, tmp); + /* remove it from hash */ mlx5_tracer_clean_message(str_frmt); } @@ -786,6 +820,109 @@ static void mlx5_fw_tracer_ownership_change(struct work_struct *work) mlx5_fw_tracer_start(tracer); } +static int mlx5_fw_tracer_set_core_dump_reg(struct mlx5_core_dev *dev, + u32 *in, int size_in) +{ + u32 out[MLX5_ST_SZ_DW(core_dump_reg)] = {}; + + if (!MLX5_CAP_DEBUG(dev, core_dump_general) && + !MLX5_CAP_DEBUG(dev, core_dump_qp)) + return -EOPNOTSUPP; + + return mlx5_core_access_reg(dev, in, size_in, out, sizeof(out), + MLX5_REG_CORE_DUMP, 0, 1); +} + +int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_tracer *tracer = dev->tracer; + u32 in[MLX5_ST_SZ_DW(core_dump_reg)] = {}; + int err; + + if (!MLX5_CAP_DEBUG(dev, core_dump_general) || !tracer) + return -EOPNOTSUPP; + if (!tracer->owner) + return -EPERM; + + MLX5_SET(core_dump_reg, in, core_dump_type, 0x0); + + err = mlx5_fw_tracer_set_core_dump_reg(dev, in, sizeof(in)); + if (err) + return err; + queue_work(tracer->work_queue, &tracer->handle_traces_work); + flush_workqueue(tracer->work_queue); + return 0; +} + +static int +mlx5_devlink_fmsg_fill_trace(struct devlink_fmsg *fmsg, + struct mlx5_fw_trace_data *trace_data) +{ + int err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = devlink_fmsg_u64_pair_put(fmsg, "timestamp", trace_data->timestamp); + if (err) + return err; + + err = devlink_fmsg_bool_pair_put(fmsg, "lost", trace_data->lost); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "event_id", trace_data->event_id); + if (err) + return err; + + err = devlink_fmsg_string_pair_put(fmsg, "msg", trace_data->msg); + if (err) + return err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + return 0; +} + +int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer, + struct devlink_fmsg *fmsg) +{ + struct mlx5_fw_trace_data *straces = tracer->st_arr.straces; + u32 index, start_index, end_index; + u32 saved_traces_index; + int err; + + if (!straces[0].timestamp) + return -ENOMSG; + + mutex_lock(&tracer->st_arr.lock); + saved_traces_index = tracer->st_arr.saved_traces_index; + if (straces[saved_traces_index].timestamp) + start_index = saved_traces_index; + else + start_index = 0; + end_index = (saved_traces_index - 1) & (SAVED_TRACES_NUM - 1); + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "dump fw traces"); + if (err) + goto unlock; + index = start_index; + while (index != end_index) { + err = mlx5_devlink_fmsg_fill_trace(fmsg, &straces[index]); + if (err) + goto unlock; + + index = (index + 1) & (SAVED_TRACES_NUM - 1); + } + + err = devlink_fmsg_arr_pair_nest_end(fmsg); +unlock: + mutex_unlock(&tracer->st_arr.lock); + return err; +} + /* Create software resources (Buffers, etc ..) */ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) { @@ -833,6 +970,7 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) goto free_log_buf; } + mlx5_fw_tracer_init_saved_traces_array(tracer); mlx5_core_dbg(dev, "FWTracer: Tracer created\n"); return tracer; @@ -917,6 +1055,7 @@ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer) cancel_work_sync(&tracer->read_fw_strings_work); mlx5_fw_tracer_clean_ready_list(tracer); mlx5_fw_tracer_clean_print_hash(tracer); + mlx5_fw_tracer_clean_saved_traces_array(tracer); mlx5_fw_tracer_free_strings_db(tracer); mlx5_fw_tracer_destroy_log_buf(tracer); flush_workqueue(tracer->work_queue); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h index a8b8747f2b61..40601fba80ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h @@ -46,6 +46,9 @@ #define TRACER_BLOCK_SIZE_BYTE 256 #define TRACES_PER_BLOCK 32 +#define TRACE_STR_MSG 256 +#define SAVED_TRACES_NUM 8192 + #define TRACER_MAX_PARAMS 7 #define MESSAGE_HASH_BITS 6 #define MESSAGE_HASH_SIZE BIT(MESSAGE_HASH_BITS) @@ -53,6 +56,13 @@ #define MASK_52_7 (0x1FFFFFFFFFFF80) #define MASK_6_0 (0x7F) +struct mlx5_fw_trace_data { + u64 timestamp; + bool lost; + u8 event_id; + char msg[TRACE_STR_MSG]; +}; + struct mlx5_fw_tracer { struct mlx5_core_dev *dev; struct mlx5_nb nb; @@ -83,6 +93,13 @@ struct mlx5_fw_tracer { u32 consumer_index; } buff; + /* Saved Traces Array */ + struct { + struct mlx5_fw_trace_data straces[SAVED_TRACES_NUM]; + u32 saved_traces_index; + struct mutex lock; /* Protect st_arr access */ + } st_arr; + u64 last_timestamp; struct work_struct handle_traces_work; struct hlist_head hash[MESSAGE_HASH_SIZE]; @@ -171,5 +188,8 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev); int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer); void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer); void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer); +int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev); +int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer, + struct devlink_fmsg *fmsg); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 973cc005ae60..1c20d3f1d238 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -41,6 +41,7 @@ #include "lib/eq.h" #include "lib/mlx5.h" #include "lib/pci_vsc.h" +#include "diag/fw_tracer.h" enum { MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, @@ -405,9 +406,119 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd)); } +struct mlx5_fw_reporter_ctx { + u8 err_synd; + int miss_counter; +}; + +static int +mlx5_fw_reporter_ctx_pairs_put(struct devlink_fmsg *fmsg, + struct mlx5_fw_reporter_ctx *fw_reporter_ctx) +{ + int err; + + err = devlink_fmsg_u8_pair_put(fmsg, "syndrome", + fw_reporter_ctx->err_synd); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "fw_miss_counter", + fw_reporter_ctx->miss_counter); + if (err) + return err; + return 0; +} + +static int +mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, + struct devlink_fmsg *fmsg) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + int err; + int i; + + if (!ioread8(&h->synd)) + return 0; + + err = devlink_fmsg_pair_nest_start(fmsg, "health buffer"); + if (err) + return err; + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + err = devlink_fmsg_arr_pair_nest_start(fmsg, "assert_var"); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) { + err = devlink_fmsg_u32_put(fmsg, ioread32be(h->assert_var + i)); + if (err) + return err; + } + err = devlink_fmsg_arr_pair_nest_end(fmsg); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "assert_exit_ptr", + ioread32be(&h->assert_exit_ptr)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra", + ioread32be(&h->assert_callra)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id)); + if (err) + return err; + err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index", + ioread8(&h->irisc_index)); + if (err) + return err; + err = devlink_fmsg_u8_pair_put(fmsg, "synd", ioread8(&h->synd)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", + ioread16be(&h->ext_synd)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "raw_fw_ver", + ioread32be(&h->fw_ver)); + if (err) + return err; + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + return devlink_fmsg_pair_nest_end(fmsg); +} + +static int +mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + int err; + + err = mlx5_fw_tracer_trigger_core_dump_general(dev); + if (err) + return err; + + if (priv_ctx) { + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; + + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); + if (err) + return err; + } + + err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg); + if (err) + return err; + return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); +} + static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { .name = "fw", .diagnose = mlx5_fw_reporter_diagnose, + .dump = mlx5_fw_reporter_dump, }; static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) From d1bf0e2cc4a6e66c2bff48176b8b2930098468ef Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 11 Dec 2018 16:09:56 +0200 Subject: [PATCH 11/15] net/mlx5: Report devlink health on FW issues Use devlink_health_report() to report any symptom of FW issue as FW counter miss or new health syndrome. The FW issues detected in mlx5 during poll_health which is called in timer atomic context and so health work queue is used to schedule the reports. Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 33 +++++++++++++++++++ include/linux/mlx5/driver.h | 3 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 1c20d3f1d238..5e876f1de114 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); } +static void mlx5_fw_reporter_err_work(struct work_struct *work) +{ + struct mlx5_fw_reporter_ctx fw_reporter_ctx; + struct mlx5_core_health *health; + + health = container_of(work, struct mlx5_core_health, report_work); + + if (IS_ERR_OR_NULL(health->fw_reporter)) + return; + + fw_reporter_ctx.err_synd = health->synd; + fw_reporter_ctx.miss_counter = health->miss_counter; + if (fw_reporter_ctx.err_synd) { + devlink_health_report(health->fw_reporter, + "FW syndrom reported", &fw_reporter_ctx); + return; + } + if (fw_reporter_ctx.miss_counter) + devlink_health_report(health->fw_reporter, + "FW miss counter reported", + &fw_reporter_ctx); +} + static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { .name = "fw", .diagnose = mlx5_fw_reporter_diagnose, @@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t) { struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; u32 fatal_error; + u8 prev_synd; u32 count; if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) @@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t) if (health->miss_counter == MAX_MISSES) { mlx5_core_err(dev, "device's health compromised - reached miss count\n"); print_health_info(dev); + queue_work(health->wq, &health->report_work); } + prev_synd = health->synd; + health->synd = ioread8(&h->synd); + if (health->synd && health->synd != prev_synd) + queue_work(health->wq, &health->report_work); + fatal_error = check_fatal_sensors(dev); if (fatal_error && !health->fatal_error) { @@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) spin_lock_irqsave(&health->wq_lock, flags); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); spin_unlock_irqrestore(&health->wq_lock, flags); + cancel_work_sync(&health->report_work); cancel_work_sync(&health->work); } @@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) return -ENOMEM; spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); + INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); mlx5_fw_reporter_create(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8d5d065d1aa6..1931a4080d78 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -435,7 +435,7 @@ struct mlx5_core_health { struct timer_list timer; u32 prev; int miss_counter; - bool sick; + u8 synd; u32 fatal_error; u32 crdump_size; /* wq spinlock to synchronize draining */ @@ -443,6 +443,7 @@ struct mlx5_core_health { struct workqueue_struct *wq; unsigned long flags; struct work_struct work; + struct work_struct report_work; struct delayed_work recover_work; struct devlink_health_reporter *fw_reporter; }; From 96c82cdfe77b5e769624af71ec0554434037b82f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 11 Dec 2018 16:09:57 +0200 Subject: [PATCH 12/15] net/mlx5: Add fw fatal devlink_health_reporter Create mlx5_devlink_health_reporter for fw fatal reporter. The fw fatal reporter is added in addition to the fw reporter and implements the recover callback. The point of having two reporters for FW issues, is that we don't want to run FW recover on any issue, but only fatal ones. Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 81 ++++++++++++++----- include/linux/mlx5/driver.h | 1 + 2 files changed, 62 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 5e876f1de114..82a658834675 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -301,31 +301,43 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) /* How much time to wait until health resetting the driver (in msecs) */ #define MLX5_RECOVERY_WAIT_MSECS 60000 -static void health_care(struct work_struct *work) +static int mlx5_health_try_recover(struct mlx5_core_dev *dev) { - struct mlx5_core_health *health; - struct mlx5_core_dev *dev; - struct mlx5_priv *priv; unsigned long end; - health = container_of(work, struct mlx5_core_health, work); - priv = container_of(health, struct mlx5_priv, health); - dev = container_of(priv, struct mlx5_core_dev, priv); mlx5_core_warn(dev, "handling bad device here\n"); mlx5_handle_bad_state(dev); - end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS); while (sensor_pci_not_working(dev)) { if (time_after(jiffies, end)) { mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n"); - return; + return -EIO; } msleep(100); } mlx5_core_err(dev, "starting health recovery flow\n"); mlx5_recover_device(dev); + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) || + check_fatal_sensors(dev)) { + mlx5_core_err(dev, "health recovery failed\n"); + return -EIO; + } + return 0; +} + +static void health_recover_work(struct work_struct *work) +{ + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + + health = container_of(work, struct mlx5_core_health, work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + mlx5_health_try_recover(dev); } static const char *hsynd_str(u8 synd) @@ -544,7 +556,22 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { .dump = mlx5_fw_reporter_dump, }; -static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) +static int +mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, + void *priv_ctx) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + + return mlx5_health_try_recover(dev); +} + +static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { + .name = "fw_fatal", + .recover = mlx5_fw_fatal_reporter_recover, +}; + +#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000 +static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; struct devlink *devlink = priv_to_devlink(dev); @@ -555,16 +582,26 @@ static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) if (IS_ERR(health->fw_reporter)) mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(health->fw_reporter)); + + health->fw_fatal_reporter = + devlink_health_reporter_create(devlink, + &mlx5_fw_fatal_reporter_ops, + MLX5_REPORTER_FW_GRACEFUL_PERIOD, + true, dev); + if (IS_ERR(health->fw_fatal_reporter)) + mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", + PTR_ERR(health->fw_fatal_reporter)); } -static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev) +static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; - if (IS_ERR_OR_NULL(health->fw_reporter)) - return; + if (!IS_ERR_OR_NULL(health->fw_reporter)) + devlink_health_reporter_destroy(health->fw_reporter); - devlink_health_reporter_destroy(health->fw_reporter); + if (!IS_ERR_OR_NULL(health->fw_fatal_reporter)) + devlink_health_reporter_destroy(health->fw_fatal_reporter); } static unsigned long get_next_poll_jiffies(void) @@ -686,7 +723,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev) struct mlx5_core_health *health = &dev->priv.health; destroy_workqueue(health->wq); - mlx5_fw_reporter_destroy(dev); + mlx5_fw_reporters_destroy(dev); } int mlx5_health_init(struct mlx5_core_dev *dev) @@ -694,22 +731,26 @@ int mlx5_health_init(struct mlx5_core_dev *dev) struct mlx5_core_health *health; char *name; + mlx5_fw_reporters_create(dev); + health = &dev->priv.health; name = kmalloc(64, GFP_KERNEL); if (!name) - return -ENOMEM; + goto out_err; strcpy(name, "mlx5_health"); strcat(name, dev_name(dev->device)); health->wq = create_singlethread_workqueue(name); kfree(name); if (!health->wq) - return -ENOMEM; + goto out_err; spin_lock_init(&health->wq_lock); - INIT_WORK(&health->work, health_care); + INIT_WORK(&health->work, health_recover_work); INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); - mlx5_fw_reporter_create(dev); - return 0; + +out_err: + mlx5_fw_reporters_destroy(dev); + return -ENOMEM; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1931a4080d78..caac96bf9c0d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -446,6 +446,7 @@ struct mlx5_core_health { struct work_struct report_work; struct delayed_work recover_work; struct devlink_health_reporter *fw_reporter; + struct devlink_health_reporter *fw_fatal_reporter; }; struct mlx5_qp_table { From 9b1f2982360579cbdb3069fa026f6cfc31c4388b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 16 Jan 2019 13:23:25 +0200 Subject: [PATCH 13/15] net/mlx5: Add support for FW fatal reporter dump Add support of dump callback for mlx5 FW fatal reporter. The FW fatal dump uses cr-dump functionality to gather cr-space data for debug. The cr-dump uses vsc interface which is valid even if the FW command interface is not functional, which is the case in most FW fatal errors. Command example and output: $ devlink health dump show pci/0000:82:00.0 reporter fw_fatal crdump_data: 00 20 00 01 00 00 00 00 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ba 82 00 00 0c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fa 00 a4 0e 00 00 00 00 00 00 80 c7 fe ff 50 0a 00 00 ... ... Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 82a658834675..4ef62c6c6424 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -565,9 +565,59 @@ mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, return mlx5_health_try_recover(dev); } +#define MLX5_CR_DUMP_CHUNK_SIZE 256 +static int +mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + u32 crdump_size = dev->priv.health.crdump_size; + u32 *cr_data; + u32 data_size; + u32 offset; + int err; + + if (!mlx5_core_is_pf(dev)) + return -EPERM; + + cr_data = kvmalloc(crdump_size, GFP_KERNEL); + if (!cr_data) + return -ENOMEM; + err = mlx5_crdump_collect(dev, cr_data); + if (err) + return err; + + if (priv_ctx) { + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; + + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); + if (err) + goto free_data; + } + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "crdump_data"); + if (err) + goto free_data; + for (offset = 0; offset < crdump_size; offset += data_size) { + if (crdump_size - offset < MLX5_CR_DUMP_CHUNK_SIZE) + data_size = crdump_size - offset; + else + data_size = MLX5_CR_DUMP_CHUNK_SIZE; + err = devlink_fmsg_binary_put(fmsg, cr_data, data_size); + if (err) + goto free_data; + } + err = devlink_fmsg_arr_pair_nest_end(fmsg); + +free_data: + kfree(cr_data); + return err; +} + static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, + .dump = mlx5_fw_fatal_reporter_dump, }; #define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000 From b3bd076f7501afea2871bb4738ab53498fd32cd5 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 27 Jan 2019 18:38:39 +0200 Subject: [PATCH 14/15] net/mlx5: Report devlink health on FW fatal issues Report devlink health on FW fatal issues via fw_fatal_reporter. The driver recover flow for FW fatal error is now being handled by the devlink health. Having the recovery controlled by devlink health, the user has the ability to cancel the auto-recovery for debug session and run it manually. Call mlx5_enter_error_state() before calling devlink_health_report() to ensure entering device error state even if auto-recovery is off. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 42 ++++++++++++------- .../net/ethernet/mellanox/mlx5/core/main.c | 10 ++--- include/linux/mlx5/driver.h | 2 +- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 4ef62c6c6424..2fe6923f7ce0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev) return 0; } -static void health_recover_work(struct work_struct *work) -{ - struct mlx5_core_health *health; - struct mlx5_core_dev *dev; - struct mlx5_priv *priv; - - health = container_of(work, struct mlx5_core_health, work); - priv = container_of(health, struct mlx5_priv, health); - dev = container_of(priv, struct mlx5_core_dev, priv); - - mlx5_health_try_recover(dev); -} - static const char *hsynd_str(u8 synd) { switch (synd) { @@ -614,6 +601,29 @@ free_data: return err; } +static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) +{ + struct mlx5_fw_reporter_ctx fw_reporter_ctx; + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + + health = container_of(work, struct mlx5_core_health, fatal_report_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + mlx5_enter_error_state(dev, false); + if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { + if (mlx5_health_try_recover(dev)) + mlx5_core_err(dev, "health recovery failed\n"); + return; + } + fw_reporter_ctx.err_synd = health->synd; + fw_reporter_ctx.miss_counter = health->miss_counter; + devlink_health_report(health->fw_fatal_reporter, + "FW fatal error reported", &fw_reporter_ctx); +} + static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, @@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev) spin_lock_irqsave(&health->wq_lock, flags); if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) - queue_work(health->wq, &health->work); + queue_work(health->wq, &health->fatal_report_work); else mlx5_core_err(dev, "new health works are not permitted at this stage\n"); spin_unlock_irqrestore(&health->wq_lock, flags); @@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); spin_unlock_irqrestore(&health->wq_lock, flags); cancel_work_sync(&health->report_work); - cancel_work_sync(&health->work); + cancel_work_sync(&health->fatal_report_work); } void mlx5_health_flush(struct mlx5_core_dev *dev) @@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) if (!health->wq) goto out_err; spin_lock_init(&health->wq_lock); - INIT_WORK(&health->work, health_recover_work); + INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work); INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ec5287c51825..998eec938d3c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_enter_error_state(dev, false); mlx5_error_sw_reset(dev); mlx5_unload_one(dev, false); - /* In case of kernel call drain the health wq */ - if (state) { - mlx5_drain_health_wq(dev); - mlx5_pci_disable_device(dev); - } + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; @@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); void mlx5_disable_device(struct mlx5_core_dev *dev) { - mlx5_pci_err_detected(dev->pdev, 0); + mlx5_error_sw_reset(dev); + mlx5_unload_one(dev, false); } void mlx5_recover_device(struct mlx5_core_dev *dev) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index caac96bf9c0d..25847beabd3f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -442,7 +442,7 @@ struct mlx5_core_health { spinlock_t wq_lock; struct workqueue_struct *wq; unsigned long flags; - struct work_struct work; + struct work_struct fatal_report_work; struct work_struct report_work; struct delayed_work recover_work; struct devlink_health_reporter *fw_reporter; From 06efeb555524a8c65ef429f2603885c31a5212b1 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 1 Jun 2019 16:40:16 +0300 Subject: [PATCH 15/15] Documentation: net: mlx5: Devlink health documentation Documentation for devlink health reporters supported by mlx5. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../device_drivers/mellanox/mlx5.rst | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst index 0be802186d12..4eeef2df912f 100644 --- a/Documentation/networking/device_drivers/mellanox/mlx5.rst +++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst @@ -10,6 +10,7 @@ Contents ======== - `Enabling the driver and kconfig options`_ +- `Devlink health reporters`_ Enabling the driver and kconfig options ================================================ @@ -99,3 +100,74 @@ Enabling the driver and kconfig options - CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled - CONFIG_VXLAN: When chosen, mlx5 vxaln support will be enabled. - CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool). + + +Devlink health reporters +======================== + +tx reporter +----------- +The tx reporter is responsible of two error scenarios: + +- TX timeout + Report on kernel tx timeout detection. + Recover by searching lost interrupts. +- TX error completion + Report on error tx completion. + Recover by flushing the TX queue and reset it. + +TX reporter also support Diagnose callback, on which it provides +real time information of its send queues status. + +User commands examples: + +- Diagnose send queues status:: + + $ devlink health diagnose pci/0000:82:00.0 reporter tx + +- Show number of tx errors indicated, number of recover flows ended successfully, + is autorecover enabled and graceful period from last recover:: + + $ devlink health show pci/0000:82:00.0 reporter tx + +fw reporter +----------- +The fw reporter implements diagnose and dump callbacks. +It follows symptoms of fw error such as fw syndrome by triggering +fw core dump and storing it into the dump buffer. +The fw reporter diagnose command can be triggered any time by the user to check +current fw status. + +User commands examples: + +- Check fw heath status:: + + $ devlink health diagnose pci/0000:82:00.0 reporter fw + +- Read FW core dump if already stored or trigger new one:: + + $ devlink health dump show pci/0000:82:00.0 reporter fw + +NOTE: This command can run only on the PF which has fw tracer ownership, +running it on other PF or any VF will return "Operation not permitted". + +fw fatal reporter +----------------- +The fw fatal reporter implements dump and recover callbacks. +It follows fatal errors indications by CR-space dump and recover flow. +The CR-space dump uses vsc interface which is valid even if the FW command +interface is not functional, which is the case in most FW fatal errors. +The recover function runs recover flow which reloads the driver and triggers fw +reset if needed. + +User commands examples: + +- Run fw recover flow manually:: + + $ devlink health recover pci/0000:82:00.0 reporter fw_fatal + +- Read FW CR-space dump if already strored or trigger new one:: + + $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal + +NOTE: This command can run only on PF.