From 90b2620e6a8aa08c40cc78d61603e0acd853c33a Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 28 Nov 2018 06:44:36 -0800 Subject: [PATCH 1/5] IB/hfi1: Fix a latency issue for small messages A recent performance enhancement introduced a latency issue in the HFI message path. The new algorithm removed a forced call send for PIO messages and added a forced schedule event for messages larger than the MTU. For PIO, the schedule path can introduce thrashing that can significantly impact the throughput for small messages. If a message size is within the PIO threshold, always take the send path. Fixes: 0b79b27748cb ("IB/{hfi1, qib, rdmavt}: Schedule multi RC/UC packets instead of posting") Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 6f3bc4dab858..1a016248039f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -340,6 +340,13 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) default: break; } + + /* + * System latency between send and schedule is large enough that + * forcing call_send to true for piothreshold packets is necessary. + */ + if (wqe->length <= piothreshold) + *call_send = true; return 0; } From 36d842194a57f1b21fbc6a6875f2fa2f9a7f8679 Mon Sep 17 00:00:00 2001 From: Piotr Stankiewicz Date: Wed, 28 Nov 2018 06:44:46 -0800 Subject: [PATCH 2/5] IB/hfi1: Fix an out-of-bounds access in get_hw_stats When running with KASAN, the following trace is produced: [ 62.535888] ================================================================== [ 62.544930] BUG: KASAN: slab-out-of-bounds in gut_hw_stats+0x122/0x230 [hfi1] [ 62.553856] Write of size 8 at addr ffff88080e8d6330 by task kworker/0:1/14 [ 62.565333] CPU: 0 PID: 14 Comm: kworker/0:1 Not tainted 4.19.0-test-build-kasan+ #8 [ 62.575087] Hardware name: Intel Corporation S2600KPR/S2600KPR, BIOS SE5C610.86B.01.01.0019.101220160604 10/12/2016 [ 62.587951] Workqueue: events work_for_cpu_fn [ 62.594050] Call Trace: [ 62.598023] dump_stack+0xc6/0x14c [ 62.603089] ? dump_stack_print_info.cold.1+0x2f/0x2f [ 62.610041] ? kmsg_dump_rewind_nolock+0x59/0x59 [ 62.616615] ? get_hw_stats+0x122/0x230 [hfi1] [ 62.622985] print_address_description+0x6c/0x23c [ 62.629744] ? get_hw_stats+0x122/0x230 [hfi1] [ 62.636108] kasan_report.cold.6+0x241/0x308 [ 62.642365] get_hw_stats+0x122/0x230 [hfi1] [ 62.648703] ? hfi1_alloc_rn+0x40/0x40 [hfi1] [ 62.655088] ? __kmalloc+0x110/0x240 [ 62.660695] ? hfi1_alloc_rn+0x40/0x40 [hfi1] [ 62.667142] setup_hw_stats+0xd8/0x430 [ib_core] [ 62.673972] ? show_hfi+0x50/0x50 [hfi1] [ 62.680026] ib_device_register_sysfs+0x165/0x180 [ib_core] [ 62.687995] ib_register_device+0x5a2/0xa10 [ib_core] [ 62.695340] ? show_hfi+0x50/0x50 [hfi1] [ 62.701421] ? ib_unregister_device+0x2e0/0x2e0 [ib_core] [ 62.709222] ? __vmalloc_node_range+0x2d0/0x380 [ 62.716131] ? rvt_driver_mr_init+0x11f/0x2d0 [rdmavt] [ 62.723735] ? vmalloc_node+0x5c/0x70 [ 62.729697] ? rvt_driver_mr_init+0x11f/0x2d0 [rdmavt] [ 62.737347] ? rvt_driver_mr_init+0x1f5/0x2d0 [rdmavt] [ 62.744998] ? __rvt_alloc_mr+0x110/0x110 [rdmavt] [ 62.752315] ? rvt_rc_error+0x140/0x140 [rdmavt] [ 62.759434] ? rvt_vma_open+0x30/0x30 [rdmavt] [ 62.766364] ? mutex_unlock+0x1d/0x40 [ 62.772445] ? kmem_cache_create_usercopy+0x15d/0x230 [ 62.780115] rvt_register_device+0x1f6/0x360 [rdmavt] [ 62.787823] ? rvt_get_port_immutable+0x180/0x180 [rdmavt] [ 62.796058] ? __get_txreq+0x400/0x400 [hfi1] [ 62.802969] ? memcpy+0x34/0x50 [ 62.808611] hfi1_register_ib_device+0xde6/0xeb0 [hfi1] [ 62.816601] ? hfi1_get_npkeys+0x10/0x10 [hfi1] [ 62.823760] ? hfi1_init+0x89f/0x9a0 [hfi1] [ 62.830469] ? hfi1_setup_eagerbufs+0xad0/0xad0 [hfi1] [ 62.838204] ? pcie_capability_clear_and_set_word+0xcd/0xe0 [ 62.846429] ? pcie_capability_read_word+0xd0/0xd0 [ 62.853791] ? hfi1_pcie_init+0x187/0x4b0 [hfi1] [ 62.860958] init_one+0x67f/0xae0 [hfi1] [ 62.867301] ? hfi1_init+0x9a0/0x9a0 [hfi1] [ 62.873876] ? wait_woken+0x130/0x130 [ 62.879860] ? read_word_at_a_time+0xe/0x20 [ 62.886329] ? strscpy+0x14b/0x280 [ 62.891998] ? hfi1_init+0x9a0/0x9a0 [hfi1] [ 62.898405] local_pci_probe+0x70/0xd0 [ 62.904295] ? pci_device_shutdown+0x90/0x90 [ 62.910833] work_for_cpu_fn+0x29/0x40 [ 62.916750] process_one_work+0x584/0x960 [ 62.922974] ? rcu_work_rcufn+0x40/0x40 [ 62.928991] ? __schedule+0x396/0xdc0 [ 62.934806] ? __sched_text_start+0x8/0x8 [ 62.941020] ? pick_next_task_fair+0x68b/0xc60 [ 62.947674] ? run_rebalance_domains+0x260/0x260 [ 62.954471] ? __list_add_valid+0x29/0xa0 [ 62.960607] ? move_linked_works+0x1c7/0x230 [ 62.967077] ? trace_event_raw_event_workqueue_execute_start+0x140/0x140 [ 62.976248] ? mutex_lock+0xa6/0x100 [ 62.982029] ? __mutex_lock_slowpath+0x10/0x10 [ 62.988795] ? __switch_to+0x37a/0x710 [ 62.994731] worker_thread+0x62e/0x9d0 [ 63.000602] ? max_active_store+0xf0/0xf0 [ 63.006828] ? __switch_to_asm+0x40/0x70 [ 63.012932] ? __switch_to_asm+0x34/0x70 [ 63.019013] ? __switch_to_asm+0x40/0x70 [ 63.025042] ? __switch_to_asm+0x34/0x70 [ 63.031030] ? __switch_to_asm+0x40/0x70 [ 63.037006] ? __schedule+0x396/0xdc0 [ 63.042660] ? kmem_cache_alloc_trace+0xf3/0x1f0 [ 63.049323] ? kthread+0x59/0x1d0 [ 63.054594] ? ret_from_fork+0x35/0x40 [ 63.060257] ? __sched_text_start+0x8/0x8 [ 63.066212] ? schedule+0xcf/0x250 [ 63.071529] ? __wake_up_common+0x110/0x350 [ 63.077794] ? __schedule+0xdc0/0xdc0 [ 63.083348] ? wait_woken+0x130/0x130 [ 63.088963] ? finish_task_switch+0x1f1/0x520 [ 63.095258] ? kasan_unpoison_shadow+0x30/0x40 [ 63.101792] ? __init_waitqueue_head+0xa0/0xd0 [ 63.108183] ? replenish_dl_entity.cold.60+0x18/0x18 [ 63.115151] ? _raw_spin_lock_irqsave+0x25/0x50 [ 63.121754] ? max_active_store+0xf0/0xf0 [ 63.127753] kthread+0x1ae/0x1d0 [ 63.132894] ? kthread_bind+0x30/0x30 [ 63.138422] ret_from_fork+0x35/0x40 [ 63.146973] Allocated by task 14: [ 63.152077] kasan_kmalloc+0xbf/0xe0 [ 63.157471] __kmalloc+0x110/0x240 [ 63.162804] init_cntrs+0x34d/0xdf0 [hfi1] [ 63.168883] hfi1_init_dd+0x29a3/0x2f90 [hfi1] [ 63.175244] init_one+0x551/0xae0 [hfi1] [ 63.181065] local_pci_probe+0x70/0xd0 [ 63.186759] work_for_cpu_fn+0x29/0x40 [ 63.192310] process_one_work+0x584/0x960 [ 63.198163] worker_thread+0x62e/0x9d0 [ 63.203843] kthread+0x1ae/0x1d0 [ 63.208874] ret_from_fork+0x35/0x40 [ 63.217203] Freed by task 1: [ 63.221844] __kasan_slab_free+0x12e/0x180 [ 63.227844] kfree+0x92/0x1a0 [ 63.232570] single_release+0x3a/0x60 [ 63.238024] __fput+0x1d9/0x480 [ 63.242911] task_work_run+0x139/0x190 [ 63.248440] exit_to_usermode_loop+0x191/0x1a0 [ 63.254814] do_syscall_64+0x301/0x330 [ 63.260283] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 63.270199] The buggy address belongs to the object at ffff88080e8d5500 which belongs to the cache kmalloc-4096 of size 4096 [ 63.287247] The buggy address is located 3632 bytes inside of 4096-byte region [ffff88080e8d5500, ffff88080e8d6500) [ 63.303564] The buggy address belongs to the page: [ 63.310447] page:ffffea00203a3400 count:1 mapcount:0 mapping:ffff88081380e840 index:0x0 compound_mapcount: 0 [ 63.323102] flags: 0x2fffff80008100(slab|head) [ 63.329775] raw: 002fffff80008100 0000000000000000 0000000100000001 ffff88081380e840 [ 63.340175] raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 [ 63.350564] page dumped because: kasan: bad access detected [ 63.361974] Memory state around the buggy address: [ 63.369137] ffff88080e8d6200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 63.379082] ffff88080e8d6280: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 63.389032] >ffff88080e8d6300: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc [ 63.398944] ^ [ 63.406141] ffff88080e8d6380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 63.416109] ffff88080e8d6400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 63.426099] ================================================================== The trace happens because get_hw_stats() assumes there is room in the memory allocated in init_cntrs() to accommodate the driver counters. Unfortunately, that routine only allocated space for the device counters. Fix by insuring the allocation has room for the additional driver counters. Cc: # v4.14+ Fixes: b7481944b06e9 ("IB/hfi1: Show statistics counters under IB stats interface") Reviewed-by: Mike Marciniczyn Reviewed-by: Mike Ruhl Signed-off-by: Piotr Stankiewicz Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 3 ++- drivers/infiniband/hw/hfi1/hfi.h | 2 ++ drivers/infiniband/hw/hfi1/verbs.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 9b20479dc710..7e6d70936c63 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -12500,7 +12500,8 @@ static int init_cntrs(struct hfi1_devdata *dd) } /* allocate space for the counter values */ - dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); + dd->cntrs = kcalloc(dd->ndevcntrs + num_driver_cntrs, sizeof(u64), + GFP_KERNEL); if (!dd->cntrs) goto bail; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 1401b6ea4a28..2b882347d0c2 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -155,6 +155,8 @@ struct hfi1_ib_stats { extern struct hfi1_ib_stats hfi1_stats; extern const struct pci_error_handlers hfi1_pci_err_handler; +extern int num_driver_cntrs; + /* * First-cut criterion for "device is active" is * two thousand dwords combined Tx, Rx traffic per diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 48e11e510358..a365089a9305 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1479,7 +1479,7 @@ static const char * const driver_cntr_names[] = { static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */ static const char **dev_cntr_names; static const char **port_cntr_names; -static int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); +int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); static int num_dev_cntrs; static int num_port_cntrs; static int cntr_names_initialized; From 37b06e5078975bb4efe3cbd91e254112851b125f Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Tue, 27 Nov 2018 08:51:25 +0200 Subject: [PATCH 3/5] IB/mlx5: Fix implicit ODP interrupted page fault Since any page fault may be interrupted by a MMU invalidation and implicit leaf MR may be released during this process. The check for parent value is unreliable condition for an implicit MR. Use other condition that we can rely on to determine if MR is implicit. Fixes: b4cfe447d47b ("IB/mlx5: Implement on demand paging by adding support for MMU notifiers") Signed-off-by: Artemy Kovalyov Signed-off-by: Moni Shoua Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/odp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 2cc3d69ab6f6..4dc6cc640ce0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -506,14 +506,13 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped) { + int npages = 0, current_seq, page_shift, ret, np; + bool implicit = false; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 access_mask = ODP_READ_ALLOWED_BIT; - int npages = 0, page_shift, np; u64 start_idx, page_mask; struct ib_umem_odp *odp; - int current_seq; size_t size; - int ret; if (!odp_mr->page_list) { odp = implicit_mr_get_data(mr, io_virt, bcnt); @@ -521,7 +520,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, if (IS_ERR(odp)) return PTR_ERR(odp); mr = odp->private; - + implicit = true; } else { odp = odp_mr; } @@ -600,7 +599,7 @@ next_mr: out: if (ret == -EAGAIN) { - if (mr->parent || !odp->dying) { + if (implicit || !odp->dying) { unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); From 47f07f03b5ee436fe074c4fb1fb28d013c36a0d8 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 5 Dec 2018 15:50:21 +0200 Subject: [PATCH 4/5] IB/mlx5: Block DEVX umem from the non applicable cases Blocks creating a DEVX UMEM with the non applicable access flags as of ODP, MW_BIND, etc. Specifically when an ODP flag is used below WARN call trace is issued. [ 2510.404131] RIP: 0010:__mlx5_ib_populate_pas+0x207/0x220 [mlx5_ib] ... [ 2510.404143] Call Trace: [ 2510.404150] ? __kmalloc_node+0x1b3/0x280 [ 2510.404156] ? _uverbs_alloc+0x63/0x90 [ib_uverbs] [ 2510.404158] ? _uverbs_alloc+0x63/0x90 [ib_uverbs] [ 2510.404162] mlx5_ib_populate_pas+0x53/0x60 [mlx5_ib] [ 2510.404167] mlx5_ib_handler_MLX5_IB_METHOD_DEVX_UMEM_REG+0x273/0x3f0 [mlx5_ib] Fixes: aeae94579caf ("IB/mlx5: Add DEVX support for memory registration") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/devx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 61aab7c0c513..45c421c87100 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1066,7 +1066,9 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, err = uverbs_get_flags32(&access, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, - IB_ACCESS_SUPPORTED); + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); if (err) return err; From 37fbd834b4e492dc41743830cbe435f35120abd8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 5 Dec 2018 15:50:49 +0200 Subject: [PATCH 5/5] IB/core: Fix oops in netdev_next_upper_dev_rcu() When support for bonding of RoCE devices was added, there was necessarily a link between the RoCE device and the paired netdevice that was part of the bond. If you remove the mlx4_en module, that paired association is broken (the RoCE device is still present but the paired netdevice has been released). We need to account for this in is_upper_ndev_bond_master_filter() and filter out those links with a broken pairing or else we later oops in netdev_next_upper_dev_rcu(). Fixes: 408f1242d940 ("IB/core: Delete lower netdevice default GID entries in bonding scenario") Signed-off-by: Mark Zhang Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/roce_gid_mgmt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 25d43c8f1c2a..558de0b9895c 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -267,6 +267,9 @@ is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port, struct net_device *cookie_ndev = cookie; bool match = false; + if (!rdma_ndev) + return false; + rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))