diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 3ce9dc8c3463..4044a8c8dbf4 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -24,6 +24,7 @@ hfi1-y := \ mad.o \ mmu_rb.o \ msix.o \ + opfn.o \ pcie.o \ pio.o \ pio_copy.o \ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b443642eac02..4d40311f082e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -5222,6 +5222,17 @@ int is_bx(struct hfi1_devdata *dd) return (chip_rev_minor & 0xF0) == 0x10; } +/* return true is kernel urg disabled for rcd */ +bool is_urg_masked(struct hfi1_ctxtdata *rcd) +{ + u64 mask; + u32 is = IS_RCVURGENT_START + rcd->ctxt; + u8 bit = is % 64; + + mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64))); + return !(mask & BIT_ULL(bit)); +} + /* * Append string s to buffer buf. Arguments curp and len are the current * position and remaining length, respectively. diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 6b9c8f12dff8..ba3d99e6e33b 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1,7 +1,7 @@ #ifndef _CHIP_H #define _CHIP_H /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd); u32 hdrqempty(struct hfi1_ctxtdata *rcd); int is_ax(struct hfi1_devdata *dd); int is_bx(struct hfi1_devdata *dd); +bool is_urg_masked(struct hfi1_ctxtdata *rcd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); const char *opa_lstate_name(u32 lstate); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6db2276f5c13..9aa0357e17b7 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -73,6 +73,7 @@ #include "chip_registers.h" #include "common.h" +#include "opfn.h" #include "verbs.h" #include "pio.h" #include "chip.h" @@ -98,6 +99,8 @@ #define NEIGHBOR_TYPE_HFI 0 #define NEIGHBOR_TYPE_SWITCH 1 +#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 + extern unsigned long hfi1_cap_mask; #define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) #define HFI1_CAP_UGET_MASK(mask, cap) \ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 7835eb52e7c5..a8dbd0f191f5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -72,7 +72,6 @@ #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt -#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 /* * min buffers we want to have per context, after driver */ @@ -927,6 +926,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) lastfail = hfi1_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = hfi1_setup_eagerbufs(rcd); + if (!lastfail) + lastfail = hfi1_kern_exp_rcv_init(rcd, reinit); if (lastfail) { dd_dev_err(dd, "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); @@ -1497,6 +1498,12 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + ret = opfn_init(); + if (ret < 0) { + pr_err("Failed to allocate opfn_wq"); + goto bail_dev; + } + /* * These must be called before the driver is registered with * the PCI subsystem. @@ -1527,6 +1534,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + opfn_exit(); node_affinity_destroy_all(); hfi1_dbg_exit(); diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c new file mode 100644 index 000000000000..2ca070690b2f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "opfn.h" + +#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT) + +#define OPFN_CODE(code) BIT((code) - 1) +#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code) + +struct hfi1_opfn_type { + bool (*request)(struct rvt_qp *qp, u64 *data); + bool (*response)(struct rvt_qp *qp, u64 *data); + bool (*reply)(struct rvt_qp *qp, u64 data); + void (*error)(struct rvt_qp *qp); +}; + +static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = { + [STL_VERBS_EXTD_TID_RDMA] = { + .request = tid_rdma_conn_req, + .response = tid_rdma_conn_resp, + .reply = tid_rdma_conn_reply, + .error = tid_rdma_conn_error, + }, +}; + +static struct workqueue_struct *opfn_wq; + +static void opfn_schedule_conn_request(struct rvt_qp *qp); + +static bool hfi1_opfn_extended(u32 bth1) +{ + return !!(bth1 & IB_BTHE_E); +} + +static void opfn_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_atomic_wr wr; + u16 mask, capcode; + struct hfi1_opfn_type *extd; + u64 data; + unsigned long flags; + int ret = 0; + + trace_hfi1_opfn_state_conn_request(qp); + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Exit if the extended bit is not set, or if nothing is requested, or + * if we have completed all requests, or if a previous request is in + * progress + */ + if (!priv->opfn.extended || !priv->opfn.requested || + priv->opfn.requested == priv->opfn.completed || priv->opfn.curr) + goto done; + + mask = priv->opfn.requested & ~priv->opfn.completed; + capcode = ilog2(mask & ~(mask - 1)) + 1; + if (capcode >= STL_VERBS_EXTD_MAX) { + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + extd = &hfi1_opfn_handlers[capcode]; + if (!extd || !extd->request || !extd->request(qp, &data)) { + /* + * Either there is no handler for this capability or the request + * packet could not be generated. Either way, mark it as done so + * we don't keep attempting to complete it. + */ + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + trace_hfi1_opfn_data_conn_request(qp, capcode, data); + data = (data & ~0xf) | capcode; + + memset(&wr, 0, sizeof(wr)); + wr.wr.opcode = IB_WR_OPFN; + wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR; + wr.compare_add = data; + + priv->opfn.curr = capcode; /* A new request is now in progress */ + /* Drop opfn.lock before calling ib_post_send() */ + spin_unlock_irqrestore(&priv->opfn.lock, flags); + + ret = ib_post_send(&qp->ibqp, &wr.wr, NULL); + if (ret) + goto err; + trace_hfi1_opfn_state_conn_request(qp); + return; +err: + trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ", + (u64)ret); + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * In case of an unexpected error return from ib_post_send + * clear opfn.curr and reschedule to try again + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + opfn_schedule_conn_request(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_send_conn_request(struct work_struct *work) +{ + struct hfi1_opfn_data *od; + struct hfi1_qp_priv *qpriv; + + od = container_of(work, struct hfi1_opfn_data, opfn_work); + qpriv = container_of(od, struct hfi1_qp_priv, opfn); + + opfn_conn_request(qpriv->owner); +} + +/* + * When QP s_lock is held in the caller, the OPFN request must be scheduled + * to a different workqueue to avoid double locking QP s_lock in call to + * ib_post_send in opfn_conn_request + */ +static void opfn_schedule_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + trace_hfi1_opfn_state_sched_conn_request(qp); + queue_work(opfn_wq, &priv->opfn.opfn_work); +} + +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth) +{ + struct hfi1_qp_priv *priv = qp->priv; + u64 data = be64_to_cpu(ateth->compare_data); + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + trace_hfi1_opfn_state_conn_response(qp); + capcode = data & 0xf; + trace_hfi1_opfn_data_conn_response(qp, capcode, data); + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->response) { + e->atomic_data = capcode; + return; + } + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (priv->opfn.completed & OPFN_CODE(capcode)) { + /* + * We are receiving a request for a feature that has already + * been negotiated. This may mean that the other side has reset + */ + priv->opfn.completed &= ~OPFN_CODE(capcode); + if (extd->error) + extd->error(qp); + } + + if (extd->response(qp, &data)) + priv->opfn.completed |= OPFN_CODE(capcode); + e->atomic_data = (data & ~0xf) | capcode; + trace_hfi1_opfn_state_conn_response(qp); + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + trace_hfi1_opfn_state_conn_reply(qp); + capcode = data & 0xf; + trace_hfi1_opfn_data_conn_reply(qp, capcode, data); + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Either there is no previous request or the reply is not for the + * current request + */ + if (!priv->opfn.curr || capcode != priv->opfn.curr) + goto done; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->reply) + goto clear; + + if (extd->reply(qp, data)) + priv->opfn.completed |= OPFN_CODE(capcode); +clear: + /* + * Clear opfn.curr to indicate that the previous request is no longer in + * progress + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + trace_hfi1_opfn_state_conn_reply(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd = NULL; + unsigned long flags; + u16 capcode; + + trace_hfi1_opfn_state_conn_error(qp); + trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state); + /* + * The QP has gone into the Error state. We have to invalidate all + * negotiated feature, including the one in progress (if any). The RC + * QP handling will clean the WQE for the connection request. + */ + spin_lock_irqsave(&priv->opfn.lock, flags); + while (priv->opfn.completed) { + capcode = priv->opfn.completed & ~(priv->opfn.completed - 1); + extd = &hfi1_opfn_handlers[ilog2(capcode) + 1]; + if (extd->error) + extd->error(qp); + priv->opfn.completed &= ~OPFN_CODE(capcode); + } + priv->opfn.extended = 0; + priv->opfn.requested = 0; + priv->opfn.curr = STL_VERBS_EXTD_NONE; + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct tid_rdma_params *local = &priv->tid_rdma.local; + + if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) || + qp->pmtu == enum_to_mtu(OPA_MTU_8192)) { + tid_rdma_opfn_init(qp, local); + /* + * We only want to set the OPFN requested bit when the + * QP transitions to RTS. + */ + if (attr_mask & IB_QP_STATE && + attr->qp_state == IB_QPS_RTS) { + priv->opfn.requested |= OPFN_MASK(TID_RDMA); + /* + * If the QP is transitioning to RTS and the + * opfn.completed for TID RDMA has already been + * set, the QP is being moved *back* into RTS. + * We can now renegotiate the TID RDMA + * parameters. + */ + if (priv->opfn.completed & + OPFN_MASK(TID_RDMA)) { + priv->opfn.completed &= + ~OPFN_MASK(TID_RDMA); + /* + * Since the opfn.completed bit was + * already set, it is safe to assume + * that the opfn.extended is also set. + */ + opfn_schedule_conn_request(qp); + } + } + } else { + memset(local, 0, sizeof(*local)); + } + } + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->opfn.extended && hfi1_opfn_extended(bth1) && + HFI1_CAP_IS_KSET(OPFN)) { + priv->opfn.extended = 1; + if (qp->state == IB_QPS_RTS) + opfn_conn_request(qp); + } +} + +int opfn_init(void) +{ + opfn_wq = alloc_workqueue("hfi_opfn", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | + WQ_MEM_RECLAIM, + HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES); + if (!opfn_wq) + return -ENOMEM; + + return 0; +} + +void opfn_exit(void) +{ + if (opfn_wq) { + destroy_workqueue(opfn_wq); + opfn_wq = NULL; + } +} diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h new file mode 100644 index 000000000000..5f2011cabc25 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opfn.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#ifndef _HFI1_OPFN_H +#define _HFI1_OPFN_H + +/** + * DOC: Omni Path Feature Negotion (OPFN) + * + * OPFN is a discovery protocol for Intel Omni-Path fabric that + * allows two RC QPs to negotiate a common feature that both QPs + * can support. Currently, the only OPA feature that OPFN + * supports is TID RDMA. + * + * Architecture + * + * OPFN involves the communication between two QPs on the HFI + * level on an Omni-Path fabric, and ULPs have no knowledge of + * OPFN at all. + * + * Implementation + * + * OPFN extends the existing IB RC protocol with the following + * changes: + * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport + * Header (BTH1) to indicate that the RC QP supports OPFN; + * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and + * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN + * request; The 64-bit data carried with the request/response + * contains the parameters for negotiation and will be + * defined in tid_rdma.c file; + * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN. + * + * The OPFN communication will be triggered when an RC QP + * receives a request with Bit 24 of BTH1 set. The responder QP + * will then post send an OPFN request with its local + * parameters, which will be sent to the requester QP once all + * existing requests on the responder QP side have been sent. + * Once the requester QP receives the OPFN request, it will + * keep a copy of the responder QP's parameters, and return a + * response packet with its own local parameters. The responder + * QP receives the response packet and keeps a copy of the requester + * QP's parameters. After this exchange, each side has the parameters + * for both sides and therefore can select the right parameters + * for future transactions + */ + +/* STL Verbs Extended */ +#define IB_BTHE_E_SHIFT 24 +#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX + +struct ib_atomic_eth; + +enum hfi1_opfn_codes { + STL_VERBS_EXTD_NONE = 0, + STL_VERBS_EXTD_TID_RDMA, + STL_VERBS_EXTD_MAX +}; + +struct hfi1_opfn_data { + u8 extended; + u16 requested; + u16 completed; + enum hfi1_opfn_codes curr; + /* serialize opfn function calls */ + spinlock_t lock; + struct work_struct opfn_work; +}; + +/* WR opcode for OPFN */ +#define IB_WR_OPFN IB_WR_RESERVED3 + +void opfn_send_conn_request(struct work_struct *work); +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth); +void opfn_conn_reply(struct rvt_qp *qp, u64 data); +void opfn_conn_error(struct rvt_qp *qp); +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask); +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1); +int opfn_init(void); +void opfn_exit(void); + +#endif /* _HFI1_OPFN_H */ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 5344e8993b28..f822f92b415f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -132,6 +132,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .qpt_support = BIT(IB_QPT_RC), }, +[IB_WR_OPFN] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_USE_RESERVE, +}, + }; static void flush_list_head(struct list_head *l) @@ -285,6 +291,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); qp_set_16b(qp); } + + opfn_qp_init(qp, attr, attr_mask); } /** @@ -696,6 +704,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_qp_priv_tid_free(rdi, qp); kfree(priv->s_ahg); kfree(priv); } @@ -751,6 +760,10 @@ void notify_qp_reset(struct rvt_qp *qp) { qp->r_adefered = 0; clear_ahg(qp); + + /* Clear any OPFN state */ + if (qp->ibqp.qp_type == IB_QPT_RC) + opfn_conn_error(qp); } /* diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index be603f35d7e4..092d5eba980f 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -57,6 +57,10 @@ /* cut down ridiculously long IB macro names */ #define OP(x) RC_OP(x) +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 psn, u32 pmtu) { @@ -89,8 +93,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct rvt_ack_entry *e; u32 hwords; u32 len; - u32 bth0; - u32 bth2; + u32 bth0, bth2; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; struct hfi1_qp_priv *priv = qp->priv; @@ -122,7 +126,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + if (++qp->s_tail_ack_queue > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) qp->s_tail_ack_queue = 0; /* FALLTHROUGH */ case OP(SEND_ONLY): @@ -229,7 +234,7 @@ normal: ps->s_txreq->sde = priv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; - hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); return 1; bail: @@ -262,8 +267,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct rvt_swqe *wqe; u32 hwords; u32 len; - u32 bth0 = 0; - u32 bth2; + u32 bth0 = 0, bth2; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); u32 pmtu = qp->pmtu; char newreq; int middle = 0; @@ -516,10 +521,14 @@ no_flow_control: goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; } - if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + + /* FALLTHROUGH */ + case IB_WR_OPFN: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_OPFN) { qp->s_state = OP(COMPARE_SWAP); put_ib_ateth_swap(wqe->atomic_wr.swap, &ohdr->u.atomic_eth); @@ -693,6 +702,7 @@ no_flow_control: qp, ohdr, bth0 | (qp->s_state << 24), + bth1, bth2, middle, ps); @@ -796,6 +806,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; bth1 = (!!is_fecn) << IB_BECN_SHIFT; + /* + * Inline ACKs go out without the use of the Verbs send engine, so + * we need to set the STL Verbs Extended bit here + */ + bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); } @@ -1033,6 +1048,7 @@ done: */ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) { + struct hfi1_qp_priv *priv = qp->priv; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); struct hfi1_ibport *ibp; @@ -1043,8 +1059,26 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); - rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + /* + * We need special handling for the OPFN request WQEs as + * they are not allowed to generate real user errors + */ + if (wqe->wr.opcode == IB_WR_OPFN) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + /* + * Call opfn_conn_reply() with capcode and + * remaining data as 0 to close out the + * current request + */ + opfn_conn_reply(qp, priv->opfn.curr); + wqe = do_rc_completion(qp, wqe, ibp); + qp->s_flags &= ~RVT_S_WAIT_ACK; + } else { + rvt_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } return; } else { /* need to handle delayed completion */ return; @@ -1356,6 +1390,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 *vaddr = wqe->sg_list[0].vaddr; *vaddr = val; } + if (wqe->wr.opcode == IB_WR_OPFN) + opfn_conn_reply(qp, val); + if (qp->s_num_rd_atomic && (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || @@ -1812,7 +1849,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, if (i) prev = i - 1; else - prev = HFI1_MAX_RDMA_ATOMIC; + prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); if (prev == qp->r_head_ack_queue) { e = NULL; break; @@ -1936,7 +1973,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) unsigned next; next = n + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); @@ -2061,6 +2098,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) return; fecn = process_ecn(qp, packet); + opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); /* * Process responses (ACKs) before anything else. Note that the @@ -2292,8 +2330,8 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto nack_inv; next = qp->r_head_ack_queue + 1; - /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ - if (next > HFI1_MAX_RDMA_ATOMIC) + /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { @@ -2356,18 +2394,21 @@ send_last: case OP(COMPARE_SWAP): case OP(FETCH_ADD): { - struct ib_atomic_eth *ateth; + struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; + u64 vaddr = get_ib_ateth_vaddr(ateth); + bool opfn = opcode == OP(COMPARE_SWAP) && + vaddr == HFI1_VERBS_E_ATOMIC_VADDR; struct rvt_ack_entry *e; - u64 vaddr; atomic64_t *maddr; u64 sdata; u32 rkey; u8 next; - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !opfn)) goto nack_inv; next = qp->r_head_ack_queue + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { @@ -2380,8 +2421,11 @@ send_last: rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } - ateth = &ohdr->u.atomic_eth; - vaddr = get_ib_ateth_vaddr(ateth); + /* Process OPFN special virtual address */ + if (opfn) { + opfn_conn_response(qp, e, ateth); + goto ack; + } if (unlikely(vaddr & (sizeof(u64) - 1))) goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); @@ -2400,6 +2444,7 @@ send_last: sdata); rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; +ack: e->opcode = opcode; e->sent = 0; e->psn = psn; diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 7fb317c711df..f96c0f544cb0 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, struct ib_other_headers *ohdr, u32 bth0, u32 bth1, u32 bth2) { - bth1 |= qp->remote_qpn; ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(bth1); ohdr->bth[2] = cpu_to_be32(bth2); @@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, */ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, + int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 bth1 = 0; u32 slid; u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); u8 l4 = OPA_16B_L4_IB_LOCAL; @@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, */ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, + int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; - u32 bth1 = 0; u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); u16 lrh0 = HFI1_LRH_BTH; u8 extra_bytes = -ps->s_txreq->s_cur_size & 3; @@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); /* We support only two types - 9B and 16B for now */ @@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = { }; void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; @@ -446,7 +445,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, priv->s_ahg->ahgidx = 0; /* Make the appropriate header */ - hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps); + hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle, + ps); } /* when sending, force a reschedule every one of these periods */ diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index da1ecb68a928..e8f57c0cd8bc 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -7,6 +7,211 @@ #include "hfi.h" #include "verbs.h" #include "tid_rdma.h" +#include "trace.h" + +/* + * J_KEY for kernel contexts when TID RDMA is used. + * See generate_jkey() in hfi.h for more information. + */ +#define TID_RDMA_JKEY 32 +#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE +#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) + +#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 +#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 + +#define TID_OPFN_QP_CTXT_MASK 0xff +#define TID_OPFN_QP_CTXT_SHIFT 56 +#define TID_OPFN_QP_KDETH_MASK 0xff +#define TID_OPFN_QP_KDETH_SHIFT 48 +#define TID_OPFN_MAX_LEN_MASK 0x7ff +#define TID_OPFN_MAX_LEN_SHIFT 37 +#define TID_OPFN_TIMEOUT_MASK 0x1f +#define TID_OPFN_TIMEOUT_SHIFT 32 +#define TID_OPFN_RESERVED_MASK 0x3f +#define TID_OPFN_RESERVED_SHIFT 26 +#define TID_OPFN_URG_MASK 0x1 +#define TID_OPFN_URG_SHIFT 25 +#define TID_OPFN_VER_MASK 0x7 +#define TID_OPFN_VER_SHIFT 22 +#define TID_OPFN_JKEY_MASK 0x3f +#define TID_OPFN_JKEY_SHIFT 16 +#define TID_OPFN_MAX_READ_MASK 0x3f +#define TID_OPFN_MAX_READ_SHIFT 10 +#define TID_OPFN_MAX_WRITE_MASK 0x3f +#define TID_OPFN_MAX_WRITE_SHIFT 4 + +/* + * OPFN TID layout + * + * 63 47 31 15 + * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC + * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 + * N - the context Number + * K - the Kdeth_qp + * M - Max_len + * T - Timeout + * D - reserveD + * V - version + * U - Urg capable + * J - Jkey + * R - max_Read + * W - max_Write + * C - Capcode + */ + +static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) +{ + return + (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << + TID_OPFN_QP_CTXT_SHIFT) | + ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << + TID_OPFN_QP_KDETH_SHIFT) | + (((u64)((p->max_len >> PAGE_SHIFT) - 1) & + TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | + (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << + TID_OPFN_TIMEOUT_SHIFT) | + (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | + (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | + (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << + TID_OPFN_MAX_READ_SHIFT) | + (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << + TID_OPFN_MAX_WRITE_SHIFT); +} + +static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) +{ + p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & + TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; + p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; + p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & + TID_OPFN_MAX_WRITE_MASK; + p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & + TID_OPFN_MAX_READ_MASK; + p->qp = + ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) + << 16) | + ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); + p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; + p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; +} + +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) +{ + struct hfi1_qp_priv *priv = qp->priv; + + p->qp = (kdeth_qp << 16) | priv->rcd->ctxt; + p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; + p->jkey = priv->rcd->jkey; + p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; + p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; + p->timeout = qp->timeout; + p->urg = is_urg_masked(priv->rcd); +} + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) +{ + struct hfi1_qp_priv *priv = qp->priv; + + *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); + return true; +} + +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *remote, *old; + bool ret = true; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + data &= ~0xfULL; + /* + * If data passed in is zero, return true so as not to continue the + * negotiation process + */ + if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) + goto null; + /* + * If kzalloc fails, return false. This will result in: + * * at the requester a new OPFN request being generated to retry + * the negotiation + * * at the responder, 0 being returned to the requester so as to + * disable TID RDMA at both the requester and the responder + */ + remote = kzalloc(sizeof(*remote), GFP_ATOMIC); + if (!remote) { + ret = false; + goto null; + } + + tid_rdma_opfn_decode(remote, data); + priv->tid_timer_timeout_jiffies = + usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / + 1000UL) << 3) * 7); + trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); + trace_hfi1_opfn_param(qp, 1, remote); + rcu_assign_pointer(priv->tid_rdma.remote, remote); + /* + * A TID RDMA READ request's segment size is not equal to + * remote->max_len only when the request's data length is smaller + * than remote->max_len. In that case, there will be only one segment. + * Therefore, when priv->pkts_ps is used to calculate req->cur_seg + * during retry, it will lead to req->cur_seg = 0, which is exactly + * what is expected. + */ + priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); + priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; + goto free; +null: + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + priv->timeout_shift = 0; +free: + if (old) + kfree_rcu(old, rcu_head); + return ret; +} + +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) +{ + bool ret; + + ret = tid_rdma_conn_reply(qp, *data); + *data = 0; + /* + * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate + * TID RDMA could not be enabled. This will result in TID RDMA being + * disabled at the requester too. + */ + if (ret) + (void)tid_rdma_conn_req(qp, data); + return ret; +} + +void tid_rdma_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *old; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + if (old) + kfree_rcu(old, rcu_head); +} + +/* This is called at context initialization time */ +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) +{ + if (reinit) + return 0; + + BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); + BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); + rcd->jkey = TID_RDMA_JKEY; + hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); + return 0; +} /** * qp_to_rcd - determine the receive context used by a qp @@ -44,5 +249,16 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->rcd = qp_to_rcd(rdi, qp); + spin_lock_init(&qpriv->opfn.lock); + INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); + return 0; } + +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) + cancel_work_sync(&priv->opfn.opfn_work); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 6fcd3adcdcc3..ee8151558e3f 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -6,8 +6,35 @@ #ifndef HFI1_TID_RDMA_H #define HFI1_TID_RDMA_H +#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ + +struct tid_rdma_params { + struct rcu_head rcu_head; + u32 qp; + u32 max_len; + u16 jkey; + u8 max_read; + u8 max_write; + u8 timeout; + u8 urg; + u8 version; +}; + +struct tid_rdma_qp_params { + struct tid_rdma_params local; + struct tid_rdma_params __rcu *remote; +}; + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); +void tid_rdma_conn_error(struct rvt_qp *qp); +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); + +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); + int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); #endif /* HFI1_TID_RDMA_H */ - diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 84458f1325e1..1ce551864118 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -63,3 +63,4 @@ __print_symbolic(etype, \ #include "trace_tx.h" #include "trace_mmu.h" #include "trace_iowait.h" +#include "trace_tid.h" diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index 7eceb57e0415..3cec960e9674 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -128,111 +128,6 @@ TRACE_EVENT(hfi1_receive_interrupt, ) ); -DECLARE_EVENT_CLASS( - hfi1_exp_tid_reg_unreg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, - u32 npages, unsigned long va, unsigned long pa, - dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), - TP_STRUCT__entry( - __field(unsigned int, ctxt) - __field(u16, subctxt) - __field(u32, rarr) - __field(u32, npages) - __field(unsigned long, va) - __field(unsigned long, pa) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->va = va; - __entry->pa = pa; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->pa, - __entry->va, - __entry->dma - ) - ); - -DEFINE_EVENT( - hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); - -DEFINE_EVENT( - hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, - TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, - unsigned long va, unsigned long pa, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); - -TRACE_EVENT( - hfi1_put_tid, - TP_PROTO(struct hfi1_devdata *dd, - u32 index, u32 type, unsigned long pa, u16 order), - TP_ARGS(dd, index, type, pa, order), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(unsigned long, pa); - __field(u32, index); - __field(u32, type); - __field(u16, order); - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->pa = pa; - __entry->index = index; - __entry->type = type; - __entry->order = order; - ), - TP_printk("[%s] type %s pa %lx index %u order %u", - __get_str(dev), - show_tidtype(__entry->type), - __entry->pa, - __entry->index, - __entry->order - ) -); - -TRACE_EVENT(hfi1_exp_tid_inval, - TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, - u32 npages, dma_addr_t dma), - TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), - TP_STRUCT__entry( - __field(unsigned int, ctxt) - __field(u16, subctxt) - __field(unsigned long, va) - __field(u32, rarr) - __field(u32, npages) - __field(dma_addr_t, dma) - ), - TP_fast_assign( - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->va = va; - __entry->rarr = rarr; - __entry->npages = npages; - __entry->dma = dma; - ), - TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", - __entry->ctxt, - __entry->subctxt, - __entry->rarr, - __entry->npages, - __entry->va, - __entry->dma - ) - ); - TRACE_EVENT(hfi1_mmu_invalidate, TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type, unsigned long start, unsigned long end), diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h new file mode 100644 index 000000000000..57a973c97cde --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TID_H + +#include +#include + +#include "hfi.h" + +#define tidtype_name(type) { PT_##type, #type } +#define show_tidtype(type) \ +__print_symbolic(type, \ + tidtype_name(EXPECTED), \ + tidtype_name(EAGER), \ + tidtype_name(INVALID)) \ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tid + +#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \ + "max write %u, max length %u, jkey 0x%x timeout %u " \ + "urg %u" + +DECLARE_EVENT_CLASS(/* class */ + hfi1_exp_tid_reg_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry(/* entry */ + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign(/* assign */ + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) +); + +DEFINE_EVENT(/* exp_tid_unreg */ + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma) +); + +DEFINE_EVENT(/* exp_tid_reg */ + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma) +); + +TRACE_EVENT(/* put_tid */ + hfi1_put_tid, + TP_PROTO(struct hfi1_devdata *dd, + u32 index, u32 type, unsigned long pa, u16 order), + TP_ARGS(dd, index, type, pa, order), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd) + __field(unsigned long, pa); + __field(u32, index); + __field(u32, type); + __field(u16, order); + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd); + __entry->pa = pa; + __entry->index = index; + __entry->type = type; + __entry->order = order; + ), + TP_printk("[%s] type %s pa %lx index %u order %u", + __get_str(dev), + show_tidtype(__entry->type), + __entry->pa, + __entry->index, + __entry->order + ) +); + +TRACE_EVENT(/* exp_tid_inval */ + hfi1_exp_tid_inval, + TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), + TP_STRUCT__entry(/* entry */ + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) + ), + TP_fast_assign(/* assign */ + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma + ) +); + +DECLARE_EVENT_CLASS(/* opfn_state */ + hfi1_opfn_state_template, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u16, requested) + __field(u16, completed) + __field(u8, curr) + ), + TP_fast_assign(/* assign */ + struct hfi1_qp_priv *priv = qp->priv; + + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->requested = priv->opfn.requested; + __entry->completed = priv->opfn.completed; + __entry->curr = priv->opfn.curr; + ), + TP_printk(/* print */ + "[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x", + __get_str(dev), + __entry->qpn, + __entry->requested, + __entry->completed, + __entry->curr + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_request, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_sched_conn_request, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_response, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_reply, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_state_template, hfi1_opfn_state_conn_error, + TP_PROTO(struct rvt_qp *qp), + TP_ARGS(qp) +); + +DECLARE_EVENT_CLASS(/* opfn_data */ + hfi1_opfn_data_template, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, state) + __field(u8, capcode) + __field(u64, data) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->state = qp->state; + __entry->capcode = capcode; + __entry->data = data; + ), + TP_printk(/* printk */ + "[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx", + __get_str(dev), + __entry->qpn, + __entry->state, + __entry->capcode, + __entry->data + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_request, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_response, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_data_template, hfi1_opfn_data_conn_reply, + TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data), + TP_ARGS(qp, capcode, data) +); + +DECLARE_EVENT_CLASS(/* opfn_param */ + hfi1_opfn_param_template, + TP_PROTO(struct rvt_qp *qp, char remote, + struct tid_rdma_params *param), + TP_ARGS(qp, remote, param), + TP_STRUCT__entry(/* entry */ + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(char, remote) + __field(u32, param_qp) + __field(u32, max_len) + __field(u16, jkey) + __field(u8, max_read) + __field(u8, max_write) + __field(u8, timeout) + __field(u8, urg) + ), + TP_fast_assign(/* assign */ + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); + __entry->qpn = qp->ibqp.qp_num; + __entry->remote = remote; + __entry->param_qp = param->qp; + __entry->max_len = param->max_len; + __entry->jkey = param->jkey; + __entry->max_read = param->max_read; + __entry->max_write = param->max_write; + __entry->timeout = param->timeout; + __entry->urg = param->urg; + ), + TP_printk(/* print */ + OPFN_PARAM_PRN, + __get_str(dev), + __entry->qpn, + __entry->remote ? "remote" : "local", + __entry->param_qp, + __entry->max_read, + __entry->max_write, + __entry->max_len, + __entry->jkey, + __entry->timeout, + __entry->urg + ) +); + +DEFINE_EVENT(/* event */ + hfi1_opfn_param_template, hfi1_opfn_param, + TP_PROTO(struct rvt_qp *qp, char remote, + struct tid_rdma_params *param), + TP_ARGS(qp, remote, param) +); + +DECLARE_EVENT_CLASS(/* msg */ + hfi1_msg_template, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more), + TP_STRUCT__entry(/* entry */ + __field(u32, qpn) + __string(msg, msg) + __field(u64, more) + ), + TP_fast_assign(/* assign */ + __entry->qpn = qp ? qp->ibqp.qp_num : 0; + __assign_str(msg, msg); + __entry->more = more; + ), + TP_printk(/* print */ + "qpn 0x%x %s 0x%llx", + __entry->qpn, + __get_str(msg), + __entry->more + ) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_opfn_conn_request, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +DEFINE_EVENT(/* event */ + hfi1_msg_template, hfi1_msg_opfn_conn_error, + TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more), + TP_ARGS(qp, msg, more) +); + +#endif /* __HFI1_TRACE_TID_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tid +#include diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 6ba47037c424..4ed4fcfabd6c 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -271,7 +271,8 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ps->s_txreq->ss = &qp->s_sge; ps->s_txreq->s_cur_size = len; hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), - mask_psn(qp->s_psn++), middle, ps); + qp->remote_qpn, mask_psn(qp->s_psn++), + middle, ps); return 1; done_free_tx: diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c980345cf1e1..571bfd549c2a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1735,6 +1735,8 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.reserved_operations = 1; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 1ad0b14bdb3c..c8baa1e38ff6 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -72,6 +72,7 @@ struct hfi1_packet; #include "iowait.h" #include "tid_rdma.h" +#include "opfn.h" #define HFI1_MAX_RDMA_ATOMIC 16 @@ -160,8 +161,13 @@ struct hfi1_qp_priv { struct hfi1_ctxtdata *rcd; /* QP's receive context */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; + struct hfi1_opfn_data opfn; + struct tid_rdma_qp_params tid_rdma; struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ + unsigned long tid_timer_timeout_jiffies; + u16 pkts_ps; /* packets per segment */ + u8 timeout_shift; /* account for number of packets per segment */ }; /* @@ -356,7 +362,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, const struct ib_global_route *grh, u32 hwords, u32 nwords); void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); void _hfi1_do_send(struct work_struct *work); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index acb3bc96dfa7..168e40be183c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -182,6 +182,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 extra_rdma_atomic; u8 reserved_operations; }; @@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) */ static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) { - return rdi->dparms.max_rdma_atomic + 1; + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic + 1; +} + +static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic; } /*