Merge branch 'opfn' into hfi1-tid

This series adds the OPFN feature, which TID RDMA uses as its negotiation
protocol. OPFN is an entirely hidden, in-band negotiation exchange that
happens on the consumer's queue pair without the consumer's knowledge.
For that reason, completions for OPFN transfers must be filtered out of
the completion queue and never delivered to the consumer.  This feature
does not change any consumer-facing APIs, but it does change the
driver-to-driver wire API.

At a high level, OPFN exchanges parameters between two hosts by issuing
IB compare-and-swap requests to a special virtual address. The request
uses a reserved IB work request opcode (see patch 3).
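
As a rough sketch of that exchange, condensed from opfn_conn_request() in
opfn.c below: the requester packs its negotiation payload plus a 4-bit
capability code into the compare value of an atomic work request aimed at
the reserved address, and posts it on the consumer's QP. All names here
(data, capcode, the OPFN defines) come from this series; nothing new is
introduced.

/* Illustrative sketch only, condensed from opfn_conn_request() in opfn.c. */
struct ib_atomic_wr wr;

memset(&wr, 0, sizeof(wr));
wr.wr.opcode = IB_WR_OPFN;                  /* reserved opcode, IB_WR_RESERVED3 */
wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR; /* U64_MAX, the special virtual address */
wr.compare_add = (data & ~0xf) | capcode;   /* feature parameters + capability code */
ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);

Because the request is posted on a reserved work-request slot
(RVT_OPERATION_USE_RESERVE) with a reserved opcode, its completion never
reaches the consumer's completion queue.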

* opfn:
  IB/hfi1: Add static trace for OPFN
  IB/hfi1: Integrate OPFN into RC transactions
  IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue
  IB/hfi1: OPFN interface
  IB/hfi1: Add OPFN helper functions for TID RDMA feature
  IB/hfi1: OPFN support discovery

Signed-off-by: Doug Ledford <dledford@redhat.com>
Doug Ledford 2019-01-31 11:56:25 -05:00
Parents: db421a5499 a131d16460
Commit: 2a6423961e
No key found matching this signature; GPG key ID: B826A3330E572FDD
19 changed files with 1114 additions and 141 deletions


@@ -24,6 +24,7 @@ hfi1-y := \
 mad.o \
 mmu_rb.o \
 msix.o \
+opfn.o \
 pcie.o \
 pio.o \
 pio_copy.o \


@@ -5222,6 +5222,17 @@ int is_bx(struct hfi1_devdata *dd)
return (chip_rev_minor & 0xF0) == 0x10;
}

+/* return true if kernel urg is disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+u64 mask;
+u32 is = IS_RCVURGENT_START + rcd->ctxt;
+u8 bit = is % 64;
+
+mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+return !(mask & BIT_ULL(bit));
+}
+
/*
* Append string s to buffer buf. Arguments curp and len are the current
* position and remaining length, respectively.


@@ -1,7 +1,7 @@
#ifndef _CHIP_H
#define _CHIP_H
/*
-* Copyright(c) 2015 - 2017 Intel Corporation.
+* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd);
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);


@@ -73,6 +73,7 @@
#include "chip_registers.h"
#include "common.h"
+#include "opfn.h"
#include "verbs.h"
#include "pio.h"
#include "chip.h"
@@ -98,6 +99,8 @@
#define NEIGHBOR_TYPE_HFI 0
#define NEIGHBOR_TYPE_SWITCH 1

+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
+
extern unsigned long hfi1_cap_mask;
#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
#define HFI1_CAP_UGET_MASK(mask, cap) \


@@ -72,7 +72,6 @@
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt

-#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
/*
* min buffers we want to have per context, after driver
*/
@@ -927,6 +926,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
+if (!lastfail)
+lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
@@ -1497,6 +1498,12 @@ static int __init hfi1_mod_init(void)
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;

+ret = opfn_init();
+if (ret < 0) {
+pr_err("Failed to allocate opfn_wq");
+goto bail_dev;
+}
+
/*
* These must be called before the driver is registered with
* the PCI subsystem.
@@ -1527,6 +1534,7 @@ module_init(hfi1_mod_init);
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
+opfn_exit();
node_affinity_destroy_all();
hfi1_dbg_exit();


@@ -0,0 +1,318 @@
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#include "hfi.h"
#include "trace.h"
#include "qp.h"
#include "opfn.h"
#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT)
#define OPFN_CODE(code) BIT((code) - 1)
#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
struct hfi1_opfn_type {
bool (*request)(struct rvt_qp *qp, u64 *data);
bool (*response)(struct rvt_qp *qp, u64 *data);
bool (*reply)(struct rvt_qp *qp, u64 data);
void (*error)(struct rvt_qp *qp);
};
static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
[STL_VERBS_EXTD_TID_RDMA] = {
.request = tid_rdma_conn_req,
.response = tid_rdma_conn_resp,
.reply = tid_rdma_conn_reply,
.error = tid_rdma_conn_error,
},
};
static struct workqueue_struct *opfn_wq;
static void opfn_schedule_conn_request(struct rvt_qp *qp);
static bool hfi1_opfn_extended(u32 bth1)
{
return !!(bth1 & IB_BTHE_E);
}
static void opfn_conn_request(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct ib_atomic_wr wr;
u16 mask, capcode;
struct hfi1_opfn_type *extd;
u64 data;
unsigned long flags;
int ret = 0;
trace_hfi1_opfn_state_conn_request(qp);
spin_lock_irqsave(&priv->opfn.lock, flags);
/*
* Exit if the extended bit is not set, or if nothing is requested, or
* if we have completed all requests, or if a previous request is in
* progress
*/
if (!priv->opfn.extended || !priv->opfn.requested ||
priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
goto done;
mask = priv->opfn.requested & ~priv->opfn.completed;
capcode = ilog2(mask & ~(mask - 1)) + 1;
if (capcode >= STL_VERBS_EXTD_MAX) {
priv->opfn.completed |= OPFN_CODE(capcode);
goto done;
}
extd = &hfi1_opfn_handlers[capcode];
if (!extd || !extd->request || !extd->request(qp, &data)) {
/*
* Either there is no handler for this capability or the request
* packet could not be generated. Either way, mark it as done so
* we don't keep attempting to complete it.
*/
priv->opfn.completed |= OPFN_CODE(capcode);
goto done;
}
trace_hfi1_opfn_data_conn_request(qp, capcode, data);
data = (data & ~0xf) | capcode;
memset(&wr, 0, sizeof(wr));
wr.wr.opcode = IB_WR_OPFN;
wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
wr.compare_add = data;
priv->opfn.curr = capcode; /* A new request is now in progress */
/* Drop opfn.lock before calling ib_post_send() */
spin_unlock_irqrestore(&priv->opfn.lock, flags);
ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
if (ret)
goto err;
trace_hfi1_opfn_state_conn_request(qp);
return;
err:
trace_hfi1_msg_opfn_conn_request(qp, "ib_post_send failed: ret = ",
(u64)ret);
spin_lock_irqsave(&priv->opfn.lock, flags);
/*
* In case of an unexpected error return from ib_post_send
* clear opfn.curr and reschedule to try again
*/
priv->opfn.curr = STL_VERBS_EXTD_NONE;
opfn_schedule_conn_request(qp);
done:
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_send_conn_request(struct work_struct *work)
{
struct hfi1_opfn_data *od;
struct hfi1_qp_priv *qpriv;
od = container_of(work, struct hfi1_opfn_data, opfn_work);
qpriv = container_of(od, struct hfi1_qp_priv, opfn);
opfn_conn_request(qpriv->owner);
}
/*
* When QP s_lock is held in the caller, the OPFN request must be scheduled
* to a different workqueue to avoid double locking QP s_lock in call to
* ib_post_send in opfn_conn_request
*/
static void opfn_schedule_conn_request(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
trace_hfi1_opfn_state_sched_conn_request(qp);
queue_work(opfn_wq, &priv->opfn.opfn_work);
}
void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
struct ib_atomic_eth *ateth)
{
struct hfi1_qp_priv *priv = qp->priv;
u64 data = be64_to_cpu(ateth->compare_data);
struct hfi1_opfn_type *extd;
u8 capcode;
unsigned long flags;
trace_hfi1_opfn_state_conn_response(qp);
capcode = data & 0xf;
trace_hfi1_opfn_data_conn_response(qp, capcode, data);
if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
return;
extd = &hfi1_opfn_handlers[capcode];
if (!extd || !extd->response) {
e->atomic_data = capcode;
return;
}
spin_lock_irqsave(&priv->opfn.lock, flags);
if (priv->opfn.completed & OPFN_CODE(capcode)) {
/*
* We are receiving a request for a feature that has already
* been negotiated. This may mean that the other side has reset
*/
priv->opfn.completed &= ~OPFN_CODE(capcode);
if (extd->error)
extd->error(qp);
}
if (extd->response(qp, &data))
priv->opfn.completed |= OPFN_CODE(capcode);
e->atomic_data = (data & ~0xf) | capcode;
trace_hfi1_opfn_state_conn_response(qp);
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_conn_reply(struct rvt_qp *qp, u64 data)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_opfn_type *extd;
u8 capcode;
unsigned long flags;
trace_hfi1_opfn_state_conn_reply(qp);
capcode = data & 0xf;
trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
return;
spin_lock_irqsave(&priv->opfn.lock, flags);
/*
* Either there is no previous request or the reply is not for the
* current request
*/
if (!priv->opfn.curr || capcode != priv->opfn.curr)
goto done;
extd = &hfi1_opfn_handlers[capcode];
if (!extd || !extd->reply)
goto clear;
if (extd->reply(qp, data))
priv->opfn.completed |= OPFN_CODE(capcode);
clear:
/*
* Clear opfn.curr to indicate that the previous request is no longer in
* progress
*/
priv->opfn.curr = STL_VERBS_EXTD_NONE;
trace_hfi1_opfn_state_conn_reply(qp);
done:
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_conn_error(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_opfn_type *extd = NULL;
unsigned long flags;
u16 capcode;
trace_hfi1_opfn_state_conn_error(qp);
trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
/*
* The QP has gone into the Error state. We have to invalidate all
* negotiated features, including the one in progress (if any). The RC
* QP handling will clean the WQE for the connection request.
*/
spin_lock_irqsave(&priv->opfn.lock, flags);
while (priv->opfn.completed) {
capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
if (extd->error)
extd->error(qp);
priv->opfn.completed &= ~OPFN_CODE(capcode);
}
priv->opfn.extended = 0;
priv->opfn.requested = 0;
priv->opfn.curr = STL_VERBS_EXTD_NONE;
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
{
struct ib_qp *ibqp = &qp->ibqp;
struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
spin_lock_irqsave(&priv->opfn.lock, flags);
if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
struct tid_rdma_params *local = &priv->tid_rdma.local;
if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
tid_rdma_opfn_init(qp, local);
/*
* We only want to set the OPFN requested bit when the
* QP transitions to RTS.
*/
if (attr_mask & IB_QP_STATE &&
attr->qp_state == IB_QPS_RTS) {
priv->opfn.requested |= OPFN_MASK(TID_RDMA);
/*
* If the QP is transitioning to RTS and the
* opfn.completed for TID RDMA has already been
* set, the QP is being moved *back* into RTS.
* We can now renegotiate the TID RDMA
* parameters.
*/
if (priv->opfn.completed &
OPFN_MASK(TID_RDMA)) {
priv->opfn.completed &=
~OPFN_MASK(TID_RDMA);
/*
* Since the opfn.completed bit was
* already set, it is safe to assume
* that the opfn.extended is also set.
*/
opfn_schedule_conn_request(qp);
}
}
} else {
memset(local, 0, sizeof(*local));
}
}
spin_unlock_irqrestore(&priv->opfn.lock, flags);
}
void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
{
struct hfi1_qp_priv *priv = qp->priv;
if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
HFI1_CAP_IS_KSET(OPFN)) {
priv->opfn.extended = 1;
if (qp->state == IB_QPS_RTS)
opfn_conn_request(qp);
}
}
int opfn_init(void)
{
opfn_wq = alloc_workqueue("hfi_opfn",
WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
WQ_MEM_RECLAIM,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
if (!opfn_wq)
return -ENOMEM;
return 0;
}
void opfn_exit(void)
{
if (opfn_wq) {
destroy_workqueue(opfn_wq);
opfn_wq = NULL;
}
}


@@ -0,0 +1,85 @@
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#ifndef _HFI1_OPFN_H
#define _HFI1_OPFN_H
/**
* DOC: Omni Path Feature Negotiation (OPFN)
*
* OPFN is a discovery protocol for Intel Omni-Path fabric that
* allows two RC QPs to negotiate a common feature that both QPs
* can support. Currently, the only OPA feature that OPFN
* supports is TID RDMA.
*
* Architecture
*
* OPFN involves the communication between two QPs on the HFI
* level on an Omni-Path fabric, and ULPs have no knowledge of
* OPFN at all.
*
* Implementation
*
* OPFN extends the existing IB RC protocol with the following
* changes:
* -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
* Header (BTH1) to indicate that the RC QP supports OPFN;
* -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
* the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
* request; The 64-bit data carried with the request/response
* contains the parameters for negotiation and will be
* defined in tid_rdma.c file;
* -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
*
* The OPFN communication will be triggered when an RC QP
* receives a request with Bit 24 of BTH1 set. The responder QP
* will then post an OPFN request with its local
* parameters, which will be sent to the requester QP once all
* existing requests on the responder QP side have been sent.
* Once the requester QP receives the OPFN request, it will
* keep a copy of the responder QP's parameters, and return a
* response packet with its own local parameters. The responder
* QP receives the response packet and keeps a copy of the requester
* QP's parameters. After this exchange, each side has the parameters
* for both sides and therefore can select the right parameters
* for future transactions
*/
/* STL Verbs Extended */
#define IB_BTHE_E_SHIFT 24
#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
struct ib_atomic_eth;
enum hfi1_opfn_codes {
STL_VERBS_EXTD_NONE = 0,
STL_VERBS_EXTD_TID_RDMA,
STL_VERBS_EXTD_MAX
};
struct hfi1_opfn_data {
u8 extended;
u16 requested;
u16 completed;
enum hfi1_opfn_codes curr;
/* serialize opfn function calls */
spinlock_t lock;
struct work_struct opfn_work;
};
/* WR opcode for OPFN */
#define IB_WR_OPFN IB_WR_RESERVED3
void opfn_send_conn_request(struct work_struct *work);
void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
struct ib_atomic_eth *ateth);
void opfn_conn_reply(struct rvt_qp *qp, u64 data);
void opfn_conn_error(struct rvt_qp *qp);
void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
int opfn_init(void);
void opfn_exit(void);
#endif /* _HFI1_OPFN_H */
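
To tie the DOC text above to the data path: the responder recognizes an
OPFN exchange purely from the COMPARE_SWAP opcode plus the reserved
virtual address, and diverts it to opfn_conn_response() instead of doing
an rkey/MR lookup. A minimal sketch, condensed from the hfi1_rc_rcv()
change later in this diff (e, ohdr and opcode are the usual RC receive
locals):

struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
u64 vaddr = get_ib_ateth_vaddr(ateth);
bool opfn = opcode == OP(COMPARE_SWAP) &&
            vaddr == HFI1_VERBS_E_ATOMIC_VADDR;

if (opfn) {
        /* Handle the negotiation locally; no rkey check, no consumer completion */
        opfn_conn_response(qp, e, ateth);
        goto ack;   /* reply with the local parameters in the atomic ACK */
}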


@@ -132,6 +132,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.qpt_support = BIT(IB_QPT_RC),
},

+[IB_WR_OPFN] = {
+.length = sizeof(struct ib_atomic_wr),
+.qpt_support = BIT(IB_QPT_RC),
+.flags = RVT_OPERATION_USE_RESERVE,
+},
+
};

static void flush_list_head(struct list_head *l)
@@ -285,6 +291,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
qp_set_16b(qp);
}
+
+opfn_qp_init(qp, attr, attr_mask);
}

/**
@@ -696,6 +704,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;

+hfi1_qp_priv_tid_free(rdi, qp);
kfree(priv->s_ahg);
kfree(priv);
}
@@ -751,6 +760,10 @@ void notify_qp_reset(struct rvt_qp *qp)
{
qp->r_adefered = 0;
clear_ahg(qp);
+
+/* Clear any OPFN state */
+if (qp->ibqp.qp_type == IB_QPT_RC)
+opfn_conn_error(qp);
}

/*


@@ -57,6 +57,10 @@
/* cut down ridiculously long IB macro names */
#define OP(x) RC_OP(x)

+static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+struct rvt_swqe *wqe,
+struct hfi1_ibport *ibp);
+
static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
u32 psn, u32 pmtu)
{
@@ -89,8 +93,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
struct rvt_ack_entry *e;
u32 hwords;
u32 len;
-u32 bth0;
-u32 bth2;
+u32 bth0, bth2;
+u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
struct hfi1_qp_priv *priv = qp->priv;
@@ -122,7 +126,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
* response has been sent instead of only being
* constructed.
*/
-if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+if (++qp->s_tail_ack_queue >
+rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
qp->s_tail_ack_queue = 0;
/* FALLTHROUGH */
case OP(SEND_ONLY):
@@ -229,7 +234,7 @@ normal:
ps->s_txreq->sde = priv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
-hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;

bail:
@@ -262,8 +267,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
struct rvt_swqe *wqe;
u32 hwords;
u32 len;
-u32 bth0 = 0;
-u32 bth2;
+u32 bth0 = 0, bth2;
+u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
@@ -516,10 +521,14 @@ no_flow_control:
goto bail;
}
qp->s_num_rd_atomic++;
-if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
-qp->s_lsn++;
}
-if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+/* FALLTHROUGH */
+case IB_WR_OPFN:
+if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+qp->s_lsn++;
+if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+wqe->wr.opcode == IB_WR_OPFN) {
qp->s_state = OP(COMPARE_SWAP);
put_ib_ateth_swap(wqe->atomic_wr.swap,
&ohdr->u.atomic_eth);
@@ -693,6 +702,7 @@ no_flow_control:
qp,
ohdr,
bth0 | (qp->s_state << 24),
+bth1,
bth2,
middle,
ps);
@@ -796,6 +806,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
if (qp->s_mig_state == IB_MIG_MIGRATED)
bth0 |= IB_BTH_MIG_REQ;
bth1 = (!!is_fecn) << IB_BECN_SHIFT;
+/*
+ * Inline ACKs go out without the use of the Verbs send engine, so
+ * we need to set the STL Verbs Extended bit here
+ */
+bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}

@@ -1033,6 +1048,7 @@ done:
*/
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
+struct hfi1_qp_priv *priv = qp->priv;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
struct hfi1_ibport *ibp;

@@ -1043,8 +1059,26 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
hfi1_migrate_qp(qp);
qp->s_retry = qp->s_retry_cnt;
} else if (qp->s_last == qp->s_acked) {
-rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+/*
+ * We need special handling for the OPFN request WQEs as
+ * they are not allowed to generate real user errors
+ */
+if (wqe->wr.opcode == IB_WR_OPFN) {
+struct hfi1_ibport *ibp =
+to_iport(qp->ibqp.device, qp->port_num);
+/*
+ * Call opfn_conn_reply() with capcode and
+ * remaining data as 0 to close out the
+ * current request
+ */
+opfn_conn_reply(qp, priv->opfn.curr);
+wqe = do_rc_completion(qp, wqe, ibp);
+qp->s_flags &= ~RVT_S_WAIT_ACK;
+} else {
+rvt_send_complete(qp, wqe,
+IB_WC_RETRY_EXC_ERR);
+rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+}
return;
} else { /* need to handle delayed completion */
return;
@@ -1356,6 +1390,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
u64 *vaddr = wqe->sg_list[0].vaddr;
*vaddr = val;
}
+if (wqe->wr.opcode == IB_WR_OPFN)
+opfn_conn_reply(qp, val);
+
if (qp->s_num_rd_atomic &&
(wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -1812,7 +1849,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
if (i)
prev = i - 1;
else
-prev = HFI1_MAX_RDMA_ATOMIC;
+prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
if (prev == qp->r_head_ack_queue) {
e = NULL;
break;
@@ -1936,7 +1973,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
unsigned next;

next = n + 1;
-if (next > HFI1_MAX_RDMA_ATOMIC)
+if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
qp->s_tail_ack_queue = next;
qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -2061,6 +2098,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
return;

fecn = process_ecn(qp, packet);
+opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));

/*
* Process responses (ACKs) before anything else. Note that the
@@ -2292,8 +2330,8 @@ send_last:
if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
-/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
-if (next > HFI1_MAX_RDMA_ATOMIC)
+/* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
if (unlikely(next == qp->s_tail_ack_queue)) {
@@ -2356,18 +2394,21 @@ send_last:
case OP(COMPARE_SWAP):
case OP(FETCH_ADD): {
-struct ib_atomic_eth *ateth;
+struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+u64 vaddr = get_ib_ateth_vaddr(ateth);
+bool opfn = opcode == OP(COMPARE_SWAP) &&
+vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
struct rvt_ack_entry *e;
-u64 vaddr;
atomic64_t *maddr;
u64 sdata;
u32 rkey;
u8 next;

-if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+!opfn))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
-if (next > HFI1_MAX_RDMA_ATOMIC)
+if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
if (unlikely(next == qp->s_tail_ack_queue)) {
@@ -2380,8 +2421,11 @@ send_last:
rvt_put_mr(e->rdma_sge.mr);
e->rdma_sge.mr = NULL;
}
-ateth = &ohdr->u.atomic_eth;
-vaddr = get_ib_ateth_vaddr(ateth);
+/* Process OPFN special virtual address */
+if (opfn) {
+opfn_conn_response(qp, e, ateth);
+goto ack;
+}
if (unlikely(vaddr & (sizeof(u64) - 1)))
goto nack_inv_unlck;
rkey = be32_to_cpu(ateth->rkey);
@@ -2400,6 +2444,7 @@ send_last:
sdata);
rvt_put_mr(qp->r_sge.sge.mr);
qp->r_sge.num_sge = 0;
+ack:
e->opcode = opcode;
e->sent = 0;
e->psn = psn;


@@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2)
{
-bth1 |= qp->remote_qpn;
ohdr->bth[0] = cpu_to_be32(bth0);
ohdr->bth[1] = cpu_to_be32(bth1);
ohdr->bth[2] = cpu_to_be32(bth2);
@@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
-u32 bth0, u32 bth2, int middle,
+u32 bth0, u32 bth1, u32 bth2,
+int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-u32 bth1 = 0;
u32 slid;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u8 l4 = OPA_16B_L4_IB_LOCAL;
@@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
*/
static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
-u32 bth0, u32 bth2, int middle,
+u32 bth0, u32 bth1, u32 bth2,
+int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
-u32 bth1 = 0;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u16 lrh0 = HFI1_LRH_BTH;
u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
@@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
-u32 bth0, u32 bth2, int middle,
+u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);

/* We support only two types - 9B and 16B for now */
@@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = {
};

void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
-u32 bth0, u32 bth2, int middle,
+u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
@@ -446,7 +445,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
priv->s_ahg->ahgidx = 0;

/* Make the appropriate header */
-hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
+hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
+ps);
}

/* when sending, force a reschedule every one of these periods */


@@ -7,6 +7,211 @@
#include "hfi.h"
#include "verbs.h"
#include "tid_rdma.h"
+#include "trace.h"
/*
* J_KEY for kernel contexts when TID RDMA is used.
* See generate_jkey() in hfi.h for more information.
*/
#define TID_RDMA_JKEY 32
#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
#define TID_OPFN_QP_CTXT_MASK 0xff
#define TID_OPFN_QP_CTXT_SHIFT 56
#define TID_OPFN_QP_KDETH_MASK 0xff
#define TID_OPFN_QP_KDETH_SHIFT 48
#define TID_OPFN_MAX_LEN_MASK 0x7ff
#define TID_OPFN_MAX_LEN_SHIFT 37
#define TID_OPFN_TIMEOUT_MASK 0x1f
#define TID_OPFN_TIMEOUT_SHIFT 32
#define TID_OPFN_RESERVED_MASK 0x3f
#define TID_OPFN_RESERVED_SHIFT 26
#define TID_OPFN_URG_MASK 0x1
#define TID_OPFN_URG_SHIFT 25
#define TID_OPFN_VER_MASK 0x7
#define TID_OPFN_VER_SHIFT 22
#define TID_OPFN_JKEY_MASK 0x3f
#define TID_OPFN_JKEY_SHIFT 16
#define TID_OPFN_MAX_READ_MASK 0x3f
#define TID_OPFN_MAX_READ_SHIFT 10
#define TID_OPFN_MAX_WRITE_MASK 0x3f
#define TID_OPFN_MAX_WRITE_SHIFT 4
/*
* OPFN TID layout
*
* 63 47 31 15
* NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
* 3210987654321098 7654321098765432 1098765432109876 5432109876543210
* N - the context Number
* K - the Kdeth_qp
* M - Max_len
* T - Timeout
* D - reserveD
* V - version
* U - Urg capable
* J - Jkey
* R - max_Read
* W - max_Write
* C - Capcode
*/
static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
return
(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
TID_OPFN_QP_CTXT_SHIFT) |
((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
TID_OPFN_QP_KDETH_SHIFT) |
(((u64)((p->max_len >> PAGE_SHIFT) - 1) &
TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
TID_OPFN_TIMEOUT_SHIFT) |
(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
TID_OPFN_MAX_READ_SHIFT) |
(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
TID_OPFN_MAX_WRITE_SHIFT);
}
static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
{
p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
TID_OPFN_MAX_WRITE_MASK;
p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
TID_OPFN_MAX_READ_MASK;
p->qp =
((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
<< 16) |
((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
}
void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
{
struct hfi1_qp_priv *priv = qp->priv;
p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
p->jkey = priv->rcd->jkey;
p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
p->timeout = qp->timeout;
p->urg = is_urg_masked(priv->rcd);
}
bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
{
struct hfi1_qp_priv *priv = qp->priv;
*data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
return true;
}
bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
{
struct hfi1_qp_priv *priv = qp->priv;
struct tid_rdma_params *remote, *old;
bool ret = true;
old = rcu_dereference_protected(priv->tid_rdma.remote,
lockdep_is_held(&priv->opfn.lock));
data &= ~0xfULL;
/*
* If data passed in is zero, return true so as not to continue the
* negotiation process
*/
if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
goto null;
/*
* If kzalloc fails, return false. This will result in:
* * at the requester a new OPFN request being generated to retry
* the negotiation
* * at the responder, 0 being returned to the requester so as to
* disable TID RDMA at both the requester and the responder
*/
remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
if (!remote) {
ret = false;
goto null;
}
tid_rdma_opfn_decode(remote, data);
priv->tid_timer_timeout_jiffies =
usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
1000UL) << 3) * 7);
trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
trace_hfi1_opfn_param(qp, 1, remote);
rcu_assign_pointer(priv->tid_rdma.remote, remote);
/*
* A TID RDMA READ request's segment size is not equal to
* remote->max_len only when the request's data length is smaller
* than remote->max_len. In that case, there will be only one segment.
* Therefore, when priv->pkts_ps is used to calculate req->cur_seg
* during retry, it will lead to req->cur_seg = 0, which is exactly
* what is expected.
*/
priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
goto free;
null:
RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
priv->timeout_shift = 0;
free:
if (old)
kfree_rcu(old, rcu_head);
return ret;
}
bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
{
bool ret;
ret = tid_rdma_conn_reply(qp, *data);
*data = 0;
/*
* If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
* TID RDMA could not be enabled. This will result in TID RDMA being
* disabled at the requester too.
*/
if (ret)
(void)tid_rdma_conn_req(qp, data);
return ret;
}
void tid_rdma_conn_error(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct tid_rdma_params *old;
old = rcu_dereference_protected(priv->tid_rdma.remote,
lockdep_is_held(&priv->opfn.lock));
RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
if (old)
kfree_rcu(old, rcu_head);
}
/* This is called at context initialization time */
int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
{
if (reinit)
return 0;
BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
rcd->jkey = TID_RDMA_JKEY;
hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
return 0;
}
/**
* qp_to_rcd - determine the receive context used by a qp
@@ -44,5 +249,16 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qpriv->rcd = qp_to_rcd(rdi, qp);

+spin_lock_init(&qpriv->opfn.lock);
+INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+
return 0;
}
+
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+struct hfi1_qp_priv *priv = qp->priv;
+
+if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA))
+cancel_work_sync(&priv->opfn.opfn_work);
+}


@@ -6,8 +6,35 @@
#ifndef HFI1_TID_RDMA_H
#define HFI1_TID_RDMA_H

+#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
+
+struct tid_rdma_params {
+struct rcu_head rcu_head;
+u32 qp;
+u32 max_len;
+u16 jkey;
+u8 max_read;
+u8 max_write;
+u8 timeout;
+u8 urg;
+u8 version;
+};
+
+struct tid_rdma_qp_params {
+struct tid_rdma_params local;
+struct tid_rdma_params __rcu *remote;
+};
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
+void tid_rdma_conn_error(struct rvt_qp *qp);
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
+
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+
int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
struct ib_qp_init_attr *init_attr);
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);

#endif /* HFI1_TID_RDMA_H */


@@ -63,3 +63,4 @@ __print_symbolic(etype, \
#include "trace_tx.h"
#include "trace_mmu.h"
#include "trace_iowait.h"
+#include "trace_tid.h"


@@ -1,5 +1,5 @@
/*
-* Copyright(c) 2015 - 2017 Intel Corporation.
+* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -128,111 +128,6 @@ TRACE_EVENT(hfi1_receive_interrupt,
)
);
DECLARE_EVENT_CLASS(
hfi1_exp_tid_reg_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
u32 npages, unsigned long va, unsigned long pa,
dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
TP_STRUCT__entry(
__field(unsigned int, ctxt)
__field(u16, subctxt)
__field(u32, rarr)
__field(u32, npages)
__field(unsigned long, va)
__field(unsigned long, pa)
__field(dma_addr_t, dma)
),
TP_fast_assign(
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->rarr = rarr;
__entry->npages = npages;
__entry->va = va;
__entry->pa = pa;
__entry->dma = dma;
),
TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
__entry->ctxt,
__entry->subctxt,
__entry->rarr,
__entry->npages,
__entry->pa,
__entry->va,
__entry->dma
)
);
DEFINE_EVENT(
hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
DEFINE_EVENT(
hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
TRACE_EVENT(
hfi1_put_tid,
TP_PROTO(struct hfi1_devdata *dd,
u32 index, u32 type, unsigned long pa, u16 order),
TP_ARGS(dd, index, type, pa, order),
TP_STRUCT__entry(
DD_DEV_ENTRY(dd)
__field(unsigned long, pa);
__field(u32, index);
__field(u32, type);
__field(u16, order);
),
TP_fast_assign(
DD_DEV_ASSIGN(dd);
__entry->pa = pa;
__entry->index = index;
__entry->type = type;
__entry->order = order;
),
TP_printk("[%s] type %s pa %lx index %u order %u",
__get_str(dev),
show_tidtype(__entry->type),
__entry->pa,
__entry->index,
__entry->order
)
);
TRACE_EVENT(hfi1_exp_tid_inval,
TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
u32 npages, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
TP_STRUCT__entry(
__field(unsigned int, ctxt)
__field(u16, subctxt)
__field(unsigned long, va)
__field(u32, rarr)
__field(u32, npages)
__field(dma_addr_t, dma)
),
TP_fast_assign(
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->va = va;
__entry->rarr = rarr;
__entry->npages = npages;
__entry->dma = dma;
),
TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
__entry->ctxt,
__entry->subctxt,
__entry->rarr,
__entry->npages,
__entry->va,
__entry->dma
)
);
TRACE_EVENT(hfi1_mmu_invalidate,
TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
unsigned long start, unsigned long end),


@@ -0,0 +1,332 @@
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/*
* Copyright(c) 2018 Intel Corporation.
*
*/
#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ)
#define __HFI1_TRACE_TID_H
#include <linux/tracepoint.h>
#include <linux/trace_seq.h>
#include "hfi.h"
#define tidtype_name(type) { PT_##type, #type }
#define show_tidtype(type) \
__print_symbolic(type, \
tidtype_name(EXPECTED), \
tidtype_name(EAGER), \
tidtype_name(INVALID)) \
#undef TRACE_SYSTEM
#define TRACE_SYSTEM hfi1_tid
#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \
"max write %u, max length %u, jkey 0x%x timeout %u " \
"urg %u"
DECLARE_EVENT_CLASS(/* class */
hfi1_exp_tid_reg_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
TP_STRUCT__entry(/* entry */
__field(unsigned int, ctxt)
__field(u16, subctxt)
__field(u32, rarr)
__field(u32, npages)
__field(unsigned long, va)
__field(unsigned long, pa)
__field(dma_addr_t, dma)
),
TP_fast_assign(/* assign */
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->rarr = rarr;
__entry->npages = npages;
__entry->va = va;
__entry->pa = pa;
__entry->dma = dma;
),
TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
__entry->ctxt,
__entry->subctxt,
__entry->rarr,
__entry->npages,
__entry->pa,
__entry->va,
__entry->dma
)
);
DEFINE_EVENT(/* exp_tid_unreg */
hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
);
DEFINE_EVENT(/* exp_tid_reg */
hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
unsigned long va, unsigned long pa, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
);
TRACE_EVENT(/* put_tid */
hfi1_put_tid,
TP_PROTO(struct hfi1_devdata *dd,
u32 index, u32 type, unsigned long pa, u16 order),
TP_ARGS(dd, index, type, pa, order),
TP_STRUCT__entry(/* entry */
DD_DEV_ENTRY(dd)
__field(unsigned long, pa);
__field(u32, index);
__field(u32, type);
__field(u16, order);
),
TP_fast_assign(/* assign */
DD_DEV_ASSIGN(dd);
__entry->pa = pa;
__entry->index = index;
__entry->type = type;
__entry->order = order;
),
TP_printk("[%s] type %s pa %lx index %u order %u",
__get_str(dev),
show_tidtype(__entry->type),
__entry->pa,
__entry->index,
__entry->order
)
);
TRACE_EVENT(/* exp_tid_inval */
hfi1_exp_tid_inval,
TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
u32 npages, dma_addr_t dma),
TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
TP_STRUCT__entry(/* entry */
__field(unsigned int, ctxt)
__field(u16, subctxt)
__field(unsigned long, va)
__field(u32, rarr)
__field(u32, npages)
__field(dma_addr_t, dma)
),
TP_fast_assign(/* assign */
__entry->ctxt = ctxt;
__entry->subctxt = subctxt;
__entry->va = va;
__entry->rarr = rarr;
__entry->npages = npages;
__entry->dma = dma;
),
TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
__entry->ctxt,
__entry->subctxt,
__entry->rarr,
__entry->npages,
__entry->va,
__entry->dma
)
);
DECLARE_EVENT_CLASS(/* opfn_state */
hfi1_opfn_state_template,
TP_PROTO(struct rvt_qp *qp),
TP_ARGS(qp),
TP_STRUCT__entry(/* entry */
DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
__field(u32, qpn)
__field(u16, requested)
__field(u16, completed)
__field(u8, curr)
),
TP_fast_assign(/* assign */
struct hfi1_qp_priv *priv = qp->priv;
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
__entry->qpn = qp->ibqp.qp_num;
__entry->requested = priv->opfn.requested;
__entry->completed = priv->opfn.completed;
__entry->curr = priv->opfn.curr;
),
TP_printk(/* print */
"[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x",
__get_str(dev),
__entry->qpn,
__entry->requested,
__entry->completed,
__entry->curr
)
);
DEFINE_EVENT(/* event */
hfi1_opfn_state_template, hfi1_opfn_state_conn_request,
TP_PROTO(struct rvt_qp *qp),
TP_ARGS(qp)
);
DEFINE_EVENT(/* event */
hfi1_opfn_state_template, hfi1_opfn_state_sched_conn_request,
TP_PROTO(struct rvt_qp *qp),
TP_ARGS(qp)
);
DEFINE_EVENT(/* event */
hfi1_opfn_state_template, hfi1_opfn_state_conn_response,
TP_PROTO(struct rvt_qp *qp),
TP_ARGS(qp)
);
DEFINE_EVENT(/* event */
hfi1_opfn_state_template, hfi1_opfn_state_conn_reply,
TP_PROTO(struct rvt_qp *qp),
TP_ARGS(qp)
);
DEFINE_EVENT(/* event */
hfi1_opfn_state_template, hfi1_opfn_state_conn_error,
TP_PROTO(struct rvt_qp *qp),
TP_ARGS(qp)
);
DECLARE_EVENT_CLASS(/* opfn_data */
hfi1_opfn_data_template,
TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
TP_ARGS(qp, capcode, data),
TP_STRUCT__entry(/* entry */
DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
__field(u32, qpn)
__field(u32, state)
__field(u8, capcode)
__field(u64, data)
),
TP_fast_assign(/* assign */
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
__entry->qpn = qp->ibqp.qp_num;
__entry->state = qp->state;
__entry->capcode = capcode;
__entry->data = data;
),
TP_printk(/* printk */
"[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx",
__get_str(dev),
__entry->qpn,
__entry->state,
__entry->capcode,
__entry->data
)
);
DEFINE_EVENT(/* event */
hfi1_opfn_data_template, hfi1_opfn_data_conn_request,
TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
TP_ARGS(qp, capcode, data)
);
DEFINE_EVENT(/* event */
hfi1_opfn_data_template, hfi1_opfn_data_conn_response,
TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
TP_ARGS(qp, capcode, data)
);
DEFINE_EVENT(/* event */
hfi1_opfn_data_template, hfi1_opfn_data_conn_reply,
TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
TP_ARGS(qp, capcode, data)
);
DECLARE_EVENT_CLASS(/* opfn_param */
hfi1_opfn_param_template,
TP_PROTO(struct rvt_qp *qp, char remote,
struct tid_rdma_params *param),
TP_ARGS(qp, remote, param),
TP_STRUCT__entry(/* entry */
DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
__field(u32, qpn)
__field(char, remote)
__field(u32, param_qp)
__field(u32, max_len)
__field(u16, jkey)
__field(u8, max_read)
__field(u8, max_write)
__field(u8, timeout)
__field(u8, urg)
),
TP_fast_assign(/* assign */
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
__entry->qpn = qp->ibqp.qp_num;
__entry->remote = remote;
__entry->param_qp = param->qp;
__entry->max_len = param->max_len;
__entry->jkey = param->jkey;
__entry->max_read = param->max_read;
__entry->max_write = param->max_write;
__entry->timeout = param->timeout;
__entry->urg = param->urg;
),
TP_printk(/* print */
OPFN_PARAM_PRN,
__get_str(dev),
__entry->qpn,
__entry->remote ? "remote" : "local",
__entry->param_qp,
__entry->max_read,
__entry->max_write,
__entry->max_len,
__entry->jkey,
__entry->timeout,
__entry->urg
)
);
DEFINE_EVENT(/* event */
hfi1_opfn_param_template, hfi1_opfn_param,
TP_PROTO(struct rvt_qp *qp, char remote,
struct tid_rdma_params *param),
TP_ARGS(qp, remote, param)
);
DECLARE_EVENT_CLASS(/* msg */
hfi1_msg_template,
TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
TP_ARGS(qp, msg, more),
TP_STRUCT__entry(/* entry */
__field(u32, qpn)
__string(msg, msg)
__field(u64, more)
),
TP_fast_assign(/* assign */
__entry->qpn = qp ? qp->ibqp.qp_num : 0;
__assign_str(msg, msg);
__entry->more = more;
),
TP_printk(/* print */
"qpn 0x%x %s 0x%llx",
__entry->qpn,
__get_str(msg),
__entry->more
)
);
DEFINE_EVENT(/* event */
hfi1_msg_template, hfi1_msg_opfn_conn_request,
TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
TP_ARGS(qp, msg, more)
);
DEFINE_EVENT(/* event */
hfi1_msg_template, hfi1_msg_opfn_conn_error,
TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
TP_ARGS(qp, msg, more)
);
#endif /* __HFI1_TRACE_TID_H */
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace_tid
#include <trace/define_trace.h>


@@ -271,7 +271,8 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
ps->s_txreq->ss = &qp->s_sge;
ps->s_txreq->s_cur_size = len;
hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
-mask_psn(qp->s_psn++), middle, ps);
+qp->remote_qpn, mask_psn(qp->s_psn++),
+middle, ps);
return 1;

done_free_tx:


@@ -1735,6 +1735,8 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
+dd->verbs_dev.rdi.dparms.reserved_operations = 1;
+dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1;

/* post send table */
dd->verbs_dev.rdi.post_parms = hfi1_post_parms;


@@ -72,6 +72,7 @@ struct hfi1_packet;
#include "iowait.h"
#include "tid_rdma.h"
+#include "opfn.h"

#define HFI1_MAX_RDMA_ATOMIC 16

@@ -160,8 +161,13 @@ struct hfi1_qp_priv {
struct hfi1_ctxtdata *rcd; /* QP's receive context */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
+struct hfi1_opfn_data opfn;
+struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
+unsigned long tid_timer_timeout_jiffies;
+u16 pkts_ps; /* packets per segment */
+u8 timeout_shift; /* account for number of packets per segment */
};

/*
@@ -356,7 +362,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
const struct ib_global_route *grh, u32 hwords, u32 nwords);

void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
-u32 bth0, u32 bth2, int middle,
+u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);

void _hfi1_do_send(struct work_struct *work);


@@ -182,6 +182,7 @@ struct rvt_driver_params {
u32 max_mad_size;
u8 qos_shift;
u8 max_rdma_atomic;
+u8 extra_rdma_atomic;
u8 reserved_operations;
};

@@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
*/
static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
{
-return rdi->dparms.max_rdma_atomic + 1;
+return rdi->dparms.max_rdma_atomic +
+rdi->dparms.extra_rdma_atomic + 1;
+}
+
+static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi)
+{
+return rdi->dparms.max_rdma_atomic +
+rdi->dparms.extra_rdma_atomic;
}

/*