Merge HFI1 updates into k.o/for-next

Based on rdma.git for-rc for dependencies. From Dennis Dalessandro: ==================== Here are some code improvement patches and fixes for less serious bugs to TID RDMA than we sent for RC. ==================== * HFI1 updates: IB/hfi1: Implement CCA for TID RDMA protocol IB/hfi1: Remove WARN_ON when freeing expected receive groups IB/hfi1: Unify the software PSN check for TID RDMA READ/WRITE IB/hfi1: Add a function to read next expected psn from hardware flow IB/hfi1: Delay the release of destination mr for TID RDMA WRITE DATA Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2019-04-03 15:28:05 -03:00 · 2019-04-03 15:28:05 -03:00 · 1c726c4421
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@ -13232,7 +13232,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	int total_contexts;
 	int ret;
 	unsigned ngroups;
-	int qos_rmt_count;
+	int rmt_count;
 	int user_rmt_reduced;
 	u32 n_usr_ctxts;
 	u32 send_contexts = chip_send_contexts(dd);
@ -13294,10 +13294,23 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 		n_usr_ctxts = rcv_contexts - total_contexts;
 	}

-	/* each user context requires an entry in the RMT */
-	qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
-	if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
-		user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+	/*
+	 * The RMT entries are currently allocated as shown below:
+	 * 1. QOS (0 to 128 entries);
+	 * 2. FECN (num_kernel_context - 1 + num_user_contexts +
+	 *    num_vnic_contexts);
+	 * 3. VNIC (num_vnic_contexts).
+	 * It should be noted that FECN oversubscribe num_vnic_contexts
+	 * entries of RMT because both VNIC and PSM could allocate any receive
+	 * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
+	 * and PSM FECN must reserve an RMT entry for each possible PSM receive
+	 * context.
+	 */
+	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		rmt_count += num_kernel_contexts - 1;
+	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
+		user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
 		dd_dev_err(dd,
 			   "RMT size is reducing the number of user receive contexts from %u to %d\n",
 			   n_usr_ctxts,
@ -14278,35 +14291,43 @@ bail:
 	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }

-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-				    struct rsm_map_table *rmt)
+static void init_fecn_handling(struct hfi1_devdata *dd,
+			       struct rsm_map_table *rmt)
 {
 	struct rsm_rule_data rrd;
 	u64 reg;
-	int i, idx, regoff, regidx;
+	int i, idx, regoff, regidx, start;
 	u8 offset;
+	u32 total_cnt;
+
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		/* Exclude context 0 */
+		start = 1;
+	else
+		start = dd->first_dyn_alloc_ctxt;
+
+	total_cnt = dd->num_rcv_contexts - start;

 	/* there needs to be enough room in the map table */
-	if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
-		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+	if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
+		dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n");
 		return;
 	}

 	/*
 	 * RSM will extract the destination context as an index into the
 	 * map table.  The destination contexts are a sequential block
-	 * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
+	 * in the range start...num_rcv_contexts-1 (inclusive).
 	 * Map entries are accessed as offset + extracted value.  Adjust
 	 * the added offset so this sequence can be placed anywhere in
 	 * the table - as long as the entries themselves do not wrap.
 	 * There are only enough bits in offset for the table size, so
 	 * start with that to allow for a "negative" offset.
 	 */
-	offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-						(int)dd->first_dyn_alloc_ctxt);
+	offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);

-	for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
-				i < dd->num_rcv_contexts; i++, idx++) {
+	for (i = start, idx = rmt->used; i < dd->num_rcv_contexts;
+	     i++, idx++) {
 		/* replace with identity mapping */
 		regoff = (idx % 8) * 8;
 		regidx = idx / 8;
@ -14341,7 +14362,7 @@ static void init_user_fecn_handling(struct hfi1_devdata *dd,
 	/* add rule 1 */
 	add_rsm_rule(dd, RSM_INS_FECN, &rrd);

-	rmt->used += dd->num_user_contexts;
+	rmt->used += total_cnt;
 }

 /* Initialize RSM for VNIC */
@ -14428,7 +14449,7 @@ static void init_rxe(struct hfi1_devdata *dd)
 	rmt = alloc_rsm_map_table(dd);
 	/* set up QOS, including the QPN map table */
 	init_qos(dd, rmt);
-	init_user_fecn_handling(dd, rmt);
+	init_fecn_handling(dd, rmt);
 	complete_rsm_map_table(dd, rmt);
 	/* record number of used rsm map entries for vnic */
 	dd->vnic.rmt_start = rmt->used;
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@ -514,7 +514,9 @@ bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
 	 */
 	do_cnp = prescan ||
 		(opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
-		 opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE);
+		 opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) ||
+		opcode == TID_OP(READ_RESP) ||
+		opcode == TID_OP(ACK);

 	/* Call appropriate CNP handler */
 	if (!ignore_fecn && do_cnp && fecn)
--- a/drivers/infiniband/hw/hfi1/exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.c
@ -112,9 +112,6 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
 */
 void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
 {
-	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list));
-	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list));
-
 	kfree(rcd->groups);
 	rcd->groups = NULL;
 	hfi1_exp_tid_group_init(rcd);
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@ -900,7 +900,9 @@ void notify_error_qp(struct rvt_qp *qp)
 		if (!list_empty(&priv->s_iowait.list) &&
 		    !(qp->s_flags & RVT_S_BUSY) &&
 		    !(priv->s_flags & RVT_S_BUSY)) {
-			qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+			qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
 			list_del_init(&priv->s_iowait.list);
 			priv->s_iowait.lock = NULL;
 			rvt_put_qp(qp);
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@ -140,10 +140,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 	case OP(RDMA_READ_RESPONSE_LAST):
 	case OP(RDMA_READ_RESPONSE_ONLY):
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		/* FALLTHROUGH */
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
@ -343,7 +340,8 @@ write_resp:
 			break;

 		e->sent = 1;
-		qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+		/* Do not free e->rdma_sge until all data are received */
+		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
 		break;

 	case TID_OP(READ_RESP):
@ -2643,10 +2641,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
 		len = be32_to_cpu(reth->length);
 		if (unlikely(offset + len != e->rdma_sge.sge_length))
 			goto unlock_done;
-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		if (len != 0) {
 			u32 rkey = be32_to_cpu(reth->rkey);
 			u64 vaddr = get_ib_reth_vaddr(reth);
@ -3088,10 +3083,7 @@ send_last:
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		reth = &ohdr->u.rc.reth;
 		len = be32_to_cpu(reth->length);
 		if (len) {
@ -3166,10 +3158,7 @@ send_last:
 			update_ack_queue(qp, next);
 		}
 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
-		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);
 		/* Process OPFN special virtual address */
 		if (opfn) {
 			opfn_conn_response(qp, e, ateth);
--- a/drivers/infiniband/hw/hfi1/rc.h
+++ b/drivers/infiniband/hw/hfi1/rc.h
@ -41,6 +41,14 @@ static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 	return rvt_restart_sge(ss, wqe, len);
 }

+static inline void release_rdma_sge_mr(struct rvt_ack_entry *e)
+{
+	if (e->rdma_sge.mr) {
+		rvt_put_mr(e->rdma_sge.mr);
+		e->rdma_sge.mr = NULL;
+	}
+}
+
 struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
 				      u8 *prev_ack, bool *scheduled);
 int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@ -67,8 +67,6 @@ static u32 mask_generation(u32 a)
 #define TID_RDMA_DESTQP_FLOW_SHIFT      11
 #define TID_RDMA_DESTQP_FLOW_MASK       0x1f

-#define TID_FLOW_SW_PSN BIT(0)
-
 #define TID_OPFN_QP_CTXT_MASK 0xff
 #define TID_OPFN_QP_CTXT_SHIFT 56
 #define TID_OPFN_QP_KDETH_MASK 0xff
@ -128,6 +126,15 @@ static int make_tid_rdma_ack(struct rvt_qp *qp,
 			     struct ib_other_headers *ohdr,
 			     struct hfi1_pkt_state *ps);
 static void hfi1_do_tid_send(struct rvt_qp *qp);
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+			     struct ib_other_headers *ohdr,
+			     struct rvt_qp *qp, u32 psn, int diff, bool fecn);
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+				   struct hfi1_qp_priv *priv,
+				   struct hfi1_ctxtdata *rcd,
+				   struct tid_rdma_flow *flow,
+				   bool fecn);

 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@ -776,7 +783,6 @@ int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
 		rcd->flows[fs->index].generation = fs->generation;
 	fs->generation = kern_setup_hw_flow(rcd, fs->index);
 	fs->psn = 0;
-	fs->flags = 0;
 	dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
 	/* get head before dropping lock */
 	fqp = first_qp(rcd, &rcd->flow_queue);
@ -1807,6 +1813,7 @@ sync_check:
 			goto done;

 		hfi1_kern_clear_hw_flow(req->rcd, qp);
+		qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
 		req->state = TID_REQUEST_ACTIVE;
 	}

@ -2036,10 +2043,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
 		if (psn != e->psn || len != req->total_len)
 			goto unlock;

-		if (e->rdma_sge.mr) {
-			rvt_put_mr(e->rdma_sge.mr);
-			e->rdma_sge.mr = NULL;
-		}
+		release_rdma_sge_mr(e);

 		rkey = be32_to_cpu(reth->rkey);
 		vaddr = get_ib_reth_vaddr(reth);
@ -2238,7 +2242,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 	struct ib_reth *reth;
 	struct hfi1_qp_priv *qpriv = qp->priv;
 	u32 bth0, psn, len, rkey;
-	bool is_fecn;
+	bool fecn;
 	u8 next;
 	u64 vaddr;
 	int diff;
@ -2248,7 +2252,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;

-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);

@ -2267,9 +2271,8 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)

 	diff = delta_psn(psn, qp->r_psn);
 	if (unlikely(diff)) {
-		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
-			return;
-		goto send_ack;
+		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+		return;
 	}

 	/* We've verified the request, insert it into the ack queue. */
@ -2285,10 +2288,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 		update_ack_queue(qp, next);
 	}
 	e = &qp->s_ack_queue[qp->r_head_ack_queue];
-	if (e->rdma_sge.mr) {
-		rvt_put_mr(e->rdma_sge.mr);
-		e->rdma_sge.mr = NULL;
-	}
+	release_rdma_sge_mr(e);

 	rkey = be32_to_cpu(reth->rkey);
 	qp->r_len = len;
@ -2324,11 +2324,11 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)

 	/* Schedule the send tasklet. */
 	qp->s_flags |= RVT_S_RESP_PENDING;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	hfi1_schedule_send(qp);

 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		goto send_ack;
 	return;

 nack_inv_unlock:
@ -2345,8 +2345,6 @@ nack_acc:
 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
 	qp->r_ack_psn = qp->r_psn;
-send_ack:
-	hfi1_send_rc_ack(packet, is_fecn);
 }

 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
@ -2463,12 +2461,12 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 	struct tid_rdma_request *req;
 	struct tid_rdma_flow *flow;
 	u32 opcode, aeth;
-	bool is_fecn;
+	bool fecn;
 	unsigned long flags;
 	u32 kpsn, ipsn;

 	trace_hfi1_sender_rcv_tid_read_resp(qp);
-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
@ -2481,8 +2479,43 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)

 	flow = &req->flows[req->clear_tail];
 	/* When header suppression is disabled */
-	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+	if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
+		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
+		if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
+			goto ack_done;
+		flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+		/*
+		 * Copy the payload to destination buffer if this packet is
+		 * delivered as an eager packet due to RSM rule and FECN.
+		 * The RSM rule selects FECN bit in BTH and SH bit in
+		 * KDETH header and therefore will not match the last
+		 * packet of each segment that has SH bit cleared.
+		 */
+		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+			struct rvt_sge_state ss;
+			u32 len;
+			u32 tlen = packet->tlen;
+			u16 hdrsize = packet->hlen;
+			u8 pad = packet->pad;
+			u8 extra_bytes = pad + packet->extra_byte +
+				(SIZE_OF_CRC << 2);
+			u32 pmtu = qp->pmtu;
+
+			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+				goto ack_op_err;
+			len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
+			if (unlikely(len < pmtu))
+				goto ack_op_err;
+			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+				     false);
+			/* Raise the sw sequence check flag for next packet */
+			priv->s_flags |= HFI1_R_TID_SW_PSN;
+		}
+
 		goto ack_done;
+	}
+	flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
 	req->ack_pending--;
 	priv->pending_tid_r_segs--;
 	qp->s_num_rd_atomic--;
@ -2524,6 +2557,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 	     req->comp_seg == req->cur_seg) ||
 	    priv->tid_r_comp == priv->tid_r_reqs) {
 		hfi1_kern_clear_hw_flow(priv->rcd, qp);
+		priv->s_flags &= ~HFI1_R_TID_SW_PSN;
 		if (req->state == TID_REQUEST_SYNC)
 			req->state = TID_REQUEST_ACTIVE;
 	}
@ -2545,8 +2579,6 @@ ack_op_err:

 ack_done:
 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		hfi1_send_rc_ack(packet, is_fecn);
 }

 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
@ -2773,9 +2805,9 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
 				return ret;
 			}
-			if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+			if (priv->s_flags & HFI1_R_TID_SW_PSN) {
 				diff = cmp_psn(psn,
-					       priv->flow_state.r_next_psn);
+					       flow->flow_state.r_next_psn);
 				if (diff > 0) {
 					if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
 						restart_tid_rdma_read_req(rcd,
@ -2811,22 +2843,15 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 						qp->r_flags &=
 							~RVT_R_RDMAR_SEQ;
 				}
-				priv->flow_state.r_next_psn++;
+				flow->flow_state.r_next_psn =
+					mask_psn(psn + 1);
 			} else {
-				u64 reg;
 				u32 last_psn;

-				/*
-				 * The only sane way to get the amount of
-				 * progress is to read the HW flow state.
-				 */
-				reg = read_uctxt_csr(dd, rcd->ctxt,
-						     RCV_TID_FLOW_TABLE +
-						     (8 * flow->idx));
-				last_psn = mask_psn(reg);
-
-				priv->flow_state.r_next_psn = last_psn;
-				priv->flow_state.flags |= TID_FLOW_SW_PSN;
+				last_psn = read_r_next_psn(dd, rcd->ctxt,
+							   flow->idx);
+				flow->flow_state.r_next_psn = last_psn;
+				priv->s_flags |= HFI1_R_TID_SW_PSN;
 				/*
 				 * If no request has been restarted yet,
 				 * restart the current one.
@ -2891,6 +2916,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 	struct rvt_ack_entry *e;
 	struct tid_rdma_request *req;
 	struct tid_rdma_flow *flow;
+	int diff = 0;

 	trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
 					   packet->rhf);
@ -2974,17 +3000,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 		switch (rte) {
 		case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
 			if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
-				u64 reg;
-
 				qpriv->s_flags |= HFI1_R_TID_SW_PSN;
-				/*
-				 * The only sane way to get the amount of
-				 * progress is to read the HW flow state.
-				 */
-				reg = read_uctxt_csr(dd, rcd->ctxt,
-						     RCV_TID_FLOW_TABLE +
-						     (8 * flow->idx));
-				flow->flow_state.r_next_psn = mask_psn(reg);
+				flow->flow_state.r_next_psn =
+					read_r_next_psn(dd, rcd->ctxt,
+							flow->idx);
 				qpriv->r_next_psn_kdeth =
 					flow->flow_state.r_next_psn;
 				goto nak_psn;
@ -2997,10 +3016,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				 * mismatch could be due to packets that were
 				 * already in flight.
 				 */
-				if (psn != flow->flow_state.r_next_psn) {
-					psn = flow->flow_state.r_next_psn;
+				diff = cmp_psn(psn,
+					       flow->flow_state.r_next_psn);
+				if (diff > 0)
 					goto nak_psn;
-				}
+				else if (diff < 0)
+					break;

 				qpriv->s_nak_state = 0;
 				/*
@ -3011,8 +3032,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
 				if (psn == full_flow_psn(flow,
 							 flow->flow_state.lpsn))
 					ret = false;
+				flow->flow_state.r_next_psn =
+					mask_psn(psn + 1);
 				qpriv->r_next_psn_kdeth =
-					++flow->flow_state.r_next_psn;
+					flow->flow_state.r_next_psn;
 			}
 			break;

@ -3517,8 +3540,10 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
 		if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
 			/* If all data has been received, clear the flow */
 			if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
-			    !qpriv->alloc_w_segs)
+			    !qpriv->alloc_w_segs) {
 				hfi1_kern_clear_hw_flow(rcd, qp);
+				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+			}
 			break;
 		}

@ -3544,8 +3569,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
 		if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
 			hfi1_kern_clear_hw_flow(rcd, qp);
 			qpriv->sync_pt = false;
-			if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
-				qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+			qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
 		}

 		/* Allocate flow if we don't have one */
@ -3687,7 +3711,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 	struct hfi1_qp_priv *qpriv = qp->priv;
 	struct tid_rdma_request *req;
 	u32 bth0, psn, len, rkey, num_segs;
-	bool is_fecn;
+	bool fecn;
 	u8 next;
 	u64 vaddr;
 	int diff;
@ -3696,7 +3720,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;

-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	trace_hfi1_rsp_rcv_tid_write_req(qp, psn);

@ -3713,9 +3737,8 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 	num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
 	diff = delta_psn(psn, qp->r_psn);
 	if (unlikely(diff)) {
-		if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
-			return;
-		goto send_ack;
+		tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+		return;
 	}

 	/*
@ -3751,10 +3774,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
 		goto update_head;
 	}

-	if (e->rdma_sge.mr) {
-		rvt_put_mr(e->rdma_sge.mr);
-		e->rdma_sge.mr = NULL;
-	}
+	release_rdma_sge_mr(e);

 	/* The length needs to be in multiples of PAGE_SIZE */
 	if (!len || len & ~PAGE_MASK)
@ -3834,11 +3854,11 @@ update_head:

 	/* Schedule the send tasklet. */
 	qp->s_flags |= RVT_S_RESP_PENDING;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	hfi1_schedule_send(qp);

 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		goto send_ack;
 	return;

 nack_inv_unlock:
@ -3855,8 +3875,6 @@ nack_acc:
 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
 	qp->r_ack_psn = qp->r_psn;
-send_ack:
-	hfi1_send_rc_ack(packet, is_fecn);
 }

 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
@ -4073,10 +4091,10 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
 	struct tid_rdma_flow *flow;
 	enum ib_wc_status status;
 	u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
-	bool is_fecn;
+	bool fecn;
 	unsigned long flags;

-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
@ -4216,7 +4234,6 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
 		qpriv->s_tid_cur = i;
 	}
 	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
-
 	hfi1_schedule_tid_send(qp);
 	goto ack_done;

@ -4225,9 +4242,9 @@ ack_op_err:
 ack_err:
 	rvt_error_qp(qp, status);
 ack_done:
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
-	if (is_fecn)
-		hfi1_send_rc_ack(packet, is_fecn);
 }

 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
@ -4307,7 +4324,9 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 	unsigned long flags;
 	u32 psn, next;
 	u8 opcode;
+	bool fecn;

+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;

@ -4320,9 +4339,53 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 	req = ack_to_tid_req(e);
 	flow = &req->flows[req->clear_tail];
 	if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+		update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
 		if (cmp_psn(psn, flow->flow_state.r_next_psn))
 			goto send_nak;
-		flow->flow_state.r_next_psn++;
+
+		flow->flow_state.r_next_psn = mask_psn(psn + 1);
+		/*
+		 * Copy the payload to destination buffer if this packet is
+		 * delivered as an eager packet due to RSM rule and FECN.
+		 * The RSM rule selects FECN bit in BTH and SH bit in
+		 * KDETH header and therefore will not match the last
+		 * packet of each segment that has SH bit cleared.
+		 */
+		if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+			struct rvt_sge_state ss;
+			u32 len;
+			u32 tlen = packet->tlen;
+			u16 hdrsize = packet->hlen;
+			u8 pad = packet->pad;
+			u8 extra_bytes = pad + packet->extra_byte +
+				(SIZE_OF_CRC << 2);
+			u32 pmtu = qp->pmtu;
+
+			if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+				goto send_nak;
+			len = req->comp_seg * req->seg_len;
+			len += delta_psn(psn,
+				full_flow_psn(flow, flow->flow_state.spsn)) *
+				pmtu;
+			if (unlikely(req->total_len - len < pmtu))
+				goto send_nak;
+
+			/*
+			 * The e->rdma_sge field is set when TID RDMA WRITE REQ
+			 * is first received and is never modified thereafter.
+			 */
+			ss.sge = e->rdma_sge;
+			ss.sg_list = NULL;
+			ss.num_sge = 1;
+			ss.total_len = req->total_len;
+			rvt_skip_sge(&ss, len, false);
+			rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+				     false);
+			/* Raise the sw sequence check flag for next packet */
+			priv->r_next_psn_kdeth = mask_psn(psn + 1);
+			priv->s_flags |= HFI1_R_TID_SW_PSN;
+		}
 		goto exit;
 	}
 	flow->flow_state.r_next_psn = mask_psn(psn + 1);
@ -4347,6 +4410,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
 		priv->r_tid_ack = priv->r_tid_tail;

 	if (opcode == TID_OP(WRITE_DATA_LAST)) {
+		release_rdma_sge_mr(e);
 		for (next = priv->r_tid_tail + 1; ; next++) {
 			if (next > rvt_size_atomic(&dev->rdi))
 				next = 0;
@ -4386,6 +4450,8 @@ done:
 	hfi1_schedule_tid_send(qp);
 exit:
 	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 	return;

@ -4487,12 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
 	struct tid_rdma_request *req;
 	struct tid_rdma_flow *flow;
 	u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
-	bool is_fecn;
 	unsigned long flags;
 	u16 fidx;

 	trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
-	is_fecn = process_ecn(qp, packet);
+	process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 	aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
 	req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
@ -4846,10 +4911,10 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
 	struct tid_rdma_flow *flow;
 	struct tid_flow_state *fs = &qpriv->flow_state;
 	u32 psn, generation, idx, gen_next;
-	bool is_fecn;
+	bool fecn;
 	unsigned long flags;

-	is_fecn = process_ecn(qp, packet);
+	fecn = process_ecn(qp, packet);
 	psn = mask_psn(be32_to_cpu(ohdr->bth[2]));

 	generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
@ -4940,6 +5005,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
 	qpriv->s_flags |= RVT_S_ACK_PENDING;
 	hfi1_schedule_tid_send(qp);
 bail:
+	if (fecn)
+		qp->s_flags |= RVT_S_ECN;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 }

@ -5464,3 +5531,48 @@ bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
 	}
 	return false;
 }
+
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
+{
+	u64 reg;
+
+	/*
+	 * The only sane way to get the amount of
+	 * progress is to read the HW flow state.
+	 */
+	reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
+	return mask_psn(reg);
+}
+
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+			     struct ib_other_headers *ohdr,
+			     struct rvt_qp *qp, u32 psn, int diff, bool fecn)
+{
+	unsigned long flags;
+
+	tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
+	if (fecn) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		qp->s_flags |= RVT_S_ECN;
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+}
+
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+				   struct hfi1_qp_priv *priv,
+				   struct hfi1_ctxtdata *rcd,
+				   struct tid_rdma_flow *flow,
+				   bool fecn)
+{
+	/*
+	 * If a start/middle packet is delivered here due to
+	 * RSM rule and FECN, we need to update the r_next_psn.
+	 */
+	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
+	    !(priv->s_flags & HFI1_R_TID_SW_PSN)) {
+		struct hfi1_devdata *dd = rcd->dd;
+
+		flow->flow_state.r_next_psn =
+			read_r_next_psn(dd, rcd->ctxt, flow->idx);
+	}
+}
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@ -76,10 +76,8 @@ struct tid_rdma_qp_params {
 struct tid_flow_state {
 	u32 generation;
 	u32 psn;
-	u32 r_next_psn;      /* next PSN to be received (in TID space) */
 	u8 index;
 	u8 last_index;
-	u8 flags;
 };

 enum tid_rdma_req_state {
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@ -53,7 +53,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
 			    "tid_r_comp %u pending_tid_r_segs %u " \
 			    "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
 			    "s_state 0x%x hw_flow_index %u generation 0x%x " \
-			    "fpsn 0x%x flow_flags 0x%x"
+			    "fpsn 0x%x"

 #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
 		    "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
@ -71,7 +71,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
 			    "pending_tid_w_segs %u sync_pt %s " \
 			    "ps_nak_psn 0x%x ps_nak_state 0x%x " \
 			    "prnr_nak_state 0x%x hw_flow_index %u generation "\
-			    "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \
+			    "0x%x fpsn 0x%x resync %s" \
 			    "r_next_psn_kdeth 0x%x"

 #define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
@ -973,7 +973,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
 		__field(u32, hw_flow_index)
 		__field(u32, generation)
 		__field(u32, fpsn)
-		__field(u32, flow_flags)
 	),
 	TP_fast_assign(/* assign */
 		struct hfi1_qp_priv *priv = qp->priv;
@ -991,7 +990,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
 		__entry->hw_flow_index = priv->flow_state.index;
 		__entry->generation = priv->flow_state.generation;
 		__entry->fpsn = priv->flow_state.psn;
-		__entry->flow_flags = priv->flow_state.flags;
 	),
 	TP_printk(/* print */
 		TID_READ_SENDER_PRN,
@ -1007,8 +1005,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
 		__entry->s_state,
 		__entry->hw_flow_index,
 		__entry->generation,
-		__entry->fpsn,
-		__entry->flow_flags
+		__entry->fpsn
 	)
 );

@ -1338,7 +1335,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
 		__field(u32, hw_flow_index)
 		__field(u32, generation)
 		__field(u32, fpsn)
-		__field(u32, flow_flags)
 		__field(bool, resync)
 		__field(u32, r_next_psn_kdeth)
 	),
@ -1360,7 +1356,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
 		__entry->hw_flow_index = priv->flow_state.index;
 		__entry->generation = priv->flow_state.generation;
 		__entry->fpsn = priv->flow_state.psn;
-		__entry->flow_flags = priv->flow_state.flags;
 		__entry->resync = priv->resync;
 		__entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
 	),
@ -1381,7 +1376,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
 		__entry->hw_flow_index,
 		__entry->generation,
 		__entry->fpsn,
-		__entry->flow_flags,
 		__entry->resync ? "yes" : "no",
 		__entry->r_next_psn_kdeth
 	)
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@ -585,7 +585,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
 	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
-	u64 access_mask = ODP_READ_ALLOWED_BIT;
+	u64 access_mask;
 	u64 start_idx, page_mask;
 	struct ib_umem_odp *odp;
 	size_t size;
@ -607,6 +607,7 @@ next_mr:
 	page_shift = mr->umem->page_shift;
 	page_mask = ~(BIT(page_shift) - 1);
 	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+	access_mask = ODP_READ_ALLOWED_BIT;

 	if (prefetch && !downgrade && !mr->umem->writable) {
 		/* prefetch with write-access must