From ad4fe1264b396e94b78d91c49ecea425a593b28d Mon Sep 17 00:00:00 2001
From: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Date: Sun, 20 Sep 2020 14:23:22 +0300
Subject: [PATCH] spi: dw-dma: Add one-by-one SG list entries transfer

In case if at least one of the requested DMA engine channels doesn't
support the hardware accelerated SG list entries traverse, the DMA driver
will most likely work that around by performing the IRQ-based SG list
entries resubmission. That might and will cause a problem if the DMA Tx
channel is recharged and re-executed before the Rx DMA channel. Due to
non-deterministic IRQ-handler execution latency the DMA Tx channel will
start pushing data to the SPI bus before the Rx DMA channel is even
reinitialized with the next inbound SG list entry. By doing so the DMA
Tx channel will implicitly start filling the DW APB SSI Rx FIFO up, which
while the DMA Rx channel being recharged and re-executed will eventually
be overflown.

In order to solve the problem we have to feed the DMA engine with SG
list entries one-by-one. It shall keep the DW APB SSI Tx and Rx FIFOs
synchronized and prevent the Rx FIFO overflow. Since in general the SPI
tx_sg and rx_sg lists may have different number of entries of different
lengths (though total length should match) we virtually split the
SG-lists to the set of DMA transfers, which length is a minimum of the
ordered SG-entries lengths.

The solution described above is only executed if a full-duplex SPI
transfer is requested and the DMA engine hasn't provided channels with
hardware accelerated SG list traverse capability to handle both SG
lists at once.

Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20200920112322.24585-12-Sergey.Semin@baikalelectronics.ru
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-dw-dma.c | 139 ++++++++++++++++++++++++++++++++++++++-
 drivers/spi/spi-dw.h     |   1 +
 2 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-dw-dma.c b/drivers/spi/spi-dw-dma.c
index f333c2e23bf6..1cbb5a9efbba 100644
--- a/drivers/spi/spi-dw-dma.c
+++ b/drivers/spi/spi-dw-dma.c
@@ -72,6 +72,23 @@ static void dw_spi_dma_maxburst_init(struct dw_spi *dws)
 	dw_writel(dws, DW_SPI_DMATDLR, dws->txburst);
 }
 
+static void dw_spi_dma_sg_burst_init(struct dw_spi *dws)
+{
+	struct dma_slave_caps tx = {0}, rx = {0};
+
+	dma_get_slave_caps(dws->txchan, &tx);
+	dma_get_slave_caps(dws->rxchan, &rx);
+
+	if (tx.max_sg_burst > 0 && rx.max_sg_burst > 0)
+		dws->dma_sg_burst = min(tx.max_sg_burst, rx.max_sg_burst);
+	else if (tx.max_sg_burst > 0)
+		dws->dma_sg_burst = tx.max_sg_burst;
+	else if (rx.max_sg_burst > 0)
+		dws->dma_sg_burst = rx.max_sg_burst;
+	else
+		dws->dma_sg_burst = 0;
+}
+
 static int dw_spi_dma_init_mfld(struct device *dev, struct dw_spi *dws)
 {
 	struct dw_dma_slave dma_tx = { .dst_id = 1 }, *tx = &dma_tx;
@@ -109,6 +126,8 @@ static int dw_spi_dma_init_mfld(struct device *dev, struct dw_spi *dws)
 
 	dw_spi_dma_maxburst_init(dws);
 
+	dw_spi_dma_sg_burst_init(dws);
+
 	return 0;
 
 free_rxchan:
@@ -138,6 +157,8 @@ static int dw_spi_dma_init_generic(struct device *dev, struct dw_spi *dws)
 
 	dw_spi_dma_maxburst_init(dws);
 
+	dw_spi_dma_sg_burst_init(dws);
+
 	return 0;
 }
 
@@ -466,11 +487,125 @@ err_clear_dmac:
 	return ret;
 }
 
-static int dw_spi_dma_transfer(struct dw_spi *dws, struct spi_transfer *xfer)
+/*
+ * In case if at least one of the requested DMA channels doesn't support the
+ * hardware accelerated SG list entries traverse, the DMA driver will most
+ * likely work that around by performing the IRQ-based SG list entries
+ * resubmission. That might and will cause a problem if the DMA Tx channel is
+ * recharged and re-executed before the Rx DMA channel. Due to
+ * non-deterministic IRQ-handler execution latency the DMA Tx channel will
+ * start pushing data to the SPI bus before the Rx DMA channel is even
+ * reinitialized with the next inbound SG list entry. By doing so the DMA Tx
+ * channel will implicitly start filling the DW APB SSI Rx FIFO up, which while
+ * the DMA Rx channel being recharged and re-executed will eventually be
+ * overflown.
+ *
+ * In order to solve the problem we have to feed the DMA engine with SG list
+ * entries one-by-one. It shall keep the DW APB SSI Tx and Rx FIFOs
+ * synchronized and prevent the Rx FIFO overflow. Since in general the tx_sg
+ * and rx_sg lists may have different number of entries of different lengths
+ * (though total length should match) let's virtually split the SG-lists to the
+ * set of DMA transfers, which length is a minimum of the ordered SG-entries
+ * lengths. An ASCII-sketch of the implemented algo is following:
+ *                  xfer->len
+ *                |___________|
+ * tx_sg list:    |___|____|__|
+ * rx_sg list:    |_|____|____|
+ * DMA transfers: |_|_|__|_|__|
+ *
+ * Note in order to have this workaround solving the denoted problem the DMA
+ * engine driver should properly initialize the max_sg_burst capability and set
+ * the DMA device max segment size parameter with maximum data block size the
+ * DMA engine supports.
+ */
+
+static int dw_spi_dma_transfer_one(struct dw_spi *dws,
+				   struct spi_transfer *xfer)
 {
+	struct scatterlist *tx_sg = NULL, *rx_sg = NULL, tx_tmp, rx_tmp;
+	unsigned int tx_len = 0, rx_len = 0;
+	unsigned int base, len;
 	int ret;
 
-	ret = dw_spi_dma_transfer_all(dws, xfer);
+	sg_init_table(&tx_tmp, 1);
+	sg_init_table(&rx_tmp, 1);
+
+	for (base = 0, len = 0; base < xfer->len; base += len) {
+		/* Fetch next Tx DMA data chunk */
+		if (!tx_len) {
+			tx_sg = !tx_sg ? &xfer->tx_sg.sgl[0] : sg_next(tx_sg);
+			sg_dma_address(&tx_tmp) = sg_dma_address(tx_sg);
+			tx_len = sg_dma_len(tx_sg);
+		}
+
+		/* Fetch next Rx DMA data chunk */
+		if (!rx_len) {
+			rx_sg = !rx_sg ? &xfer->rx_sg.sgl[0] : sg_next(rx_sg);
+			sg_dma_address(&rx_tmp) = sg_dma_address(rx_sg);
+			rx_len = sg_dma_len(rx_sg);
+		}
+
+		len = min(tx_len, rx_len);
+
+		sg_dma_len(&tx_tmp) = len;
+		sg_dma_len(&rx_tmp) = len;
+
+		/* Submit DMA Tx transfer */
+		ret = dw_spi_dma_submit_tx(dws, &tx_tmp, 1);
+		if (ret)
+			break;
+
+		/* Submit DMA Rx transfer */
+		ret = dw_spi_dma_submit_rx(dws, &rx_tmp, 1);
+		if (ret)
+			break;
+
+		/* Rx must be started before Tx due to SPI instinct */
+		dma_async_issue_pending(dws->rxchan);
+
+		dma_async_issue_pending(dws->txchan);
+
+		/*
+		 * Here we only need to wait for the DMA transfer to be
+		 * finished since SPI controller is kept enabled during the
+		 * procedure this loop implements and there is no risk to lose
+		 * data left in the Tx/Rx FIFOs.
+		 */
+		ret = dw_spi_dma_wait(dws, len, xfer->effective_speed_hz);
+		if (ret)
+			break;
+
+		reinit_completion(&dws->dma_completion);
+
+		sg_dma_address(&tx_tmp) += len;
+		sg_dma_address(&rx_tmp) += len;
+		tx_len -= len;
+		rx_len -= len;
+	}
+
+	dw_writel(dws, DW_SPI_DMACR, 0);
+
+	return ret;
+}
+
+static int dw_spi_dma_transfer(struct dw_spi *dws, struct spi_transfer *xfer)
+{
+	unsigned int nents;
+	int ret;
+
+	nents = max(xfer->tx_sg.nents, xfer->rx_sg.nents);
+
+	/*
+	 * Execute normal DMA-based transfer (which submits the Rx and Tx SG
+	 * lists directly to the DMA engine at once) if either full hardware
+	 * accelerated SG list traverse is supported by both channels, or the
+	 * Tx-only SPI transfer is requested, or the DMA engine is capable to
+	 * handle both SG lists on hardware accelerated basis.
+	 */
+	if (!dws->dma_sg_burst || !xfer->rx_buf || nents <= dws->dma_sg_burst)
+		ret = dw_spi_dma_transfer_all(dws, xfer);
+	else
+		ret = dw_spi_dma_transfer_one(dws, xfer);
 	if (ret)
 		return ret;
 
diff --git a/drivers/spi/spi-dw.h b/drivers/spi/spi-dw.h
index 90dfd21622d6..f04075413dba 100644
--- a/drivers/spi/spi-dw.h
+++ b/drivers/spi/spi-dw.h
@@ -149,6 +149,7 @@ struct dw_spi {
 	u32			txburst;
 	struct dma_chan		*rxchan;
 	u32			rxburst;
+	u32			dma_sg_burst;
 	unsigned long		dma_chan_busy;
 	dma_addr_t		dma_addr; /* phy address of the Data register */
 	const struct dw_spi_dma_ops *dma_ops;