From b2cdeb19f16ad984eb5bb9193f793d05a8101511 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Thu, 13 Oct 2016 11:54:31 +0300
Subject: [PATCH 1/5] drm/vc4: Fix a couple error codes in vc4_cl_lookup_bos()

If the allocation fails the current code returns success.  If
copy_from_user() fails it returns the number of bytes remaining instead
of -EFAULT.

Fixes: d5b1a78a772f ("drm/vc4: Add support for drawing 3D frames.")
Signed-off-by: Dan Carpenter
Reviewed-by: Eric Anholt
---
 drivers/gpu/drm/vc4/vc4_gem.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 47a095f392f8..303f23c96220 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -544,14 +544,15 @@ vc4_cl_lookup_bos(struct drm_device *dev,
 	handles = drm_malloc_ab(exec->bo_count, sizeof(uint32_t));
 	if (!handles) {
+		ret = -ENOMEM;
 		DRM_ERROR("Failed to allocate incoming GEM handles\n");
 		goto fail;
 	}
 
-	ret = copy_from_user(handles,
-			     (void __user *)(uintptr_t)args->bo_handles,
-			     exec->bo_count * sizeof(uint32_t));
-	if (ret) {
+	if (copy_from_user(handles,
+			   (void __user *)(uintptr_t)args->bo_handles,
+			   exec->bo_count * sizeof(uint32_t))) {
+		ret = -EFAULT;
 		DRM_ERROR("Failed to copy in GEM handles\n");
 		goto fail;
 	}

From 457e67a728696c4f8e6423c64e93def50530db9a Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Thu, 20 Oct 2016 16:48:12 -0700
Subject: [PATCH 2/5] drm/vc4: Fix termination of the initial scan for branch targets.

The loop is scanning until the original max_ip (size of the BO), but we
want to not examine any code after the PROG_END's delay slots.  There
was a block trying to do that, except that we had some early continue
statements if the signal wasn't a PROG_END or a BRANCH.

The failure mode would be that a valid shader is rejected because some
undefined memory after the PROG_END slots is parsed as a branch and the
rest of its setup is illegal.  I haven't seen this in the wild, but
valgrind was complaining about this in the userland simulator mode.

Signed-off-by: Eric Anholt
---
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index 2543cf5b8b51..917321ce832f 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -608,9 +608,7 @@ static bool
 vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 {
 	uint32_t max_branch_target = 0;
-	bool found_shader_end = false;
 	int ip;
-	int shader_end_ip = 0;
 	int last_branch = -2;
 
 	for (ip = 0; ip < validation_state->max_ip; ip++) {
@@ -621,8 +619,13 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 		uint32_t branch_target_ip;
 
 		if (sig == QPU_SIG_PROG_END) {
-			shader_end_ip = ip;
-			found_shader_end = true;
+			/* There are two delay slots after program end is
+			 * signaled that are still executed, then we're
+			 * finished.  validation_state->max_ip is the
+			 * instruction after the last valid instruction in the
+			 * program.
+			 */
+			validation_state->max_ip = ip + 3;
 			continue;
 		}
 
@@ -676,15 +679,9 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 		}
 		set_bit(after_delay_ip, validation_state->branch_targets);
 		max_branch_target = max(max_branch_target, after_delay_ip);
-
-		/* There are two delay slots after program end is signaled
-		 * that are still executed, then we're finished.
-		 */
-		if (found_shader_end && ip == shader_end_ip + 2)
-			break;
 	}
 
-	if (max_branch_target > shader_end_ip) {
+	if (max_branch_target > validation_state->max_ip - 3) {
 		DRM_ERROR("Branch landed after QPU_SIG_PROG_END");
 		return false;
 	}

From 7154d76fedf549607afbc0d13db9aaf02da5cebf Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Thu, 3 Nov 2016 18:53:10 -0700
Subject: [PATCH 3/5] drm/vc4: Add support for rendering with ETC1 textures.

The validation for it ends up being quite simple, but I hadn't got
around to it before merging the driver.

For backwards compatibility, we also need to add a flag so that the
userspace GL driver can easily tell if the kernel will allow ETC1
textures (on an old kernel, it will continue to convert to RGBA8).

Signed-off-by: Eric Anholt
---
 drivers/gpu/drm/vc4/vc4_drv.c      | 1 +
 drivers/gpu/drm/vc4/vc4_validate.c | 7 +++++++
 include/uapi/drm/vc4_drm.h         | 1 +
 3 files changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 8703f56b7947..b087404c2784 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -78,6 +78,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
 		pm_runtime_put(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
+	case DRM_VC4_PARAM_SUPPORTS_ETC1:
 		args->value = true;
 		break;
 	default:
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index 26503e307438..e18f88203d32 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -644,6 +644,13 @@ reloc_tex(struct vc4_exec_info *exec,
 		cpp = 1;
 		break;
 	case VC4_TEXTURE_TYPE_ETC1:
+		/* ETC1 is arranged as 64-bit blocks, where each block is 4x4
+		 * pixels.
+		 */
+		cpp = 8;
+		width = (width + 3) >> 2;
+		height = (height + 3) >> 2;
+		break;
 	case VC4_TEXTURE_TYPE_BW1:
 	case VC4_TEXTURE_TYPE_A4:
 	case VC4_TEXTURE_TYPE_A1:
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index ad7edc3edf7c..69caa21f0cb2 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -286,6 +286,7 @@ struct drm_vc4_get_hang_state {
 #define DRM_VC4_PARAM_V3D_IDENT1		1
 #define DRM_VC4_PARAM_V3D_IDENT2		2
 #define DRM_VC4_PARAM_SUPPORTS_BRANCHES		3
+#define DRM_VC4_PARAM_SUPPORTS_ETC1		4
 
 struct drm_vc4_get_param {
 	__u32 param;

From 3a62234680d86efa0239665ed8a0e908f1aef147 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Fri, 4 Nov 2016 15:58:38 -0700
Subject: [PATCH 4/5] drm/vc4: Use runtime autosuspend to avoid thrashing V3D power state.

The pm_runtime_put() we were using immediately released power on the
device, which meant that we were generally turning the device off and
on once per frame.  In many profiles I've looked at, that added up to
about 1% of CPU time, but this could get worse in the case of frequent
rendering and readback (as may happen in X rendering).

By keeping the device on until we've been idle for a couple of frames,
we drop the overhead of runtime PM down to sub-0.1%.
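As a reference for the idiom (an illustrative sketch of the runtime PM
autosuspend pattern, not code lifted verbatim from the driver; "dev" here is
the V3D platform device):

	/* Once, at bind time: opt in to autosuspend before enabling
	 * runtime PM.
	 */
	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, 40); /* a little over 2 frames at 60fps */
	pm_runtime_enable(dev);

	/* Around each use of the hardware: */
	ret = pm_runtime_get_sync(dev);
	if (ret < 0)
		return ret;

	/* ... touch V3D registers / run the job ... */

	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);

With this, the device only powers down once it has been idle for the whole
autosuspend delay, rather than on every pm_runtime_put().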
Signed-off-by: Eric Anholt
---
 drivers/gpu/drm/vc4/vc4_drv.c | 9 ++++++---
 drivers/gpu/drm/vc4/vc4_gem.c | 6 ++++--
 drivers/gpu/drm/vc4/vc4_v3d.c | 2 ++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index b087404c2784..7abfe088f2d1 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -61,21 +61,24 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
 		if (ret < 0)
 			return ret;
 		args->value = V3D_READ(V3D_IDENT0);
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_V3D_IDENT1:
 		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
 		if (ret < 0)
 			return ret;
 		args->value = V3D_READ(V3D_IDENT1);
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_V3D_IDENT2:
 		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
 		if (ret < 0)
 			return ret;
 		args->value = V3D_READ(V3D_IDENT2);
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
 	case DRM_VC4_PARAM_SUPPORTS_ETC1:
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 303f23c96220..db920771bfb5 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -709,8 +709,10 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 	}
 
 	mutex_lock(&vc4->power_lock);
-	if (--vc4->power_refcount == 0)
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+	if (--vc4->power_refcount == 0) {
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
+	}
 	mutex_unlock(&vc4->power_lock);
 
 	kfree(exec);
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index e6d3c6028341..7cc346ad9b0b 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -222,6 +222,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return ret;
 	}
 
+	pm_runtime_use_autosuspend(dev);
+	pm_runtime_set_autosuspend_delay(dev, 40); /* a little over 2 frames. */
 	pm_runtime_enable(dev);
 
 	return 0;

From c778cc5df944291dcdb1ca7a6bb781fbc22550c5 Mon Sep 17 00:00:00 2001
From: Jonas Pfeil
Date: Tue, 8 Nov 2016 00:18:39 +0100
Subject: [PATCH 5/5] drm/vc4: Add fragment shader threading support

FS threading brings performance improvements of 0-20% in glmark2.

The validation code checks for thread switch signals and ensures that
the registers of the other thread are not touched, and that our clamps
are not live across thread switches.  It also checks that the threading
and branching instructions do not interfere.

(Original patch by Jonas, changes by anholt for style cleanup, removing
validation the kernel doesn't need to do, and adding the flag for
userspace).

v2: Minor style fixes from checkpatch.
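The two rules being enforced can be summarized roughly as follows (an
illustrative sketch only; the helper names are invented here, the real checks
are in the vc4_validate_shaders.c changes below):

	/* 1) A threaded fragment shader may only touch the bottom half of
	 *    each physical register file (ra0-ra15 / rb0-rb15); the top
	 *    half belongs to the other thread.
	 */
	static bool regfile_addr_is_upper_half(uint32_t addr)
	{
		return addr >= 16 && addr < 32;
	}

	/* 2) A thread switch signal takes effect after two delay slots, so
	 *    another switch (or a branch) must come at least three
	 *    instructions later, and the r0-r3 clamp tracking is reset once
	 *    the other thread has had a chance to run.
	 */
	static bool thread_switch_allowed(uint32_t ip, uint32_t last_thread_switch_ip)
	{
		return ip >= last_thread_switch_ip + 3;
	}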
Signed-off-by: Jonas Pfeil
Signed-off-by: Eric Anholt
---
 drivers/gpu/drm/vc4/vc4_drv.c              |  1 +
 drivers/gpu/drm/vc4/vc4_drv.h              |  2 +
 drivers/gpu/drm/vc4/vc4_validate.c         | 17 ++++--
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 63 ++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h                 |  1 +
 5 files changed, 79 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 7abfe088f2d1..86aabf6d0f79 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
 		break;
 	case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
 	case DRM_VC4_PARAM_SUPPORTS_ETC1:
+	case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
 		args->value = true;
 		break;
 	default:
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 7c1e4d97486f..fef172804345 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -381,6 +381,8 @@ struct vc4_validated_shader_info {
 
 	uint32_t num_uniform_addr_offsets;
 	uint32_t *uniform_addr_offsets;
+
+	bool is_threaded;
 };
 
 /**
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index e18f88203d32..9fd171c361c2 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device *dev,
 	exec->shader_rec_v += roundup(packet_size, 16);
 	exec->shader_rec_size -= packet_size;
 
-	if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
-		DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
-		return -EINVAL;
-	}
-
 	for (i = 0; i < shader_reloc_count; i++) {
 		if (src_handles[i] > exec->bo_count) {
 			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device *dev,
 		return -EINVAL;
 	}
 
+	if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
+	    to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
+		DRM_ERROR("Thread mode of CL and FS do not match\n");
+		return -EINVAL;
+	}
+
+	if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
+	    to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
+		DRM_ERROR("cs and vs cannot be threaded\n");
+		return -EINVAL;
+	}
+
 	for (i = 0; i < shader_reloc_count; i++) {
 		struct vc4_validated_shader_info *validated_shader;
 		uint32_t o = shader_reloc_offsets[i];
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index 917321ce832f..5dba13dd1e9b 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
 	 * basic blocks.
 	 */
 	bool needs_uniform_address_for_loop;
+
+	/* Set when we find an instruction writing the top half of the
+	 * register files.  If we allowed writing the unusable regs in
+	 * a threaded shader, then the other shader running on our
+	 * QPU's clamp validation would be invalid.
+	 */
+	bool all_registers_used;
 };
 
 static uint32_t
@@ -118,6 +125,13 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
 	return ~0;
 }
 
+static bool
+live_reg_is_upper_half(uint32_t lri)
+{
+	return	(lri >= 16 && lri < 32) ||
+		(lri >= 32 + 16 && lri < 32 + 32);
+}
+
 static bool
 is_tmu_submit(uint32_t waddr)
 {
@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
 		} else {
 			validation_state->live_immediates[lri] = ~0;
 		}
+
+		if (live_reg_is_upper_half(lri))
+			validation_state->all_registers_used = true;
 	}
 
 	switch (waddr) {
@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
 		}
 	}
 
+	if ((raddr_a >= 16 && raddr_a < 32) ||
+	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
+		validation_state->all_registers_used = true;
+	}
+
 	return true;
 }
 
@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
 	bool found_shader_end = false;
 	int shader_end_ip = 0;
+	uint32_t last_thread_switch_ip = -3;
 	uint32_t ip;
 	struct vc4_validated_shader_info *validated_shader = NULL;
 	struct vc4_shader_validation_state validation_state;
@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		if (!vc4_handle_branch_target(&validation_state))
 			goto fail;
 
+		if (ip == last_thread_switch_ip + 3) {
+			/* Reset r0-r3 live clamp data */
+			int i;
+
+			for (i = 64; i < LIVE_REG_COUNT; i++) {
+				validation_state.live_min_clamp_offsets[i] = ~0;
+				validation_state.live_max_clamp_regs[i] = false;
+				validation_state.live_immediates[i] = ~0;
+			}
+		}
+
 		switch (sig) {
 		case QPU_SIG_NONE:
 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		case QPU_SIG_LOAD_TMU1:
 		case QPU_SIG_PROG_END:
 		case QPU_SIG_SMALL_IMM:
+		case QPU_SIG_THREAD_SWITCH:
+		case QPU_SIG_LAST_THREAD_SWITCH:
 			if (!check_instruction_writes(validated_shader,
 						      &validation_state)) {
 				DRM_ERROR("Bad write at ip %d\n", ip);
@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 				shader_end_ip = ip;
 			}
 
+			if (sig == QPU_SIG_THREAD_SWITCH ||
+			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
+				validated_shader->is_threaded = true;
+
+				if (ip < last_thread_switch_ip + 3) {
+					DRM_ERROR("Thread switch too soon after "
+						  "last switch at ip %d\n", ip);
+					goto fail;
+				}
+				last_thread_switch_ip = ip;
+			}
+
 			break;
 
 		case QPU_SIG_LOAD_IMM:
@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 			if (!check_branch(inst, validated_shader,
 					  &validation_state, ip))
 				goto fail;
+
+			if (ip < last_thread_switch_ip + 3) {
+				DRM_ERROR("Branch in thread switch at ip %d",
+					  ip);
+				goto fail;
+			}
+
 			break;
 		default:
 			DRM_ERROR("Unsupported QPU signal %d at "
@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		goto fail;
 	}
 
+	/* Might corrupt other thread */
+	if (validated_shader->is_threaded &&
+	    validation_state.all_registers_used) {
+		DRM_ERROR("Shader uses threading, but uses the upper "
+			  "half of the registers, too\n");
+		goto fail;
+	}
+
 	/* If we did a backwards branch and we haven't emitted a uniforms
 	 * reset since then, we still need the uniforms stream to have the
 	 * uniforms address available so that the backwards branch can do its
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 69caa21f0cb2..f07a09016726 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
 #define DRM_VC4_PARAM_V3D_IDENT2		2
 #define DRM_VC4_PARAM_SUPPORTS_BRANCHES		3
 #define DRM_VC4_PARAM_SUPPORTS_ETC1		4
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS	5
 
 struct drm_vc4_get_param {
 	__u32 param;
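For the userspace side, probing the flags added in this series follows the
usual getparam pattern.  A minimal sketch (assuming libdrm's drmIoctl() and
the DRM_IOCTL_VC4_GET_PARAM ioctl from the updated vc4_drm.h; any ioctl
failure is treated as "not supported" so old kernels degrade gracefully):

	#include <stdbool.h>
	#include <stdint.h>
	#include <xf86drm.h>
	#include "vc4_drm.h"

	static bool vc4_has_param(int fd, uint32_t param)
	{
		struct drm_vc4_get_param p = {
			.param = param,
		};

		if (drmIoctl(fd, DRM_IOCTL_VC4_GET_PARAM, &p) != 0)
			return false; /* e.g. an old kernel rejecting an unknown param */

		return p.value != 0;
	}

	/* Usage:
	 *   bool has_etc1 = vc4_has_param(fd, DRM_VC4_PARAM_SUPPORTS_ETC1);
	 *   bool has_threaded_fs = vc4_has_param(fd, DRM_VC4_PARAM_SUPPORTS_THREADED_FS);
	 */

When either flag is absent, the GL driver keeps its existing fallbacks
(converting ETC1 to RGBA8 at upload time, and compiling single-threaded
fragment shaders).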