diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index eaf26d9b5f11..1dab9e5b3689 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -61,23 +61,28 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
 		if (ret < 0)
 			return ret;
 		args->value = V3D_READ(V3D_IDENT0);
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_V3D_IDENT1:
 		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
 		if (ret < 0)
 			return ret;
 		args->value = V3D_READ(V3D_IDENT1);
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_V3D_IDENT2:
 		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
 		if (ret < 0)
 			return ret;
 		args->value = V3D_READ(V3D_IDENT2);
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
 		break;
 	case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
+	case DRM_VC4_PARAM_SUPPORTS_ETC1:
+	case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
 		args->value = true;
 		break;
 	default:
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 7c1e4d97486f..fef172804345 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -381,6 +381,8 @@ struct vc4_validated_shader_info {
 
 	uint32_t num_uniform_addr_offsets;
 	uint32_t *uniform_addr_offsets;
+
+	bool is_threaded;
 };
 
 /**
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 47a095f392f8..db920771bfb5 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -544,14 +544,15 @@ vc4_cl_lookup_bos(struct drm_device *dev,
 
 	handles = drm_malloc_ab(exec->bo_count, sizeof(uint32_t));
 	if (!handles) {
+		ret = -ENOMEM;
 		DRM_ERROR("Failed to allocate incoming GEM handles\n");
 		goto fail;
 	}
 
-	ret = copy_from_user(handles,
-			     (void __user *)(uintptr_t)args->bo_handles,
-			     exec->bo_count * sizeof(uint32_t));
-	if (ret) {
+	if (copy_from_user(handles,
+			   (void __user *)(uintptr_t)args->bo_handles,
+			   exec->bo_count * sizeof(uint32_t))) {
+		ret = -EFAULT;
 		DRM_ERROR("Failed to copy in GEM handles\n");
 		goto fail;
 	}
@@ -708,8 +709,10 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 	}
 
 	mutex_lock(&vc4->power_lock);
-	if (--vc4->power_refcount == 0)
-		pm_runtime_put(&vc4->v3d->pdev->dev);
+	if (--vc4->power_refcount == 0) {
+		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
+		pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev);
+	}
 	mutex_unlock(&vc4->power_lock);
 
 	kfree(exec);
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index e6d3c6028341..7cc346ad9b0b 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -222,6 +222,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return ret;
 	}
 
+	pm_runtime_use_autosuspend(dev);
+	pm_runtime_set_autosuspend_delay(dev, 40); /* a little over 2 frames. */
 	pm_runtime_enable(dev);
 
 	return 0;
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index 26503e307438..9fd171c361c2 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -644,6 +644,13 @@ reloc_tex(struct vc4_exec_info *exec,
 		cpp = 1;
 		break;
 	case VC4_TEXTURE_TYPE_ETC1:
+		/* ETC1 is arranged as 64-bit blocks, where each block is 4x4
+		 * pixels.
+		 */
+		cpp = 8;
+		width = (width + 3) >> 2;
+		height = (height + 3) >> 2;
+		break;
 	case VC4_TEXTURE_TYPE_BW1:
 	case VC4_TEXTURE_TYPE_A4:
 	case VC4_TEXTURE_TYPE_A1:
@@ -782,11 +789,6 @@ validate_gl_shader_rec(struct drm_device *dev,
 	exec->shader_rec_v += roundup(packet_size, 16);
 	exec->shader_rec_size -= packet_size;
 
-	if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
-		DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
-		return -EINVAL;
-	}
-
 	for (i = 0; i < shader_reloc_count; i++) {
 		if (src_handles[i] > exec->bo_count) {
 			DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
@@ -803,6 +805,18 @@ validate_gl_shader_rec(struct drm_device *dev,
 			return -EINVAL;
 	}
 
+	if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
+	    to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
+		DRM_ERROR("Thread mode of CL and FS do not match\n");
+		return -EINVAL;
+	}
+
+	if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
+	    to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
+		DRM_ERROR("cs and vs cannot be threaded\n");
+		return -EINVAL;
+	}
+
 	for (i = 0; i < shader_reloc_count; i++) {
 		struct vc4_validated_shader_info *validated_shader;
 		uint32_t o = shader_reloc_offsets[i];
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index 2543cf5b8b51..5dba13dd1e9b 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
 	 * basic blocks.
 	 */
 	bool needs_uniform_address_for_loop;
+
+	/* Set when we find an instruction writing the top half of the
+	 * register files.  If we allowed writing the unusable regs in
+	 * a threaded shader, then the other shader running on our
+	 * QPU's clamp validation would be invalid.
+	 */
+	bool all_registers_used;
 };
 
 static uint32_t
@@ -118,6 +125,13 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
 		return ~0;
 }
 
+static bool
+live_reg_is_upper_half(uint32_t lri)
+{
+	return	(lri >= 16 && lri < 32) ||
+		(lri >= 32 + 16 && lri < 32 + 32);
+}
+
 static bool
 is_tmu_submit(uint32_t waddr)
 {
@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
 		} else {
 			validation_state->live_immediates[lri] = ~0;
 		}
+
+		if (live_reg_is_upper_half(lri))
+			validation_state->all_registers_used = true;
 	}
 
 	switch (waddr) {
@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
 		}
 	}
 
+	if ((raddr_a >= 16 && raddr_a < 32) ||
+	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
+		validation_state->all_registers_used = true;
+	}
+
 	return true;
 }
 
@@ -608,9 +630,7 @@ static bool
 vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 {
 	uint32_t max_branch_target = 0;
-	bool found_shader_end = false;
 	int ip;
-	int shader_end_ip = 0;
 	int last_branch = -2;
 
 	for (ip = 0; ip < validation_state->max_ip; ip++) {
@@ -621,8 +641,13 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 		uint32_t branch_target_ip;
 
 		if (sig == QPU_SIG_PROG_END) {
-			shader_end_ip = ip;
-			found_shader_end = true;
+			/* There are two delay slots after program end is
+			 * signaled that are still executed, then we're
+			 * finished.  validation_state->max_ip is the
+			 * instruction after the last valid instruction in the
+			 * program.
+			 */
+			validation_state->max_ip = ip + 3;
 			continue;
 		}
 
@@ -676,15 +701,9 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 		}
 		set_bit(after_delay_ip, validation_state->branch_targets);
 		max_branch_target = max(max_branch_target, after_delay_ip);
-
-		/* There are two delay slots after program end is signaled
-		 * that are still executed, then we're finished.
-		 */
-		if (found_shader_end && ip == shader_end_ip + 2)
-			break;
 	}
 
-	if (max_branch_target > shader_end_ip) {
+	if (max_branch_target > validation_state->max_ip - 3) {
 		DRM_ERROR("Branch landed after QPU_SIG_PROG_END");
 		return false;
 	}
@@ -756,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
 	bool found_shader_end = false;
 	int shader_end_ip = 0;
+	uint32_t last_thread_switch_ip = -3;
 	uint32_t ip;
 	struct vc4_validated_shader_info *validated_shader = NULL;
 	struct vc4_shader_validation_state validation_state;
@@ -788,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		if (!vc4_handle_branch_target(&validation_state))
 			goto fail;
 
+		if (ip == last_thread_switch_ip + 3) {
+			/* Reset r0-r3 live clamp data */
+			int i;
+
+			for (i = 64; i < LIVE_REG_COUNT; i++) {
+				validation_state.live_min_clamp_offsets[i] = ~0;
+				validation_state.live_max_clamp_regs[i] = false;
+				validation_state.live_immediates[i] = ~0;
+			}
+		}
+
 		switch (sig) {
 		case QPU_SIG_NONE:
 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -797,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		case QPU_SIG_LOAD_TMU1:
 		case QPU_SIG_PROG_END:
 		case QPU_SIG_SMALL_IMM:
+		case QPU_SIG_THREAD_SWITCH:
+		case QPU_SIG_LAST_THREAD_SWITCH:
 			if (!check_instruction_writes(validated_shader,
 						      &validation_state)) {
 				DRM_ERROR("Bad write at ip %d\n", ip);
@@ -812,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 				shader_end_ip = ip;
 			}
 
+			if (sig == QPU_SIG_THREAD_SWITCH ||
+			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
+				validated_shader->is_threaded = true;
+
+				if (ip < last_thread_switch_ip + 3) {
+					DRM_ERROR("Thread switch too soon after "
+						  "last switch at ip %d\n", ip);
+					goto fail;
+				}
+				last_thread_switch_ip = ip;
+			}
+
 			break;
 
 		case QPU_SIG_LOAD_IMM:
@@ -826,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 			if (!check_branch(inst, validated_shader,
 					  &validation_state, ip))
 				goto fail;
+
+			if (ip < last_thread_switch_ip + 3) {
+				DRM_ERROR("Branch in thread switch at ip %d",
+					  ip);
+				goto fail;
+			}
+
 			break;
 		default:
 			DRM_ERROR("Unsupported QPU signal %d at "
@@ -847,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		goto fail;
 	}
 
+	/* Might corrupt other thread */
+	if (validated_shader->is_threaded &&
+	    validation_state.all_registers_used) {
+		DRM_ERROR("Shader uses threading, but uses the upper "
+			  "half of the registers, too\n");
+		goto fail;
+	}
+
 	/* If we did a backwards branch and we haven't emitted a uniforms
 	 * reset since then, we still need the uniforms stream to have the
 	 * uniforms address available so that the backwards branch can do its
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index ad7edc3edf7c..f07a09016726 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -286,6 +286,8 @@ struct drm_vc4_get_hang_state {
 #define DRM_VC4_PARAM_V3D_IDENT1		1
 #define DRM_VC4_PARAM_V3D_IDENT2		2
 #define DRM_VC4_PARAM_SUPPORTS_BRANCHES		3
+#define DRM_VC4_PARAM_SUPPORTS_ETC1		4
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS	5
 
 struct drm_vc4_get_param {
 	__u32 param;