diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index eaf26d9b5f11..1dab9e5b3689 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -61,23 +61,28 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data, if (ret < 0) return ret; args->value = V3D_READ(V3D_IDENT0); - pm_runtime_put(&vc4->v3d->pdev->dev); + pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev); + pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev); break; case DRM_VC4_PARAM_V3D_IDENT1: ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); if (ret < 0) return ret; args->value = V3D_READ(V3D_IDENT1); - pm_runtime_put(&vc4->v3d->pdev->dev); + pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev); + pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev); break; case DRM_VC4_PARAM_V3D_IDENT2: ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); if (ret < 0) return ret; args->value = V3D_READ(V3D_IDENT2); - pm_runtime_put(&vc4->v3d->pdev->dev); + pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev); + pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev); break; case DRM_VC4_PARAM_SUPPORTS_BRANCHES: + case DRM_VC4_PARAM_SUPPORTS_ETC1: + case DRM_VC4_PARAM_SUPPORTS_THREADED_FS: args->value = true; break; default: diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h index 7c1e4d97486f..fef172804345 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.h +++ b/drivers/gpu/drm/vc4/vc4_drv.h @@ -381,6 +381,8 @@ struct vc4_validated_shader_info { uint32_t num_uniform_addr_offsets; uint32_t *uniform_addr_offsets; + + bool is_threaded; }; /** diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index 47a095f392f8..db920771bfb5 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -544,14 +544,15 @@ vc4_cl_lookup_bos(struct drm_device *dev, handles = drm_malloc_ab(exec->bo_count, sizeof(uint32_t)); if (!handles) { + ret = -ENOMEM; DRM_ERROR("Failed to allocate incoming GEM handles\n"); goto fail; } - ret = copy_from_user(handles, - (void __user *)(uintptr_t)args->bo_handles, - exec->bo_count * sizeof(uint32_t)); - if (ret) { + if (copy_from_user(handles, + (void __user *)(uintptr_t)args->bo_handles, + exec->bo_count * sizeof(uint32_t))) { + ret = -EFAULT; DRM_ERROR("Failed to copy in GEM handles\n"); goto fail; } @@ -708,8 +709,10 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec) } mutex_lock(&vc4->power_lock); - if (--vc4->power_refcount == 0) - pm_runtime_put(&vc4->v3d->pdev->dev); + if (--vc4->power_refcount == 0) { + pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev); + pm_runtime_put_autosuspend(&vc4->v3d->pdev->dev); + } mutex_unlock(&vc4->power_lock); kfree(exec); diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c index e6d3c6028341..7cc346ad9b0b 100644 --- a/drivers/gpu/drm/vc4/vc4_v3d.c +++ b/drivers/gpu/drm/vc4/vc4_v3d.c @@ -222,6 +222,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data) return ret; } + pm_runtime_use_autosuspend(dev); + pm_runtime_set_autosuspend_delay(dev, 40); /* a little over 2 frames. */ pm_runtime_enable(dev); return 0; diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c index 26503e307438..9fd171c361c2 100644 --- a/drivers/gpu/drm/vc4/vc4_validate.c +++ b/drivers/gpu/drm/vc4/vc4_validate.c @@ -644,6 +644,13 @@ reloc_tex(struct vc4_exec_info *exec, cpp = 1; break; case VC4_TEXTURE_TYPE_ETC1: + /* ETC1 is arranged as 64-bit blocks, where each block is 4x4 + * pixels. + */ + cpp = 8; + width = (width + 3) >> 2; + height = (height + 3) >> 2; + break; case VC4_TEXTURE_TYPE_BW1: case VC4_TEXTURE_TYPE_A4: case VC4_TEXTURE_TYPE_A1: @@ -782,11 +789,6 @@ validate_gl_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { - DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); - return -EINVAL; - } - for (i = 0; i < shader_reloc_count; i++) { if (src_handles[i] > exec->bo_count) { DRM_ERROR("Shader handle %d too big\n", src_handles[i]); @@ -803,6 +805,18 @@ validate_gl_shader_rec(struct drm_device *dev, return -EINVAL; } + if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) != + to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) { + DRM_ERROR("Thread mode of CL and FS do not match\n"); + return -EINVAL; + } + + if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded || + to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) { + DRM_ERROR("cs and vs cannot be threaded\n"); + return -EINVAL; + } + for (i = 0; i < shader_reloc_count; i++) { struct vc4_validated_shader_info *validated_shader; uint32_t o = shader_reloc_offsets[i]; diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c index 2543cf5b8b51..5dba13dd1e9b 100644 --- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c +++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c @@ -83,6 +83,13 @@ struct vc4_shader_validation_state { * basic blocks. */ bool needs_uniform_address_for_loop; + + /* Set when we find an instruction writing the top half of the + * register files. If we allowed writing the unusable regs in + * a threaded shader, then the other shader running on our + * QPU's clamp validation would be invalid. + */ + bool all_registers_used; }; static uint32_t @@ -118,6 +125,13 @@ raddr_add_a_to_live_reg_index(uint64_t inst) return ~0; } +static bool +live_reg_is_upper_half(uint32_t lri) +{ + return (lri >= 16 && lri < 32) || + (lri >= 32 + 16 && lri < 32 + 32); +} + static bool is_tmu_submit(uint32_t waddr) { @@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader, } else { validation_state->live_immediates[lri] = ~0; } + + if (live_reg_is_upper_half(lri)) + validation_state->all_registers_used = true; } switch (waddr) { @@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader, } } + if ((raddr_a >= 16 && raddr_a < 32) || + (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) { + validation_state->all_registers_used = true; + } + return true; } @@ -608,9 +630,7 @@ static bool vc4_validate_branches(struct vc4_shader_validation_state *validation_state) { uint32_t max_branch_target = 0; - bool found_shader_end = false; int ip; - int shader_end_ip = 0; int last_branch = -2; for (ip = 0; ip < validation_state->max_ip; ip++) { @@ -621,8 +641,13 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state) uint32_t branch_target_ip; if (sig == QPU_SIG_PROG_END) { - shader_end_ip = ip; - found_shader_end = true; + /* There are two delay slots after program end is + * signaled that are still executed, then we're + * finished. validation_state->max_ip is the + * instruction after the last valid instruction in the + * program. + */ + validation_state->max_ip = ip + 3; continue; } @@ -676,15 +701,9 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state) } set_bit(after_delay_ip, validation_state->branch_targets); max_branch_target = max(max_branch_target, after_delay_ip); - - /* There are two delay slots after program end is signaled - * that are still executed, then we're finished. - */ - if (found_shader_end && ip == shader_end_ip + 2) - break; } - if (max_branch_target > shader_end_ip) { + if (max_branch_target > validation_state->max_ip - 3) { DRM_ERROR("Branch landed after QPU_SIG_PROG_END"); return false; } @@ -756,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) { bool found_shader_end = false; int shader_end_ip = 0; + uint32_t last_thread_switch_ip = -3; uint32_t ip; struct vc4_validated_shader_info *validated_shader = NULL; struct vc4_shader_validation_state validation_state; @@ -788,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) if (!vc4_handle_branch_target(&validation_state)) goto fail; + if (ip == last_thread_switch_ip + 3) { + /* Reset r0-r3 live clamp data */ + int i; + + for (i = 64; i < LIVE_REG_COUNT; i++) { + validation_state.live_min_clamp_offsets[i] = ~0; + validation_state.live_max_clamp_regs[i] = false; + validation_state.live_immediates[i] = ~0; + } + } + switch (sig) { case QPU_SIG_NONE: case QPU_SIG_WAIT_FOR_SCOREBOARD: @@ -797,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) case QPU_SIG_LOAD_TMU1: case QPU_SIG_PROG_END: case QPU_SIG_SMALL_IMM: + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: if (!check_instruction_writes(validated_shader, &validation_state)) { DRM_ERROR("Bad write at ip %d\n", ip); @@ -812,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) shader_end_ip = ip; } + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + validated_shader->is_threaded = true; + + if (ip < last_thread_switch_ip + 3) { + DRM_ERROR("Thread switch too soon after " + "last switch at ip %d\n", ip); + goto fail; + } + last_thread_switch_ip = ip; + } + break; case QPU_SIG_LOAD_IMM: @@ -826,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) if (!check_branch(inst, validated_shader, &validation_state, ip)) goto fail; + + if (ip < last_thread_switch_ip + 3) { + DRM_ERROR("Branch in thread switch at ip %d", + ip); + goto fail; + } + break; default: DRM_ERROR("Unsupported QPU signal %d at " @@ -847,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) goto fail; } + /* Might corrupt other thread */ + if (validated_shader->is_threaded && + validation_state.all_registers_used) { + DRM_ERROR("Shader uses threading, but uses the upper " + "half of the registers, too\n"); + goto fail; + } + /* If we did a backwards branch and we haven't emitted a uniforms * reset since then, we still need the uniforms stream to have the * uniforms address available so that the backwards branch can do its diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h index ad7edc3edf7c..f07a09016726 100644 --- a/include/uapi/drm/vc4_drm.h +++ b/include/uapi/drm/vc4_drm.h @@ -286,6 +286,8 @@ struct drm_vc4_get_hang_state { #define DRM_VC4_PARAM_V3D_IDENT1 1 #define DRM_VC4_PARAM_V3D_IDENT2 2 #define DRM_VC4_PARAM_SUPPORTS_BRANCHES 3 +#define DRM_VC4_PARAM_SUPPORTS_ETC1 4 +#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5 struct drm_vc4_get_param { __u32 param;