YJIT: Fancier opt_getinlinecache

Make sure `opt_getinlinecache` is in a block all on its own, and
invalidate that block from the interpreter when `opt_setinlinecache` runs.
The block will recompile with a filled cache the second time around.
This lets YJIT run well when the IC for a constant is cold.
This commit is contained in:
Alan Wu 2021-03-24 18:07:26 -04:00
Родитель e81d1f4ae3
Коммит b626dd7211
9 изменённых файлов: 146 добавлений и 73 удалений

Просмотреть файл

@ -2259,6 +2259,7 @@ iseq_set_sequence(rb_iseq_t *iseq, LINK_ANCHOR *const anchor)
VALUE *generated_iseq;
rb_event_flag_t events = 0;
long data = 0;
long getinlinecache_idx = -1;
int insn_num, code_index, insns_info_index, sp = 0;
int stack_max = fix_sp_depth(iseq, anchor);
@ -2362,6 +2363,11 @@ iseq_set_sequence(rb_iseq_t *iseq, LINK_ANCHOR *const anchor)
types = insn_op_types(insn);
len = insn_len(insn);
if (insn == BIN(opt_getinlinecache)) {
assert(getinlinecache_idx < 0 && "one get per set, no nesting");
getinlinecache_idx = code_index;
}
for (j = 0; types[j]; j++) {
char type = types[j];
/* printf("--> [%c - (%d-%d)]\n", type, k, j); */
@ -2419,6 +2425,13 @@ iseq_set_sequence(rb_iseq_t *iseq, LINK_ANCHOR *const anchor)
}
generated_iseq[code_index + 1 + j] = (VALUE)ic;
FL_SET(iseqv, ISEQ_MARKABLE_ISEQ);
if (insn == BIN(opt_setinlinecache) && type == TS_IC) {
assert(getinlinecache_idx >= 0);
// Store index to the matching opt_getinlinecache on the IC for YJIT
ic->get_insn_idx = (unsigned)getinlinecache_idx;
getinlinecache_idx = -1;
}
break;
}
case TS_CALLDATA:
@ -11107,6 +11120,7 @@ ibf_load_code(const struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t bytecod
unsigned int code_index;
ibf_offset_t reading_pos = bytecode_offset;
VALUE *code = ALLOC_N(VALUE, iseq_size);
long getinlinecache_idx = -1;
struct rb_iseq_constant_body *load_body = iseq->body;
struct rb_call_data *cd_entries = load_body->call_data;
@ -11114,13 +11128,22 @@ ibf_load_code(const struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t bytecod
for (code_index=0; code_index<iseq_size;) {
/* opcode */
const VALUE insn = code[code_index++] = ibf_load_small_value(load, &reading_pos);
const VALUE insn = code[code_index] = ibf_load_small_value(load, &reading_pos);
const char *types = insn_op_types(insn);
int op_index;
if (insn == BIN(opt_getinlinecache)) {
assert(getinlinecache_idx < 0 && "one get per set, no nesting");
getinlinecache_idx = code_index;
}
code_index++;
/* operands */
for (op_index=0; types[op_index]; op_index++, code_index++) {
switch (types[op_index]) {
char type = types[op_index];
switch (type) {
case TS_CDHASH:
case TS_VALUE:
{
VALUE op = ibf_load_small_value(load, &reading_pos);
@ -11168,6 +11191,13 @@ ibf_load_code(const struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t bytecod
{
VALUE op = ibf_load_small_value(load, &reading_pos);
code[code_index] = (VALUE)&is_entries[op];
if (insn == BIN(opt_setinlinecache) && type == TS_IC) {
assert(getinlinecache_idx >= 0);
// Store index to the matching opt_getinlinecache on the IC for YJIT
is_entries[op].ic_cache.get_insn_idx = (unsigned)getinlinecache_idx;
getinlinecache_idx = -1;
}
}
FL_SET(iseqv, ISEQ_MARKABLE_ISEQ);
break;

Просмотреть файл

@ -236,6 +236,9 @@ STATIC_ASSERT(sizeof_iseq_inline_constant_cache_entry,
// Inline cache for a constant lookup. Instructions with an IC operand
// (e.g. opt_getinlinecache / opt_setinlinecache) point at one of these.
struct iseq_inline_constant_cache {
// The currently cached entry; NULL while the cache has not been filled.
struct iseq_inline_constant_cache_entry *entry;
// For YJIT: the index of the opt_getinlinecache instruction in the same iseq
// that pairs with the opt_setinlinecache using this cache.
// It's set when the iseq is compiled (iseq_set_sequence) or loaded from
// binary (ibf_load_code) and is constant once set.
unsigned get_insn_idx;
};
struct iseq_inline_iv_cache_entry {

Просмотреть файл

@ -4743,6 +4743,9 @@ vm_ic_update(const rb_iseq_t *iseq, IC ic, VALUE val, const VALUE *reg_ep)
if (rb_ractor_shareable_p(val)) ice->flags |= IMEMO_CONST_CACHE_SHAREABLE;
ruby_vm_const_missing_count = 0;
RB_OBJ_WRITE(iseq, &ic->entry, ice);
#ifndef MJIT_HEADER
yjit_constant_ic_update(iseq, ic);
#endif
}
static VALUE

5
yjit.h
Просмотреть файл

@ -5,9 +5,7 @@
#ifndef YJIT_H
#define YJIT_H 1
#include "stddef.h"
#include "stdint.h"
#include "stdbool.h"
#include "vm_core.h"
#include "method.h"
#ifdef _WIN32
@ -61,5 +59,6 @@ void rb_yjit_iseq_mark(const struct rb_iseq_constant_body *body);
void rb_yjit_iseq_update_references(const struct rb_iseq_constant_body *body);
void rb_yjit_iseq_free(const struct rb_iseq_constant_body *body);
void rb_yjit_before_ractor_spawn(void);
void yjit_constant_ic_update(const rb_iseq_t *iseq, IC ic);
#endif // #ifndef YJIT_H

Просмотреть файл

@ -43,7 +43,7 @@ jit_print_loc(jitstate_t* jit, const char* msg)
static int
jit_get_opcode(jitstate_t* jit)
{
return opcode_at_pc(jit->iseq, jit->pc);
return yjit_opcode_at_pc(jit->iseq, jit->pc);
}
// Get the index of the next instruction
@ -147,7 +147,7 @@ yjit_gen_exit(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
// Write back the old instruction at the exit PC
// Otherwise the interpreter may jump right back to the
// JITted code we're trying to exit
int exit_opcode = opcode_at_pc(jit->iseq, exit_pc);
int exit_opcode = yjit_opcode_at_pc(jit->iseq, exit_pc);
void* handler_addr = (void*)handler_table[exit_opcode];
mov(cb, REG0, const_ptr_opnd(exit_pc));
mov(cb, REG1, const_ptr_opnd(handler_addr));
@ -255,9 +255,8 @@ yjit_entry_prologue(void)
return code_ptr;
}
/*
Generate code to check for interrupts and take a side-exit
*/
// Generate code to check for interrupts and take a side-exit
static void
yjit_check_ints(codeblock_t* cb, uint8_t* side_exit)
{
@ -269,17 +268,36 @@ yjit_check_ints(codeblock_t* cb, uint8_t* side_exit)
jnz_ptr(cb, side_exit);
}
/*
Compile a sequence of bytecode instructions for a given basic block version
*/
// Emit a stubbed, unconditional jump to the bytecode instruction that follows
// the one currently being compiled. Blocks that belong to a guard chain can
// use this so that they all share the same successor.
static void
jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)
{
    // Destination: the next instruction in the iseq being compiled.
    blockid_t successor = { jit->iseq, jit_next_insn_idx(jit) };

    // Jump with a copy of the context whose chain depth is cleared, since in
    // current usages we only ever jump to chain_depth > 0 from the same
    // instruction.
    ctx_t successor_ctx = *current_context;
    successor_ctx.chain_depth = 0;

    // Generate the jump instruction
    gen_direct_jump(&successor_ctx, successor);
}
// Compile a sequence of bytecode instructions for a given basic block version
void
yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
yjit_gen_block(ctx_t *ctx, block_t *block, rb_execution_context_t *ec)
{
RUBY_ASSERT(cb != NULL);
RUBY_ASSERT(block != NULL);
const rb_iseq_t *iseq = block->blockid.iseq;
uint32_t insn_idx = block->blockid.idx;
const uint32_t starting_insn_idx = insn_idx;
// NOTE: if we are ever deployed in production, we
// should probably just log an error and return NULL here,
@ -305,13 +323,21 @@ yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
// For each instruction to compile
for (;;) {
// Get the current pc and opcode
VALUE *pc = yjit_iseq_pc_at_idx(iseq, insn_idx);
int opcode = yjit_opcode_at_pc(iseq, pc);
RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
// opt_getinlinecache wants to be in a block all on its own. Cut the block short
// if we run into it. See gen_opt_getinlinecache for details.
if (opcode == BIN(opt_getinlinecache) && insn_idx > starting_insn_idx) {
jit_jump_to_next_insn(&jit, ctx);
break;
}
// Set the current instruction
jit.insn_idx = insn_idx;
jit.pc = iseq_pc_at_idx(iseq, insn_idx);
// Get the current opcode
int opcode = jit_get_opcode(&jit);
RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
jit.pc = pc;
// Lookup the codegen function for this instruction
codegen_fn gen_fn = gen_fns[opcode];
@ -322,8 +348,10 @@ yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
break;
}
//fprintf(stderr, "compiling %d: %s\n", insn_idx, insn_name(opcode));
//print_str(cb, insn_name(opcode));
if (0) {
fprintf(stderr, "compiling %d: %s\n", insn_idx, insn_name(opcode));
print_str(cb, insn_name(opcode));
}
// :count-placement:
// Count bytecode instructions that execute in generated code.
@ -366,9 +394,8 @@ yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
if (YJIT_DUMP_MODE >= 2) {
// Dump list of compiled instructions
fprintf(stderr, "Compiled the following for iseq=%p:\n", (void *)iseq);
for (uint32_t idx = block->blockid.idx; idx < insn_idx;)
{
int opcode = opcode_at_pc(iseq, iseq_pc_at_idx(iseq, idx));
for (uint32_t idx = block->blockid.idx; idx < insn_idx; ) {
int opcode = yjit_opcode_at_pc(iseq, yjit_iseq_pc_at_idx(iseq, idx));
fprintf(stderr, " %04d %s\n", idx, insn_name(opcode));
idx += insn_len(opcode);
}
@ -605,25 +632,6 @@ guard_self_is_heap(codeblock_t *cb, x86opnd_t self_opnd, uint8_t *side_exit, ctx
}
}
// Emit a stubbed, unconditional jump to the bytecode instruction that follows
// the one currently being compiled. Blocks that belong to a guard chain can
// use this so that they all share the same successor.
static void
jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)
{
    // Destination: the next instruction in the iseq being compiled.
    blockid_t successor = { jit->iseq, jit_next_insn_idx(jit) };

    // Jump with a copy of the context whose chain depth is cleared, since in
    // current usages we only ever jump to chain_depth > 0 from the same
    // instruction.
    ctx_t successor_ctx = *current_context;
    successor_ctx.chain_depth = 0;

    // Generate the jump instruction
    gen_direct_jump(&successor_ctx, successor);
}
static void
gen_jnz_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
{
@ -1918,6 +1926,7 @@ gen_leave(jitstate_t* jit, ctx_t* ctx)
}
RUBY_EXTERN rb_serial_t ruby_vm_global_constant_state;
static codegen_status_t
gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx)
{
@ -1927,16 +1936,11 @@ gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx)
// See vm_ic_hit_p().
struct iseq_inline_constant_cache_entry *ice = ic->entry;
if (!ice) {
// Cache not filled
return YJIT_CANT_COMPILE;
}
if (ice->ic_serial != ruby_vm_global_constant_state) {
// Cache miss at compile time.
return YJIT_CANT_COMPILE;
}
if (ice->ic_cref) {
// Only compile for caches that don't care about lexical scope.
if (!ice || // cache not filled
ice->ic_serial != ruby_vm_global_constant_state || // cache out of date
ice->ic_cref /* cache only valid for certain lexical scopes */) {
// In these cases, leave a block that unconditionally side exits
// for the interpreter to invalidate.
return YJIT_CANT_COMPILE;
}
@ -1946,7 +1950,7 @@ gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx)
// Invalidate output code on any and all constant writes
// FIXME: This leaks when st_insert raises NoMemoryError
if (!assume_stable_global_constant_state(jit->block)) return YJIT_CANT_COMPILE;
assume_stable_global_constant_state(jit->block);
x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
jit_mov_gc_ptr(jit, cb, REG0, ice->value);

Просмотреть файл

@ -289,8 +289,8 @@ int ctx_diff(const ctx_t* src, const ctx_t* dst)
}
// Get all blocks for a particular place in an iseq.
static rb_yjit_block_array_t
get_version_array(const rb_iseq_t *iseq, unsigned idx)
rb_yjit_block_array_t
yjit_get_version_array(const rb_iseq_t *iseq, unsigned idx)
{
struct rb_iseq_constant_body *body = iseq->body;
@ -305,7 +305,7 @@ get_version_array(const rb_iseq_t *iseq, unsigned idx)
// Count the number of block versions matching a given blockid
static size_t get_num_versions(blockid_t blockid)
{
return rb_darray_size(get_version_array(blockid.iseq, blockid.idx));
return rb_darray_size(yjit_get_version_array(blockid.iseq, blockid.idx));
}
// Keep track of a block version. Block should be fully constructed.
@ -364,7 +364,7 @@ add_block_version(blockid_t blockid, block_t* block)
// Retrieve a basic block version for an (iseq, idx) tuple
block_t* find_block_version(blockid_t blockid, const ctx_t* ctx)
{
rb_yjit_block_array_t versions = get_version_array(blockid.iseq, blockid.idx);
rb_yjit_block_array_t versions = yjit_get_version_array(blockid.iseq, blockid.idx);
// Best match found
block_t* best_version = NULL;
@ -522,7 +522,7 @@ branch_stub_hit(const uint32_t branch_idx, const uint32_t target_idx, rb_executi
// Update the PC in the current CFP, because it
// may be out of sync in JITted code
ec->cfp->pc = iseq_pc_at_idx(target.iseq, target.idx);
ec->cfp->pc = yjit_iseq_pc_at_idx(target.iseq, target.idx);
// Try to find an existing compiled version of this block
block_t* p_block = find_block_version(target, target_ctx);
@ -846,7 +846,8 @@ void
invalidate_block_version(block_t* block)
{
ASSERT_vm_locking();
rb_vm_barrier(); // Stop other ractors since we are going to patch machine code.
// TODO: want to assert that all other ractors are stopped here. Can't patch
// machine code that some other thread is running.
const rb_iseq_t *iseq = block->blockid.iseq;
@ -854,7 +855,7 @@ invalidate_block_version(block_t* block)
// fprintf(stderr, "block=%p\n", block);
// Remove this block from the version array
rb_yjit_block_array_t versions = get_version_array(iseq, block->blockid.idx);
rb_yjit_block_array_t versions = yjit_get_version_array(iseq, block->blockid.idx);
RB_UNUSED_VAR(bool removed);
removed = block_array_remove(versions, block);
RUBY_ASSERT(removed);
@ -909,8 +910,8 @@ invalidate_block_version(block_t* block)
uint32_t idx = block->blockid.idx;
// FIXME: the following says "if", but it's unconditional.
// If the block is an entry point, it needs to be unmapped from its iseq
VALUE* entry_pc = iseq_pc_at_idx(iseq, idx);
int entry_opcode = opcode_at_pc(iseq, entry_pc);
VALUE* entry_pc = yjit_iseq_pc_at_idx(iseq, idx);
int entry_opcode = yjit_opcode_at_pc(iseq, entry_pc);
// TODO: unmap_addr2insn in yjit_iface.c? Maybe we can write a function to encompass this logic?
// Should check how it's used in exit and side-exit

Просмотреть файл

@ -235,6 +235,7 @@ block_t* gen_block_version(blockid_t blockid, const ctx_t* ctx, rb_execution_con
uint8_t* gen_entry_point(const rb_iseq_t *iseq, uint32_t insn_idx, rb_execution_context_t *ec);
void yjit_free_block(block_t *block);
void yjit_branches_update_references(void);
rb_yjit_block_array_t yjit_get_version_array(const rb_iseq_t *iseq, unsigned idx);
void gen_branch(
const ctx_t* src_ctx,

Просмотреть файл

@ -65,7 +65,8 @@ cb_write_post_call_bytes(codeblock_t* cb)
}
// Get the PC for a given index in an iseq
VALUE *iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx)
VALUE *
yjit_iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx)
{
RUBY_ASSERT(iseq != NULL);
RUBY_ASSERT(insn_idx < iseq->body->iseq_size);
@ -91,7 +92,7 @@ map_addr2insn(void *code_ptr, int insn)
}
int
opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc)
yjit_opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc)
{
const VALUE at_pc = *pc;
if (FL_TEST_RAW((VALUE)iseq, ISEQ_TRANSLATED)) {
@ -269,11 +270,9 @@ static st_table *blocks_assuming_stable_global_constant_state;
// Assume that the global constant state has not changed since call to this function.
// Can raise NoMemoryError.
RBIMPL_ATTR_NODISCARD()
bool
void
assume_stable_global_constant_state(block_t *block) {
st_insert(blocks_assuming_stable_global_constant_state, (st_data_t)block, 1);
return true;
}
static int
@ -491,7 +490,7 @@ rb_yjit_compile_iseq(const rb_iseq_t *iseq, rb_execution_context_t *ec)
if (code_ptr)
{
// Map the code address to the corresponding opcode
int first_opcode = opcode_at_pc(iseq, &encoded[0]);
int first_opcode = yjit_opcode_at_pc(iseq, &encoded[0]);
map_addr2insn(code_ptr, first_opcode);
encoded[0] = (VALUE)code_ptr;
}
@ -601,6 +600,39 @@ rb_yjit_constant_state_changed(void)
}
}
// Callback from the opt_setinlinecache instruction in the interpreter.
//
// Invalidates every block version compiled at the opt_getinlinecache
// instruction that pairs with `ic`, so the next compilation of that
// instruction sees the updated cache.
//
// iseq: the iseq containing the instruction that updated `ic`.
// ic:   the inline constant cache that was just updated; ic->get_insn_idx
//       gives the index of the matching opt_getinlinecache in `iseq`.
//
// Takes the VM lock and raises a VM barrier for the duration of the call.
void
yjit_constant_ic_update(const rb_iseq_t *iseq, IC ic)
{
    RB_VM_LOCK_ENTER();
    rb_vm_barrier(); // Stop other ractors since we are going to patch machine code.
    {
        const struct rb_iseq_constant_body *const body = iseq->body;
        VALUE *code = body->iseq_encoded;
        const unsigned get_insn_idx = ic->get_insn_idx;

        // This should come from a running iseq, so direct threading translation
        // should have been done
        RUBY_ASSERT(FL_TEST((VALUE)iseq, ISEQ_TRANSLATED));
        RUBY_ASSERT(get_insn_idx < body->iseq_size);
        RUBY_ASSERT(rb_vm_insn_addr2insn((const void *)code[get_insn_idx]) == BIN(opt_getinlinecache));

        // Find the matching opt_getinlinecache and invalidate all the blocks there.
        // The IC is operand 1 of opt_getinlinecache, hence the `+ 1 + 1` below
        // (one slot for the opcode, one for operand 0).
        RUBY_ASSERT(insn_op_type(BIN(opt_getinlinecache), 1) == TS_IC);
        if (ic == (IC)code[get_insn_idx + 1 + 1]) {
            // invalidate_block_version() removes the block from this very
            // version array. Walking the array by increasing index while it
            // shrinks would skip versions, so instead re-fetch the array and
            // invalidate its first element until it is empty. The initial
            // count bounds the loop defensively.
            const size_t initial_version_count =
                rb_darray_size(yjit_get_version_array(iseq, get_insn_idx));

            for (size_t iteration = 0; iteration < initial_version_count; iteration++) {
                rb_yjit_block_array_t getinlinecache_blocks = yjit_get_version_array(iseq, get_insn_idx);

                if (rb_darray_size(getinlinecache_blocks) > 0) {
                    block_t *block = rb_darray_get(getinlinecache_blocks, 0);
                    invalidate_block_version(block);
                }
            }
        }
        else {
            RUBY_ASSERT(false && "ic->get_insn_diex not set properly");
        }
    }
    RB_VM_LOCK_LEAVE();
}
void
rb_yjit_before_ractor_spawn(void)
{

Просмотреть файл

@ -85,9 +85,9 @@ RUBY_EXTERN struct rb_yjit_runtime_counters yjit_runtime_counters;
void cb_write_pre_call_bytes(codeblock_t* cb);
void cb_write_post_call_bytes(codeblock_t* cb);
VALUE *iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx);
void map_addr2insn(void *code_ptr, int insn);
int opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc);
void yjit_map_addr2insn(void *code_ptr, int insn);
VALUE *yjit_iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx);
int yjit_opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc);
void check_cfunc_dispatch(VALUE receiver, struct rb_callinfo *ci, void *callee, rb_callable_method_entry_t *compile_time_cme);
bool cfunc_needs_frame(const rb_method_cfunc_t *cfunc);
@ -95,7 +95,7 @@ bool cfunc_needs_frame(const rb_method_cfunc_t *cfunc);
RBIMPL_ATTR_NODISCARD() bool assume_bop_not_redefined(block_t *block, int redefined_flag, enum ruby_basic_operators bop);
void assume_method_lookup_stable(VALUE receiver_klass, const rb_callable_method_entry_t *cme, block_t *block);
RBIMPL_ATTR_NODISCARD() bool assume_single_ractor_mode(block_t *block);
RBIMPL_ATTR_NODISCARD() bool assume_stable_global_constant_state(block_t *block);
void assume_stable_global_constant_state(block_t *block);
// this function *must* return passed exit_pc
const VALUE *rb_yjit_count_side_exit_op(const VALUE *exit_pc);