rb_shape_transition_shape_capa: use optimal sizes transitions

Previously the growth was 3(embed), 6, 12, 24, ...

With this change it's now 3(embed), 8, 16, 32, 64, ... by default.

However, since a power of two isn't the best size for every allocator,
if `malloc_usable_size` is available, we use it to discover the best
offset.

On Linux/glibc 2.35 for instance, the growth will be 3(embed), 7, 15, 31
to avoid wasting 8B per object.

Test program:

```c

/*
 * Allocates room for `slots` values of VALUE_SIZE bytes, asks the allocator
 * how many bytes are actually usable, and reports the difference.
 *
 * Prints one "slots = ..., wasted_bytes = ..." line to stderr and returns the
 * number of usable bytes beyond what was requested.
 */
size_t test(size_t slots) {
    size_t allocated = slots * VALUE_SIZE;
    void *test_ptr = malloc(allocated);
    size_t wasted = malloc_usable_size(test_ptr) - allocated;
    free(test_ptr);
    // size_t must be printed with %zu; %lu only matches where size_t happens
    // to be unsigned long.
    fprintf(stderr, "slots = %zu, wasted_bytes = %zu\n", slots, wasted);
    return wasted;
}

/*
 * Driver for the allocator experiment.
 *
 * First probes 8, 7 and 6 slots and keeps the first padding for which test()
 * reports no wasted bytes (0 if none does). Then prints the waste for ten
 * doubling sizes starting at 4 slots, once unpadded ("naive") and once with
 * the discovered padding subtracted ("auto-padded").
 */
int main(int argc, char *argv[]) {
    size_t best_padding = 0;
    for (size_t padding = 0; padding <= 2; padding++) {
        if (test(8 - padding) == 0) {
            best_padding = padding;
            break;
        }
    }

    fprintf(stderr, "=============== naive ================\n");

    size_t list_size = 4;
    for (size_t index = 0; index < 10; index++) {
        test(list_size);
        list_size *= 2;
    }

    // best_padding is a size_t, so it is printed with %zu rather than %lu.
    fprintf(stderr, "=============== auto-padded (-%zu) ================\n", best_padding);

    list_size = 4;
    for (size_t index = 0; index < 10; index++) {
        test(list_size - best_padding);
        list_size *= 2;
    }

    fprintf(stderr, "\n\n");
    return 0;
}
```

```
===== glibc ======
slots = 8, wasted_bytes = 8
slots = 7, wasted_bytes = 0
=============== naive ================
slots = 4, wasted_bytes = 8
slots = 8, wasted_bytes = 8
slots = 16, wasted_bytes = 8
slots = 32, wasted_bytes = 8
slots = 64, wasted_bytes = 8
slots = 128, wasted_bytes = 8
slots = 256, wasted_bytes = 8
slots = 512, wasted_bytes = 8
slots = 1024, wasted_bytes = 8
slots = 2048, wasted_bytes = 8
=============== auto-padded (-1) ================
slots = 3, wasted_bytes = 0
slots = 7, wasted_bytes = 0
slots = 15, wasted_bytes = 0
slots = 31, wasted_bytes = 0
slots = 63, wasted_bytes = 0
slots = 127, wasted_bytes = 0
slots = 255, wasted_bytes = 0
slots = 511, wasted_bytes = 0
slots = 1023, wasted_bytes = 0
slots = 2047, wasted_bytes = 0
```

```
==========  jemalloc =======
slots = 8, wasted_bytes = 0
=============== naive ================
slots = 4, wasted_bytes = 0
slots = 8, wasted_bytes = 0
slots = 16, wasted_bytes = 0
slots = 32, wasted_bytes = 0
slots = 64, wasted_bytes = 0
slots = 128, wasted_bytes = 0
slots = 256, wasted_bytes = 0
slots = 512, wasted_bytes = 0
slots = 1024, wasted_bytes = 0
slots = 2048, wasted_bytes = 0
=============== auto-padded (-0) ================
slots = 4, wasted_bytes = 0
slots = 8, wasted_bytes = 0
slots = 16, wasted_bytes = 0
slots = 32, wasted_bytes = 0
slots = 64, wasted_bytes = 0
slots = 128, wasted_bytes = 0
slots = 256, wasted_bytes = 0
slots = 512, wasted_bytes = 0
slots = 1024, wasted_bytes = 0
slots = 2048, wasted_bytes = 0
```
This commit is contained in:
Jean Boussier 2023-10-10 15:32:12 +02:00 коммит произвёл Jean Boussier
Родитель e7d845b1d0
Коммит e5364ea496
3 изменённых файлов: 71 добавлений и 4 удалений

64
gc.c
Просмотреть файл

@ -157,6 +157,68 @@
#define MAP_ANONYMOUS MAP_ANON
#endif
static size_t malloc_offset = 0;
#if defined(HAVE_MALLOC_USABLE_SIZE)
static size_t
gc_compute_malloc_offset(void)
{
    // Allocators keep their metadata in different places, so the request size
    // that fills a chunk exactly is allocator-specific.
    // For instance malloc(64) wastes 8B with glibc but 0B with jemalloc, while
    // malloc(56) wastes 0B with glibc but 8B with jemalloc.
    // We therefore probe 64, 56 and 48 bytes and report the first padding for
    // which the usable size equals the requested size.
    // This was tested on Linux with glibc 2.35 and jemalloc 5; for both it
    // results in no wasted memory.
    const size_t base_size = 64;
    const size_t step = 8;
    const size_t max_padding = 16;

    size_t padding = 0;
    while (padding <= max_padding) {
        size_t requested = base_size - padding;
        void *probe = malloc(requested);
        size_t usable = malloc_usable_size(probe);
        free(probe);

        if (usable == requested) {
            return padding;
        }
        padding += step;
    }

    // No probed size was a perfect fit: fall back to no padding.
    return 0;
}
#else
static size_t
gc_compute_malloc_offset(void)
{
    // malloc_usable_size is not available, so there is nothing to probe:
    // report no padding, which keeps sizes on plain powers of 2.
    const size_t no_padding = 0;
    return no_padding;
}
#endif
/*
 * Computes the next capacity, counted in elements of `type_size` bytes, for a
 * buffer that currently holds `current` elements.
 *
 * The current size in bytes (using at least 4 elements) is doubled, rounded up
 * to a power of two, reduced by `malloc_offset`, and converted back to an
 * element count.
 *
 * Calls rb_bug() if the result is not strictly greater than `current`.
 */
size_t
rb_malloc_grow_capa(size_t current, size_t type_size)
{
    size_t current_capacity = current;
    if (current_capacity < 4) {
        current_capacity = 4;
    }
    current_capacity *= type_size;

    // We double the current capacity.
    size_t new_capacity = (current_capacity * 2);

    // And round up to the next power of 2 if it's not already one.
    // The shifted constant has to be a size_t: a bare `1` is an int, and
    // shifting an int by 31 bits or more is undefined behavior.
    if (rb_popcount64(new_capacity) != 1) {
        new_capacity = (size_t)1 << (64 - nlz_int64(new_capacity));
    }

    new_capacity -= malloc_offset;
    new_capacity /= type_size;

    // The capacity must strictly grow; a result equal to `current` is just as
    // wrong as a smaller one, so both are reported.
    if (current >= new_capacity) {
        rb_bug("rb_malloc_grow_capa: current_capacity=%zu, new_capacity=%zu, malloc_offset=%zu", current, new_capacity, malloc_offset);
    }
    RUBY_ASSERT(new_capacity > current);
    return new_capacity;
}
static inline struct rbimpl_size_mul_overflow_tag
size_add_overflow(size_t x, size_t y)
{
@ -13979,6 +14041,8 @@ void
Init_GC(void)
{
#undef rb_intern
malloc_offset = gc_compute_malloc_offset();
VALUE rb_mObjSpace;
VALUE rb_mProfiler;
VALUE gc_constants;

Просмотреть файл

@ -227,6 +227,7 @@ __attribute__((__alloc_align__(1)))
RUBY_ATTR_MALLOC void *rb_aligned_malloc(size_t, size_t) RUBY_ATTR_ALLOC_SIZE((2));
size_t rb_size_mul_or_raise(size_t, size_t, VALUE); /* used in compile.c */
size_t rb_size_mul_add_or_raise(size_t, size_t, size_t, VALUE); /* used in iseq.h */
size_t rb_malloc_grow_capa(size_t current_capacity, size_t type_size); /* used in shape.c */
RUBY_ATTR_MALLOC void *rb_xmalloc_mul_add(size_t, size_t, size_t);
RUBY_ATTR_MALLOC void *rb_xcalloc_mul_add(size_t, size_t, size_t);
void *rb_xrealloc_mul_add(const void *, size_t, size_t, size_t);

10
shape.c
Просмотреть файл

@ -418,19 +418,21 @@ rb_shape_get_next(rb_shape_t* shape, VALUE obj, ID id)
}
/*
 * Obtains a SHAPE_CAPACITY_CHANGE shape from get_next_shape_internal() for
 * `shape`, using a temporary ID built from `new_capacity` as the edge name,
 * stores `new_capacity` as the new shape's capacity and returns that shape.
 *
 * NOTE(review): this is a diff view without +/- markers, so the signature and
 * the capacity assignment each appear twice. The uint32_t signature and the
 * uncast assignment appear to be the removed lines; the size_t signature, the
 * MAX_IVARS assertion and the (uint32_t) cast appear to be the added ones.
 */
static inline rb_shape_t *
rb_shape_transition_shape_capa_create(rb_shape_t* shape, uint32_t new_capacity)
rb_shape_transition_shape_capa_create(rb_shape_t* shape, size_t new_capacity)
{
RUBY_ASSERT(new_capacity < (size_t)MAX_IVARS);
ID edge_name = rb_make_temporary_id(new_capacity);
bool dont_care;
rb_shape_t * new_shape = get_next_shape_internal(shape, edge_name, SHAPE_CAPACITY_CHANGE, &dont_care, true, false);
new_shape->capacity = new_capacity;
new_shape->capacity = (uint32_t)new_capacity;
return new_shape;
}
/*
 * Returns a capacity-change shape derived from `shape` with a larger capacity.
 *
 * NOTE(review): this is a diff view without +/- markers, so two return
 * statements are shown. The first (plain doubling of shape->capacity) appears
 * to be the removed line; the second, which asks rb_malloc_grow_capa() for the
 * next capacity in units of sizeof(VALUE), appears to be its replacement.
 */
rb_shape_t *
rb_shape_transition_shape_capa(rb_shape_t* shape)
{
return rb_shape_transition_shape_capa_create(shape, shape->capacity * 2);
return rb_shape_transition_shape_capa_create(shape, rb_malloc_grow_capa(shape->capacity, sizeof(VALUE)));
}
bool
@ -833,7 +835,7 @@ Init_default_shapes(void)
// Shapes by size pool
for (int i = 1; i < SIZE_POOL_COUNT; i++) {
uint32_t capa = (uint32_t)((rb_size_pool_slot_size(i) - offsetof(struct RObject, as.ary)) / sizeof(VALUE));
size_t capa = ((rb_size_pool_slot_size(i) - offsetof(struct RObject, as.ary)) / sizeof(VALUE));
rb_shape_t * new_shape = rb_shape_transition_shape_capa_create(root, capa);
new_shape->type = SHAPE_INITIAL_CAPACITY;
new_shape->size_pool_index = i;