From a4f1a294daa7482b555ec10e11bf4ba8f8d9ac41 Mon Sep 17 00:00:00 2001
From: "pavlov%pavlov.net"
Date: Tue, 8 Apr 2008 07:19:41 +0000
Subject: [PATCH] bug 424040. add valgrind hooks to jemalloc. patch from Jason Evans r=me

---
 configure.in               |  10 +-
 memory/jemalloc/jemalloc.c | 257 +++++++++----------------------------
 2 files changed, 73 insertions(+), 194 deletions(-)

diff --git a/configure.in b/configure.in
index 42e94cfcc73..b27046b3e9b 100644
--- a/configure.in
+++ b/configure.in
@@ -6144,7 +6144,7 @@ dnl ========================================================
 dnl = Enable jemalloc
 dnl ========================================================
 MOZ_ARG_ENABLE_BOOL(jemalloc,
-[ --enable-jemalloc Replace memory allocator with jemalloc],
+[ --enable-jemalloc Replace memory allocator with jemalloc],
 MOZ_MEMORY=1,
 MOZ_MEMORY=)

@@ -6242,6 +6242,14 @@ if test "$MOZ_MEMORY"; then
 AC_MSG_ERROR([--enable-jemalloc not supported on ${target}])
 ;;
 esac
+
+ AC_ARG_WITH([valgrind],
+ [ --with-valgrind Enable valgrind integration hooks],
+ [enable_valgrind="yes"], [enable_valgrind="no"])
+ AC_CHECK_HEADER([valgrind/valgrind.h], [], [enable_valgrind="no"])
+ if test "x$enable_valgrind" = "xyes" ; then
+ AC_DEFINE(MOZ_VALGRIND)
+ fi
 fi
 AC_SUBST(MOZ_MEMORY)
 AC_SUBST(WIN32_CRT_SRC_DIR)
diff --git a/memory/jemalloc/jemalloc.c b/memory/jemalloc/jemalloc.c
index 98a1a7d8473..c2cb31a10e9 100644
--- a/memory/jemalloc/jemalloc.c
+++ b/memory/jemalloc/jemalloc.c
@@ -126,12 +126,16 @@
 # define MALLOC_SYSV
 #endif

-/*
- * MALLOC_LAZY_FREE enables the use of a per-thread vector of slots that free()
- * can atomically stuff object pointers into. This can reduce arena lock
- * contention.
- */
-/* #define MALLOC_LAZY_FREE */
+/* Embed no-op macros that support memory allocation tracking via valgrind. */
+#ifdef MOZ_VALGRIND
+# define MALLOC_VALGRIND
+#endif
+#ifdef MALLOC_VALGRIND
+# include <valgrind/valgrind.h>
+#else
+# define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
+# define VALGRIND_FREELIKE_BLOCK(addr, rzB)
+#endif

 /*
 * MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
@@ -409,10 +413,6 @@ static const bool __isthreaded = true;
 /* MALLOC_BALANCE requires TLS. */
 # ifdef MALLOC_BALANCE
 # undef MALLOC_BALANCE
-# endif
- /* MALLOC_LAZY_FREE requires TLS. */
-# ifdef MALLOC_LAZY_FREE
-# undef MALLOC_LAZY_FREE
 # endif
 #endif

@@ -473,19 +473,6 @@ static const bool __isthreaded = true;
 #define RUN_MAX_SMALL_2POW 15
 #define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)

-#ifdef MALLOC_LAZY_FREE
- /* Default size of each arena's lazy free cache. */
-# define LAZY_FREE_2POW_DEFAULT 8
- /*
- * Number of pseudo-random probes to conduct before considering the cache to
- * be overly full. It takes on average n probes to detect fullness of
- * (n-1)/n. However, we are effectively doing multiple non-independent
- * trials (each deallocation is a trial), so the actual average threshold
- * for clearing the cache is somewhat lower.
- */
-# define LAZY_FREE_NPROBES 5
-#endif
-
 /*
 * Hyper-threaded CPUs may need a special instruction inside spin loops in
 * order to yield to another virtual CPU. If no such instruction is defined
@@ -863,16 +850,6 @@ struct arena_s {
 uint32_t contention;
 #endif

-#ifdef MALLOC_LAZY_FREE
- /*
- * Deallocation of small objects can be lazy, in which case free_cache
- * stores pointers to those objects that have not yet been deallocated.
- * In order to avoid lock contention, slots are chosen randomly. Empty
- * slots contain NULL.
- */ - void **free_cache; -#endif - /* * bins is used to store rings of free regions of the following sizes, * assuming a 16-byte quantum, 4kB pagesize, and default MALLOC_OPTIONS. @@ -1063,9 +1040,6 @@ static bool opt_dss = true; static bool opt_mmap = true; #endif static size_t opt_dirty_max = DIRTY_MAX_DEFAULT; -#ifdef MALLOC_LAZY_FREE -static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT; -#endif #ifdef MALLOC_BALANCE static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT; #endif @@ -1178,10 +1152,6 @@ static void *arena_malloc_large(arena_t *arena, size_t size, bool zero); static void *arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size); static size_t arena_salloc(const void *ptr); -#ifdef MALLOC_LAZY_FREE -static void arena_dalloc_lazy_hard(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t pageind, arena_chunk_map_t *mapelm); -#endif static void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr); static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, @@ -1472,7 +1442,7 @@ pow2_ceil(size_t x) return (x); } -#if (defined(MALLOC_LAZY_FREE) || defined(MALLOC_BALANCE)) +#ifdef MALLOC_BALANCE /* * Use a simple linear congruential pseudo-random number generator: * @@ -1521,12 +1491,6 @@ prn_##suffix(uint32_t lg_range) \ * problems. */ -#ifdef MALLOC_LAZY_FREE -/* Define the per-thread PRNG used for lazy deallocation. */ -static __thread uint32_t lazy_free_x; -PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347) -#endif - #ifdef MALLOC_BALANCE /* Define the PRNG used for arena assignment. */ static __thread uint32_t balance_x; @@ -1785,6 +1749,7 @@ base_alloc(size_t size) } #endif malloc_mutex_unlock(&base_mtx); + VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, false); return (ret); } @@ -1795,6 +1760,12 @@ base_calloc(size_t number, size_t size) void *ret; ret = base_alloc(number * size); +#ifdef MALLOC_VALGRIND + if (ret != NULL) { + VALGRIND_FREELIKE_BLOCK(ret, 0); + VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, true); + } +#endif memset(ret, 0, number * size); return (ret); @@ -1809,6 +1780,8 @@ base_node_alloc(void) if (base_nodes != NULL) { ret = base_nodes; base_nodes = *(extent_node_t **)ret; + VALGRIND_FREELIKE_BLOCK(ret, 0); + VALGRIND_MALLOCLIKE_BLOCK(ret, sizeof(extent_node_t), 0, false); malloc_mutex_unlock(&base_mtx); } else { malloc_mutex_unlock(&base_mtx); @@ -1823,6 +1796,8 @@ base_node_dealloc(extent_node_t *node) { malloc_mutex_lock(&base_mtx); + VALGRIND_FREELIKE_BLOCK(node, 0); + VALGRIND_MALLOCLIKE_BLOCK(node, sizeof(extent_node_t *), 0, false); *(extent_node_t **)node = base_nodes; base_nodes = node; malloc_mutex_unlock(&base_mtx); @@ -2628,20 +2603,6 @@ choose_arena_hard(void) assert(__isthreaded); -#ifdef MALLOC_LAZY_FREE - /* - * Seed the PRNG used for lazy deallocation. Since seeding only occurs - * on the first allocation by a thread, it is possible for a thread to - * deallocate before seeding. This is not a critical issue though, - * since it is extremely unusual for an application to to use threads - * that deallocate but *never* allocate, and because even if seeding - * never occurs for multiple threads, they will tend to drift apart - * unless some aspect of the application forces deallocation - * synchronization. - */ - SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self())); -#endif - #ifdef MALLOC_BALANCE /* * Seed the PRNG used for arena load balancing. 
We can get away with @@ -3017,6 +2978,8 @@ arena_chunk_alloc(arena_t *arena) chunk = (arena_chunk_t *)chunk_alloc(chunksize, true); if (chunk == NULL) return (NULL); + VALGRIND_MALLOCLIKE_BLOCK(chunk, (arena_chunk_header_npages << + pagesize_2pow), 0, false); #ifdef MALLOC_STATS arena->stats.mapped += chunksize; #endif @@ -3089,6 +3052,7 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) RB_REMOVE(arena_chunk_tree_s, &chunk->arena->chunks, arena->spare); arena->ndirty -= arena->spare->ndirty; + VALGRIND_FREELIKE_BLOCK(arena->spare, 0); chunk_dealloc((void *)arena->spare, chunksize); #ifdef MALLOC_STATS arena->stats.mapped -= chunksize; @@ -3402,6 +3366,9 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) if (run == NULL) return (NULL); + VALGRIND_MALLOCLIKE_BLOCK(run, sizeof(arena_run_t) + (sizeof(unsigned) * + bin->regs_mask_nelms - 1), 0, false); + /* Initialize run internals. */ run->bin = bin; @@ -3656,6 +3623,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero) #endif malloc_spin_unlock(&arena->lock); + VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, zero); if (zero == false) { #ifdef MALLOC_FILL if (opt_junk) @@ -3692,6 +3660,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) #endif malloc_spin_unlock(&arena->lock); + VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, zero); if (zero == false) { #ifdef MALLOC_FILL if (opt_junk) @@ -3813,6 +3782,7 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size) #endif malloc_spin_unlock(&arena->lock); + VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, false); #ifdef MALLOC_FILL if (opt_junk) memset(ret, 0xa5, size); @@ -4031,6 +4001,7 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, #ifdef MALLOC_DEBUG run->magic = 0; #endif + VALGRIND_FREELIKE_BLOCK(run, 0); arena_run_dalloc(arena, run, true); #ifdef MALLOC_STATS bin->stats.curruns--; @@ -4059,90 +4030,6 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, #endif } -#ifdef MALLOC_LAZY_FREE -static inline void -arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t pageind, arena_chunk_map_t *mapelm) -{ - void **free_cache = arena->free_cache; - unsigned i, slot; - - if (__isthreaded == false || opt_lazy_free_2pow < 0) { - malloc_spin_lock(&arena->lock); - arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm); - malloc_spin_unlock(&arena->lock); - return; - } - - for (i = 0; i < LAZY_FREE_NPROBES; i++) { - slot = PRN(lazy_free, opt_lazy_free_2pow); - if (atomic_cmpset_ptr((uintptr_t *)&free_cache[slot], - (uintptr_t)NULL, (uintptr_t)ptr)) { - return; - } - } - - arena_dalloc_lazy_hard(arena, chunk, ptr, pageind, mapelm); -} - -static void -arena_dalloc_lazy_hard(arena_t *arena, arena_chunk_t *chunk, void *ptr, - size_t pageind, arena_chunk_map_t *mapelm) -{ - void **free_cache = arena->free_cache; - unsigned i, slot; - - malloc_spin_lock(&arena->lock); - arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm); - - /* - * Check whether another thread already cleared the cache. It is - * possible that another thread cleared the cache *and* this slot was - * already refilled, which could result in a mostly fruitless cache - * sweep, but such a sequence of events causes no correctness issues. - */ - if ((ptr = (void *)atomic_readandclear_ptr( - (uintptr_t *)&free_cache[slot])) - != NULL) { - unsigned lazy_free_mask; - - /* - * Clear the cache, since we failed to find a slot. 
It is - * possible that other threads will continue to insert objects - * into the cache while this one sweeps, but that is okay, - * since on average the cache is still swept with the same - * frequency. - */ - - /* Handle pointer at current slot. */ - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> - pagesize_2pow); - mapelm = &chunk->map[pageind]; - arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm); - - /* Sweep remainder of slots. */ - lazy_free_mask = (1U << opt_lazy_free_2pow) - 1; - for (i = (slot + 1) & lazy_free_mask; - i != slot; - i = (i + 1) & lazy_free_mask) { - ptr = (void *)atomic_readandclear_ptr( - (uintptr_t *)&free_cache[i]); - if (ptr != NULL) { - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) - >> pagesize_2pow); - mapelm = &chunk->map[pageind]; - arena_dalloc_small(arena, chunk, ptr, pageind, - *mapelm); - } - } - } - - malloc_spin_unlock(&arena->lock); -} -#endif - static void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr) { @@ -4197,17 +4084,14 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) mapelm = &chunk->map[pageind]; if ((*mapelm & CHUNK_MAP_LARGE) == 0) { /* Small allocation. */ -#ifdef MALLOC_LAZY_FREE - arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm); -#else malloc_spin_lock(&arena->lock); arena_dalloc_small(arena, chunk, ptr, pageind, *mapelm); malloc_spin_unlock(&arena->lock); -#endif } else { assert((*mapelm & CHUNK_MAP_POS_MASK) == 0); arena_dalloc_large(arena, chunk, ptr); } + VALGRIND_FREELIKE_BLOCK(ptr, 0); } static inline void @@ -4424,10 +4308,28 @@ iralloc(void *ptr, size_t size) oldsize = isalloc(ptr); +#ifndef MALLOC_VALGRIND if (size <= arena_maxclass) return (arena_ralloc(ptr, size, oldsize)); else return (huge_ralloc(ptr, size, oldsize)); +#else + /* + * Valgrind does not provide a public interface for modifying an + * existing allocation, so use malloc/memcpy/free instead. + */ + { + void *ret = imalloc(size); + if (ret != NULL) { + if (oldsize < size) + memcpy(ret, ptr, oldsize); + else + memcpy(ret, ptr, size); + idalloc(ptr); + } + return (ret); + } +#endif } static bool @@ -4457,15 +4359,6 @@ arena_new(arena_t *arena) #ifdef MALLOC_BALANCE arena->contention = 0; #endif -#ifdef MALLOC_LAZY_FREE - if (opt_lazy_free_2pow >= 0) { - arena->free_cache = (void **) base_calloc(1, sizeof(void *) - * (1U << opt_lazy_free_2pow)); - if (arena->free_cache == NULL) - return (true); - } else - arena->free_cache = NULL; -#endif /* Initialize bins. 
*/ prev_run_size = pagesize; @@ -4615,6 +4508,12 @@ huge_malloc(size_t size, bool zero) pages_decommit((void *)((uintptr_t)ret + psize), csize - psize); #endif +#ifdef MALLOC_DECOMMIT + VALGRIND_MALLOCLIKE_BLOCK(ret, psize, 0, zero); +#else + VALGRIND_MALLOCLIKE_BLOCK(ret, csize, 0, zero); +#endif + #ifdef MALLOC_FILL if (zero == false) { if (opt_junk) @@ -4754,6 +4653,12 @@ huge_palloc(size_t alignment, size_t size) } #endif +#ifdef MALLOC_DECOMMIT + VALGRIND_MALLOCLIKE_BLOCK(ret, psize, 0, false); +#else + VALGRIND_MALLOCLIKE_BLOCK(ret, chunk_size, 0, false); +#endif + #ifdef MALLOC_FILL if (opt_junk) # ifdef MALLOC_DECOMMIT @@ -4890,6 +4795,7 @@ huge_dalloc(void *ptr) #else chunk_dealloc(node->addr, node->size); #endif + VALGRIND_FREELIKE_BLOCK(node->addr, 0); base_node_dealloc(node); } @@ -5085,13 +4991,6 @@ malloc_print_stats(void) _malloc_message("CPUs: ", umax2s(ncpus, s), "\n", ""); _malloc_message("Max arenas: ", umax2s(narenas, s), "\n", ""); -#ifdef MALLOC_LAZY_FREE - if (opt_lazy_free_2pow >= 0) { - _malloc_message("Lazy free slots: ", - umax2s(1U << opt_lazy_free_2pow, s), "\n", ""); - } else - _malloc_message("Lazy free slots: 0\n", "", "", ""); -#endif #ifdef MALLOC_BALANCE _malloc_message("Arena balance threshold: ", umax2s(opt_balance_threshold, s), "\n", ""); @@ -5281,11 +5180,6 @@ malloc_init_hard(void) pagesize_mask = result - 1; pagesize_2pow = ffs((int)result) - 1; -#ifdef MALLOC_LAZY_FREE - if (ncpus == 1) - opt_lazy_free_2pow = -1; -#endif - for (i = 0; i < 3; i++) { unsigned j; @@ -5427,18 +5321,6 @@ MALLOC_OUT: (sizeof(size_t) << 3)) opt_chunk_2pow++; break; - case 'l': -#ifdef MALLOC_LAZY_FREE - if (opt_lazy_free_2pow >= 0) - opt_lazy_free_2pow--; -#endif - break; - case 'L': -#ifdef MALLOC_LAZY_FREE - if (ncpus > 1) - opt_lazy_free_2pow++; -#endif - break; case 'm': #ifdef MALLOC_DSS opt_mmap = false; @@ -5585,14 +5467,6 @@ MALLOC_OUT: } arena_maxclass = chunksize - (arena_chunk_header_npages << pagesize_2pow); -#ifdef MALLOC_LAZY_FREE - /* - * Make sure that allocating the free_cache does not exceed the limits - * of what base_alloc() can handle. - */ - while ((sizeof(void *) << opt_lazy_free_2pow) > chunksize) - opt_lazy_free_2pow--; -#endif UTRACE(0, 0, 0); @@ -5747,9 +5621,6 @@ MALLOC_OUT: * Seed here for the initial thread, since choose_arena_hard() is only * called for other threads. The seed values don't really matter. */ -#ifdef MALLOC_LAZY_FREE - SPRN(lazy_free, 42); -#endif #ifdef MALLOC_BALANCE SPRN(balance, 42); #endif
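
Note (editorial, not part of the patch): the hooks added above are valgrind "client requests" from <valgrind/valgrind.h>. VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) tells tools such as memcheck to treat a region handed out by a custom allocator as if it came from malloc(), and VALGRIND_FREELIKE_BLOCK(addr, rzB) undoes that; the patch pairs them around jemalloc's base, run, arena and huge allocations, and defines them away as no-ops when valgrind support is not compiled in (MALLOC_VALGRIND undefined). The sketch below shows the same pattern on a toy bump allocator; the pool, pool_alloc() and pool_free() names are illustrative only and do not appear in jemalloc.

/*
 * Minimal, self-contained sketch of the pattern used in jemalloc.c above.
 * Build without valgrind headers: cc -o pool_demo pool_demo.c
 * Build with them:                cc -DMOZ_VALGRIND -o pool_demo pool_demo.c
 */
#include <stddef.h>
#include <string.h>

#ifdef MOZ_VALGRIND
#  include <valgrind/valgrind.h>
#else
/* No-op fallbacks, as in the patch, so the hooks vanish when valgrind
 * integration is not compiled in. */
#  define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
#  define VALGRIND_FREELIKE_BLOCK(addr, rzB)
#endif

#define POOL_SIZE (1 << 20)

static char pool[POOL_SIZE];
static size_t pool_offset;

/*
 * Carve a block out of the pool and, when running under valgrind, register
 * it as a heap-like allocation so that leaks and invalid accesses are
 * reported against this call site (no redzone, contents undefined).
 */
static void *
pool_alloc(size_t size)
{
	void *ret;

	if (size == 0 || size > POOL_SIZE - pool_offset)
		return (NULL);
	ret = &pool[pool_offset];
	pool_offset += size;
	VALGRIND_MALLOCLIKE_BLOCK(ret, size, 0, 0);
	return (ret);
}

/*
 * Tell valgrind the block is gone; this toy bump allocator never reuses
 * memory, which keeps the sketch short.
 */
static void
pool_free(void *ptr)
{

	VALGRIND_FREELIKE_BLOCK(ptr, 0);
}

int
main(void)
{
	char *p = pool_alloc(64);

	if (p != NULL) {
		memset(p, 0xa5, 64);
		pool_free(p);
	}
	return (0);
}

Running such a program under valgrind (e.g. "valgrind ./pool_demo") attributes leaks and invalid accesses inside pool blocks to the pool_alloc() call sites, which is the same effect the patch aims for with jemalloc's internal allocations.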