diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c index 72e17f7ebd6f..786e36e2f61d 100644 --- a/arch/c6x/kernel/setup.c +++ b/arch/c6x/kernel/setup.c @@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr) */ set_ist(_vectors_start); - lockdep_init(); - /* * dtb is passed in from bootloader. * fdt is linked in blob. diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 89a2a9394927..f31ebb5dc26c 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram, memset(__bss_start, 0, __bss_stop-__bss_start); memset(_ssbss, 0, _esbss-_ssbss); - lockdep_init(); - /* initialize device tree for usage in early_printk */ early_init_devtree(_fdt_start); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index ad8c9db61237..d544fa311757 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { - lockdep_init(); - /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 5c03a6a9b054..f98be8383a39 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr) setup_paca(&boot_paca); fixup_boot_paca(); - /* Initialize lockdep early or else spinlocks will blow */ - lockdep_init(); - /* -------- printk is now safe to use ------- */ /* Enable early debugging if any specified (see udbg.h) */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c55576bbaa1f..a0684de5a93b 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -448,7 +448,6 @@ void __init startup_init(void) rescue_initrd(); clear_bss_section(); init_kernel_storage_key(); - lockdep_init(); lockdep_off(); setup_lowcore_early(); setup_facility_list(); diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index f2d30cab5b3f..cd1f592cd347 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -696,14 +696,6 @@ tlb_fixup_done: call __bzero sub %o1, %o0, %o1 -#ifdef CONFIG_LOCKDEP - /* We have this call this super early, as even prom_init can grab - * spinlocks and thus call into the lockdep code. - */ - call lockdep_init - nop -#endif - call prom_init mov %l7, %o0 ! OpenPROM cif handler diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a584e1c50918..bfb28caf97b1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -6,18 +6,17 @@ /* * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking + * And yes, this might be required on UP too when we're talking * to devices. */ #ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") #define rmb() asm volatile("lfence":::"memory") diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..9decee2bfdbe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -418,9 +418,9 @@ static void mwait_idle(void) if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - smp_mb(); /* quirk */ + mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); - smp_mb(); /* quirk */ + mb(); /* quirk */ } __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229ac3f4f..f56cc418c87d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1520,12 +1520,6 @@ __init void lguest_init(void) */ reserve_top_address(lguest_data.reserve_mem); - /* - * If we don't initialize the lock dependency checker now, it crashes - * atomic_notifier_chain_register, then paravirt_disable_iospace. - */ - lockdep_init(); - /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 39e1cb201b8e..35a52a880b2f 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -119,11 +119,6 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) } #endif -/* - * Initializier - */ -#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } - /* * Remapping spinlock architecture specific functions to the corresponding * queued spinlock functions. diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 85f888e86761..034acd0c4956 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -32,6 +32,11 @@ typedef struct qspinlock { atomic_t val; } arch_spinlock_t; +/* + * Initializier + */ +#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } + /* * Bitfields in the atomic value: * diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 48f5aab117ae..a27f4f17c382 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -263,8 +263,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * In contrast to ACCESS_ONCE these two macros will also work on aggregate * data types like structs or unions. If the size of the accessed data * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a - * compile-time warning. + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at + * least two memcpy()s: one for the __builtin_memcpy() and then one for + * the macro doing the copy of variable - '__u' allocated on the stack. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4dca42fd32f5..d026b190c530 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -261,7 +261,6 @@ struct held_lock { /* * Initialization, self-test and debugging-output methods: */ -extern void lockdep_init(void); extern void lockdep_info(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); @@ -392,7 +391,6 @@ static inline void lockdep_on(void) # define lockdep_set_current_reclaim_state(g) do { } while (0) # define lockdep_clear_current_reclaim_state() do { } while (0) # define lockdep_trace_alloc(g) do { } while (0) -# define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) diff --git a/init/main.c b/init/main.c index 58c9e374704b..b3008bcfb1dc 100644 --- a/init/main.c +++ b/init/main.c @@ -499,11 +499,6 @@ asmlinkage __visible void __init start_kernel(void) char *command_line; char *after_dashes; - /* - * Need to run as early as possible, to initialize the - * lockdep hash: - */ - lockdep_init(); set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); diff --git a/kernel/futex.c b/kernel/futex.c index 5d6ce6413ef1..a5d2e74c89e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -124,16 +124,16 @@ * futex_wait(futex, val); * * waiters++; (a) - * mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `-------> mb(); (B) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); @@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key) /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon - * as full barrier (B), see the ordering comment above. + * as smp_mb(); (B), see the ordering comment above. */ smp_mb__after_atomic(); } @@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + ihold(key->shared.inode); /* implies smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies MB (B) */ + futex_get_mm(key); /* implies smp_mb(); (B) */ break; default: /* @@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key) * mm, therefore the only purpose of calling get_futex_key_refs * is because we need the barrier for the lockless waiter check. */ - smp_mb(); /* explicit MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ } } @@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -520,7 +520,20 @@ again: else err = 0; - lock_page(page); + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + */ + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + /* * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or @@ -536,19 +549,31 @@ again: * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for page->mapping. */ - mapping = compound_head(page)->mapping; - if (!mapping) { - int shmem_swizzled = PageSwapCache(page); + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; unlock_page(page); put_page(page); + if (shmem_swizzled) goto again; + return -EFAULT; } /* * Private mappings are handled in a simple way. * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. @@ -566,16 +591,74 @@ again: key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; + + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* + * Take a reference unless it is about to be freed. Previously + * this reference was taken by ihold under the page lock + * pinning the inode in place so i_lock was unnecessary. The + * only way for this check to fail is if the inode was + * truncated in parallel so warn for now if this happens. + * + * We are not calling into get_futex_key_refs() in file-backed + * cases, therefore a successful atomic_inc return below will + * guarantee that get_futex_key() will still imply smp_mb(); (B). + */ + if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* Should be impossible but lets be paranoid for now */ + if (WARN_ON_ONCE(inode->i_mapping != mapping)) { + err = -EFAULT; + rcu_read_unlock(); + iput(inode); + + goto out; + } + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = mapping->host; + key->shared.inode = inode; key->shared.pgoff = basepage_index(page); + rcu_read_unlock(); } - get_futex_key_refs(key); /* implies MB (B) */ - out: - unlock_page(page); put_page(page); return err; } @@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies MB (A) */ + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ return hb; } @@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 716547fdb873..f894a2cd9b2a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void) return ret; } -static int lockdep_initialized; - unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; @@ -433,19 +431,6 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; #ifdef CONFIG_DEBUG_LOCKDEP -/* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). - */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - /* * Various lockdep statistics: */ @@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct hlist_head *hash_head; struct lock_class *class; -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk(KERN_ERR @@ -2010,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) return lock_classes + chain_hlocks[chain->base + i]; } +/* + * Returns the index of the first held_lock of the current chain + */ +static inline int get_first_held_lock(struct task_struct *curr, + struct held_lock *hlock) +{ + int i; + struct held_lock *hlock_curr; + + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + + } + + return ++i; +} + +/* + * Checks whether the chain and the current held locks are consistent + * in depth and also in content. If they are not it most likely means + * that there was a collision during the calculation of the chain_key. + * Returns: 0 not passed, 1 passed + */ +static int check_no_collision(struct task_struct *curr, + struct held_lock *hlock, + struct lock_chain *chain) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + int i, j, id; + + i = get_first_held_lock(curr, hlock); + + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) + return 0; + + for (j = 0; j < chain->depth - 1; j++, i++) { + id = curr->held_locks[i].class_idx - 1; + + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) + return 0; + } +#endif + return 1; +} + /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -2023,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr; int i, j; /* @@ -2041,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr, if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); + if (!check_no_collision(curr, hlock, chain)) + return 0; + if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -2078,13 +2098,7 @@ cache_hit: chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; + i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; @@ -2172,7 +2186,7 @@ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; + unsigned int i; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { @@ -2189,17 +2203,16 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } - id = hlock->class_idx - 1; /* * Whoops ran out of static storage again? */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3077,7 +3090,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; - unsigned int depth, id; + unsigned int depth; int chain_head = 0; int class_idx; u64 chain_key; @@ -3180,11 +3193,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ - id = class - lock_classes; /* * Whoops, we did it again.. ran straight out of our static allocation. */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; @@ -3202,7 +3214,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = 0; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, class_idx); if (nest_lock && !__lock_is_held(nest_lock)) return print_lock_nested_lock_not_held(curr, hlock, ip); @@ -4013,28 +4025,6 @@ out_restore: raw_local_irq_restore(flags); } -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_HLIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_HLIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); @@ -4061,14 +4051,6 @@ void __init lockdep_info(void) printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif } static void diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c219c40e..e364b424b019 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -716,6 +716,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 393d1874b9e0..ce2f75e32ae1 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) - cpu_relax(); + smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -435,7 +434,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_load_acquire() call. As the next PV queue head hasn't been + * smp_cond_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -466,7 +465,7 @@ locked: break; } /* - * The smp_load_acquire() call above has provided the necessary + * The smp_cond_acquire() call above has provided the necessary * acquire semantics required for locking. At most two * iterations of this loop may be ran. */ diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 87bb235c3448..21ede57f68b3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -54,6 +54,11 @@ struct pv_node { u8 state; }; +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + /* * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before @@ -65,9 +70,11 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; + int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; } /* @@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) } #endif /* _Q_PENDING_BITS == 8 */ -/* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - /* * Lock and MCS node addresses hash table for fast lookup * @@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; + /* + * Tracking # of slowpath locking operations + */ + qstat_inc(qstat_pv_lock_slowpath, true); + for (;; waitcnt++) { /* * Set correct vCPU state to be used by queue node wait-early diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 640dcecdd1df..eb2a2c9bc3fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick @@ -45,6 +46,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", @@ -279,19 +282,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -/* - * PV unfair trylock count tracking function - */ -static inline int qstat_spin_steal_lock(struct qspinlock *lock) -{ - int ret = pv_queued_spin_steal_lock(lock); - - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; -} -#undef queued_spin_trylock -#define queued_spin_trylock(l) qstat_spin_steal_lock(l) - #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..300d29391e07 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -105,13 +105,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(struct call_single_data *csd) { - while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) - cpu_relax(); + smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); } -static void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(struct call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd) smp_wmb(); } -static void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(struct call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); diff --git a/lib/test_static_keys.c b/lib/test_static_keys.c index c61b299e367f..915d75df2086 100644 --- a/lib/test_static_keys.c +++ b/lib/test_static_keys.c @@ -46,8 +46,11 @@ struct test_key { bool (*test_key)(void); }; -#define test_key_func(key, branch) \ - ({bool func(void) { return branch(key); } func; }) +#define test_key_func(key, branch) \ +static bool key ## _ ## branch(void) \ +{ \ + return branch(&key); \ +} static void invert_key(struct static_key *key) { @@ -92,6 +95,25 @@ static int verify_keys(struct test_key *keys, int size, bool invert) return 0; } +test_key_func(old_true_key, static_key_true) +test_key_func(old_false_key, static_key_false) +test_key_func(true_key, static_branch_likely) +test_key_func(true_key, static_branch_unlikely) +test_key_func(false_key, static_branch_likely) +test_key_func(false_key, static_branch_unlikely) +test_key_func(base_old_true_key, static_key_true) +test_key_func(base_inv_old_true_key, static_key_true) +test_key_func(base_old_false_key, static_key_false) +test_key_func(base_inv_old_false_key, static_key_false) +test_key_func(base_true_key, static_branch_likely) +test_key_func(base_true_key, static_branch_unlikely) +test_key_func(base_inv_true_key, static_branch_likely) +test_key_func(base_inv_true_key, static_branch_unlikely) +test_key_func(base_false_key, static_branch_likely) +test_key_func(base_false_key, static_branch_unlikely) +test_key_func(base_inv_false_key, static_branch_likely) +test_key_func(base_inv_false_key, static_branch_unlikely) + static int __init test_static_key_init(void) { int ret; @@ -102,95 +124,95 @@ static int __init test_static_key_init(void) { .init_state = true, .key = &old_true_key, - .test_key = test_key_func(&old_true_key, static_key_true), + .test_key = &old_true_key_static_key_true, }, { .init_state = false, .key = &old_false_key, - .test_key = test_key_func(&old_false_key, static_key_false), + .test_key = &old_false_key_static_key_false, }, /* internal keys - new keys */ { .init_state = true, .key = &true_key.key, - .test_key = test_key_func(&true_key, static_branch_likely), + .test_key = &true_key_static_branch_likely, }, { .init_state = true, .key = &true_key.key, - .test_key = test_key_func(&true_key, static_branch_unlikely), + .test_key = &true_key_static_branch_unlikely, }, { .init_state = false, .key = &false_key.key, - .test_key = test_key_func(&false_key, static_branch_likely), + .test_key = &false_key_static_branch_likely, }, { .init_state = false, .key = &false_key.key, - .test_key = test_key_func(&false_key, static_branch_unlikely), + .test_key = &false_key_static_branch_unlikely, }, /* external keys - old keys */ { .init_state = true, .key = &base_old_true_key, - .test_key = test_key_func(&base_old_true_key, static_key_true), + .test_key = &base_old_true_key_static_key_true, }, { .init_state = false, .key = &base_inv_old_true_key, - .test_key = test_key_func(&base_inv_old_true_key, static_key_true), + .test_key = &base_inv_old_true_key_static_key_true, }, { .init_state = false, .key = &base_old_false_key, - .test_key = test_key_func(&base_old_false_key, static_key_false), + .test_key = &base_old_false_key_static_key_false, }, { .init_state = true, .key = &base_inv_old_false_key, - .test_key = test_key_func(&base_inv_old_false_key, static_key_false), + .test_key = &base_inv_old_false_key_static_key_false, }, /* external keys - new keys */ { .init_state = true, .key = &base_true_key.key, - .test_key = test_key_func(&base_true_key, static_branch_likely), + .test_key = &base_true_key_static_branch_likely, }, { .init_state = true, .key = &base_true_key.key, - .test_key = test_key_func(&base_true_key, static_branch_unlikely), + .test_key = &base_true_key_static_branch_unlikely, }, { .init_state = false, .key = &base_inv_true_key.key, - .test_key = test_key_func(&base_inv_true_key, static_branch_likely), + .test_key = &base_inv_true_key_static_branch_likely, }, { .init_state = false, .key = &base_inv_true_key.key, - .test_key = test_key_func(&base_inv_true_key, static_branch_unlikely), + .test_key = &base_inv_true_key_static_branch_unlikely, }, { .init_state = false, .key = &base_false_key.key, - .test_key = test_key_func(&base_false_key, static_branch_likely), + .test_key = &base_false_key_static_branch_likely, }, { .init_state = false, .key = &base_false_key.key, - .test_key = test_key_func(&base_false_key, static_branch_unlikely), + .test_key = &base_false_key_static_branch_unlikely, }, { .init_state = true, .key = &base_inv_false_key.key, - .test_key = test_key_func(&base_inv_false_key, static_branch_likely), + .test_key = &base_inv_false_key_static_branch_likely, }, { .init_state = true, .key = &base_inv_false_key.key, - .test_key = test_key_func(&base_inv_false_key, static_branch_unlikely), + .test_key = &base_inv_false_key_static_branch_unlikely, }, }; diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile index 90d2baeb621a..1d57af56814b 100644 --- a/tools/lib/lockdep/Makefile +++ b/tools/lib/lockdep/Makefile @@ -100,7 +100,7 @@ include $(srctree)/tools/build/Makefile.include do_compile_shared_library = \ ($(print_shared_lib_compile) \ - $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -s $@ liblockdep.so)) + $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so)) do_build_static_lib = \ ($(print_static_lib_build) \ diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c index 9be663340f0a..d1c89cc06f5f 100644 --- a/tools/lib/lockdep/common.c +++ b/tools/lib/lockdep/common.c @@ -11,11 +11,6 @@ static __thread struct task_struct current_obj; bool debug_locks = true; bool debug_locks_silent; -__attribute__((constructor)) static void liblockdep_init(void) -{ - lockdep_init(); -} - __attribute__((destructor)) static void liblockdep_exit(void) { debug_check_no_locks_held(); diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h index a60c14b9662a..6e66277ec437 100644 --- a/tools/lib/lockdep/include/liblockdep/common.h +++ b/tools/lib/lockdep/include/liblockdep/common.h @@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void lockdep_init(void); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c index f42b7e9aa48f..a0a2e3a266af 100644 --- a/tools/lib/lockdep/lockdep.c +++ b/tools/lib/lockdep/lockdep.c @@ -1,2 +1,8 @@ #include + +/* Trivial API wrappers, we don't (yet) have RCU in user-space: */ +#define hlist_for_each_entry_rcu hlist_for_each_entry +#define hlist_add_head_rcu hlist_add_head +#define hlist_del_rcu hlist_del + #include "../../../kernel/locking/lockdep.c" diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c index 21cdf869a01b..52844847569c 100644 --- a/tools/lib/lockdep/preload.c +++ b/tools/lib/lockdep/preload.c @@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void) ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock"); #endif - lockdep_init(); - __init_state = done; } diff --git a/tools/lib/lockdep/tests/AA.c b/tools/lib/lockdep/tests/AA.c index 0f782ff404ac..18211a5f354f 100644 --- a/tools/lib/lockdep/tests/AA.c +++ b/tools/lib/lockdep/tests/AA.c @@ -1,13 +1,13 @@ #include -void main(void) +int main(void) { - pthread_mutex_t a, b; + pthread_mutex_t a; pthread_mutex_init(&a, NULL); - pthread_mutex_init(&b, NULL); pthread_mutex_lock(&a); - pthread_mutex_lock(&b); pthread_mutex_lock(&a); + + return 0; } diff --git a/tools/lib/lockdep/tests/ABA.c b/tools/lib/lockdep/tests/ABA.c new file mode 100644 index 000000000000..0f782ff404ac --- /dev/null +++ b/tools/lib/lockdep/tests/ABA.c @@ -0,0 +1,13 @@ +#include + +void main(void) +{ + pthread_mutex_t a, b; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + + pthread_mutex_lock(&a); + pthread_mutex_lock(&b); + pthread_mutex_lock(&a); +} diff --git a/tools/lib/lockdep/tests/ABBA_2threads.c b/tools/lib/lockdep/tests/ABBA_2threads.c new file mode 100644 index 000000000000..cd807d736361 --- /dev/null +++ b/tools/lib/lockdep/tests/ABBA_2threads.c @@ -0,0 +1,46 @@ +#include +#include + +pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER; +pthread_barrier_t bar; + +void *ba_lock(void *arg) +{ + int ret, i; + + pthread_mutex_lock(&b); + + if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD) + pthread_barrier_destroy(&bar); + + pthread_mutex_lock(&a); + + pthread_mutex_unlock(&a); + pthread_mutex_unlock(&b); +} + +int main(void) +{ + pthread_t t; + + pthread_barrier_init(&bar, NULL, 2); + + if (pthread_create(&t, NULL, ba_lock, NULL)) { + fprintf(stderr, "pthread_create() failed\n"); + return 1; + } + pthread_mutex_lock(&a); + + if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD) + pthread_barrier_destroy(&bar); + + pthread_mutex_lock(&b); + + pthread_mutex_unlock(&b); + pthread_mutex_unlock(&a); + + pthread_join(t, NULL); + + return 0; +} diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h index 6386dc3182a0..fd3e56a83fc2 100644 --- a/tools/lib/lockdep/uinclude/linux/compiler.h +++ b/tools/lib/lockdep/uinclude/linux/compiler.h @@ -3,6 +3,7 @@ #define __used __attribute__((__unused__)) #define unlikely +#define READ_ONCE(x) (x) #define WRITE_ONCE(x, val) x=(val) #define RCU_INIT_POINTER(p, v) p=(v)