2005-04-17 02:20:36 +04:00
|
|
|
#ifndef __LINUX_SMP_H
|
|
|
|
#define __LINUX_SMP_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic SMP support
|
|
|
|
* Alan Cox. <alan@redhat.com>
|
|
|
|
*/
|
|
|
|
|
2007-05-17 09:11:09 +04:00
|
|
|
#include <linux/errno.h>
|
2008-09-24 09:15:57 +04:00
|
|
|
#include <linux/types.h>
|
2008-06-26 13:21:34 +04:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/cpumask.h>
|
smp: add missing init.h include
Commit 34db18a054c6 ("smp: move smp setup functions to kernel/smp.c")
causes this build error on s390 because of a missing init.h include:
CC arch/s390/kernel/asm-offsets.s
In file included from /home2/heicarst/linux-2.6/arch/s390/include/asm/spinlock.h:14:0,
from include/linux/spinlock.h:87,
from include/linux/seqlock.h:29,
from include/linux/time.h:8,
from include/linux/timex.h:56,
from include/linux/sched.h:57,
from arch/s390/kernel/asm-offsets.c:10:
include/linux/smp.h:117:20: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'setup_nr_cpu_ids'
include/linux/smp.h:118:20: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'smp_init'
Fix it by adding the include statement.
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: WANG Cong <amwang@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-23 10:24:58 +03:00
|
|
|
#include <linux/init.h>
|
2014-01-31 03:45:47 +04:00
|
|
|
#include <linux/llist.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2010-10-27 20:28:36 +04:00
|
|
|
/*
 * Callback type taken by the cross-CPU call helpers declared in this
 * header: receives one opaque pointer argument and returns nothing.
 */
typedef void (*smp_call_func_t)(void *info);
|
smp: Avoid using two cache lines for struct call_single_data
struct call_single_data is used in IPIs to transfer information between
CPUs. Its size is bigger than sizeof(unsigned long) and less than
cache line size. Currently it is not allocated with any explicit alignment
requirements. This makes it possible for allocated call_single_data to
cross two cache lines, which results in double the number of the cache lines
that need to be transferred among CPUs.
This can be fixed by requiring call_single_data to be aligned with the
size of call_single_data. Currently the size of call_single_data is the
power of 2. If we add new fields to call_single_data, we may need to
add padding to make sure the size of new definition is the power of 2
as well.
Fortunately, this is enforced by GCC, which will report bad sizes.
To set alignment requirements of call_single_data to the size of
call_single_data, a struct definition and a typedef is used.
To test the effect of the patch, I used the vm-scalability multiple
thread swap test case (swap-w-seq-mt). The test will create multiple
threads and each thread will eat memory until all RAM and part of swap
is used, so that huge number of IPIs are triggered when unmapping
memory. In the test, the throughput of memory writing improves ~5%
compared with misaligned call_single_data, because of faster IPIs.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Huang, Ying <ying.huang@intel.com>
[ Add call_single_data_t and align with size of call_single_data. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-08-08 07:30:00 +03:00
|
|
|
/*
 * Descriptor for a single cross-CPU function call. Per the accompanying
 * commit text it is used in IPIs to transfer information between CPUs.
 * Use the aligned typedef call_single_data_t rather than this raw struct.
 */
struct __call_single_data {
|
2014-02-24 19:39:56 +04:00
|
|
|
/* llist linkage node (type comes from <linux/llist.h>) */
struct llist_node llist;
|
2010-10-27 20:28:36 +04:00
|
|
|
/* callback to invoke */
smp_call_func_t func;
|
2008-06-26 13:21:34 +04:00
|
|
|
/* opaque pointer; presumably handed to func as its argument - confirm */
void *info;
|
2015-04-20 19:08:49 +03:00
|
|
|
/* NOTE(review): flag bits are defined by the implementation, not visible here */
unsigned int flags;
|
2008-06-26 13:21:34 +04:00
|
|
|
};
|
|
|
|
|
smp: Avoid using two cache lines for struct call_single_data
struct call_single_data is used in IPIs to transfer information between
CPUs. Its size is bigger than sizeof(unsigned long) and less than
cache line size. Currently it is not allocated with any explicit alignment
requirements. This makes it possible for allocated call_single_data to
cross two cache lines, which results in double the number of the cache lines
that need to be transferred among CPUs.
This can be fixed by requiring call_single_data to be aligned with the
size of call_single_data. Currently the size of call_single_data is the
power of 2. If we add new fields to call_single_data, we may need to
add padding to make sure the size of new definition is the power of 2
as well.
Fortunately, this is enforced by GCC, which will report bad sizes.
To set alignment requirements of call_single_data to the size of
call_single_data, a struct definition and a typedef is used.
To test the effect of the patch, I used the vm-scalability multiple
thread swap test case (swap-w-seq-mt). The test will create multiple
threads and each thread will eat memory until all RAM and part of swap
is used, so that huge number of IPIs are triggered when unmapping
memory. In the test, the throughput of memory writing improves ~5%
compared with misaligned call_single_data, because of faster IPIs.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Huang, Ying <ying.huang@intel.com>
[ Add call_single_data_t and align with size of call_single_data. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-08-08 07:30:00 +03:00
|
|
|
/*
 * Align every csd to its own size with __aligned() so that one csd never
 * spans two cache lines. Per the commit text this relies on
 * sizeof(struct __call_single_data) being a power of 2; new fields may
 * need padding to keep it so.
 */
|
|
|
|
typedef struct __call_single_data call_single_data_t
|
|
|
|
__aligned(sizeof(struct __call_single_data));
|
|
|
|
|
2008-12-16 07:26:48 +03:00
|
|
|
/* total number of cpus in this system (may exceed NR_CPUS) */
|
|
|
|
extern unsigned int total_cpus;
|
|
|
|
|
2010-10-27 20:28:36 +04:00
|
|
|
/*
 * NOTE(review): judging by name and signature, this presumably runs
 * func(info) on the CPU identified by cpuid, with 'wait' selecting
 * whether to wait for completion, and returns an int status. The
 * implementation is not in this header - confirm before relying on it.
 * Declared outside the CONFIG_SMP conditional, so it is visible in both
 * SMP and UP builds.
 */
int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
|
|
|
|
int wait);
|
2009-01-09 23:27:08 +03:00
|
|
|
|
2013-09-12 01:23:26 +04:00
|
|
|
/*
|
|
|
|
* Call a function on all processors
|
|
|
|
*/
|
|
|
|
int on_each_cpu(smp_call_func_t func, void *info, int wait);
|
|
|
|
|
2013-09-12 01:23:24 +04:00
|
|
|
/*
|
|
|
|
* Call a function on processors specified by mask, which might include
|
|
|
|
* the local one.
|
|
|
|
*/
|
|
|
|
void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
|
|
|
|
void *info, bool wait);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Call a function on each processor for which the supplied function
|
|
|
|
* cond_func returns a positive value. This may include the local
|
|
|
|
* processor.
|
|
|
|
*/
|
|
|
|
void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
|
|
|
|
smp_call_func_t func, void *info, bool wait,
|
|
|
|
gfp_t gfp_flags);
|
|
|
|
|
smp: Avoid using two cache lines for struct call_single_data
struct call_single_data is used in IPIs to transfer information between
CPUs. Its size is bigger than sizeof(unsigned long) and less than
cache line size. Currently it is not allocated with any explicit alignment
requirements. This makes it possible for allocated call_single_data to
cross two cache lines, which results in double the number of the cache lines
that need to be transferred among CPUs.
This can be fixed by requiring call_single_data to be aligned with the
size of call_single_data. Currently the size of call_single_data is the
power of 2. If we add new fields to call_single_data, we may need to
add padding to make sure the size of new definition is the power of 2
as well.
Fortunately, this is enforced by GCC, which will report bad sizes.
To set alignment requirements of call_single_data to the size of
call_single_data, a struct definition and a typedef is used.
To test the effect of the patch, I used the vm-scalability multiple
thread swap test case (swap-w-seq-mt). The test will create multiple
threads and each thread will eat memory until all RAM and part of swap
is used, so that huge number of IPIs are triggered when unmapping
memory. In the test, the throughput of memory writing improves ~5%
compared with misaligned call_single_data, because of faster IPIs.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Huang, Ying <ying.huang@intel.com>
[ Add call_single_data_t and align with size of call_single_data. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-08-08 07:30:00 +03:00
|
|
|
/*
 * NOTE(review): takes a caller-provided, size-aligned call_single_data_t
 * and a target cpu; presumably queues csd->func for that CPU without
 * waiting - confirm against the implementation, which is not shown here.
 */
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
|
2013-11-15 02:32:09 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
#include <linux/preempt.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/thread_info.h>
|
|
|
|
#include <asm/smp.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
|
|
|
|
* (defined in asm header):
|
2009-03-13 13:14:06 +03:00
|
|
|
*/
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* stops all CPUs but the current one:
|
|
|
|
*/
|
|
|
|
extern void smp_send_stop(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sends a 'reschedule' event to another CPU:
|
|
|
|
*/
|
|
|
|
extern void smp_send_reschedule(int cpu);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare machine for booting other CPUs.
|
|
|
|
*/
|
|
|
|
extern void smp_prepare_cpus(unsigned int max_cpus);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bring a CPU up
|
|
|
|
*/
|
2012-04-20 17:05:42 +04:00
|
|
|
extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Final polishing of CPUs
|
|
|
|
*/
|
|
|
|
extern void smp_cpus_done(unsigned int max_cpus);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Call a function on all other processors
|
|
|
|
*/
|
2010-10-27 20:28:36 +04:00
|
|
|
int smp_call_function(smp_call_func_t func, void *info, int wait);
|
2008-12-30 01:35:16 +03:00
|
|
|
void smp_call_function_many(const struct cpumask *mask,
|
2010-10-27 20:28:36 +04:00
|
|
|
smp_call_func_t func, void *info, bool wait);
|
2008-11-05 05:39:10 +03:00
|
|
|
|
2009-11-18 01:27:27 +03:00
|
|
|
int smp_call_function_any(const struct cpumask *mask,
|
2010-10-27 20:28:36 +04:00
|
|
|
smp_call_func_t func, void *info, int wait);
|
2009-11-18 01:27:27 +03:00
|
|
|
|
2012-05-07 21:59:48 +04:00
|
|
|
void kick_all_cpus_sync(void);
|
2014-09-04 11:17:54 +04:00
|
|
|
void wake_up_all_idle_cpus(void);
|
2012-05-07 21:59:48 +04:00
|
|
|
|
2008-06-26 13:21:34 +04:00
|
|
|
/*
|
|
|
|
* Generic and arch helpers
|
|
|
|
*/
|
2011-03-29 20:35:04 +04:00
|
|
|
void __init call_function_init(void);
|
2008-06-26 13:21:34 +04:00
|
|
|
void generic_smp_call_function_single_interrupt(void);
|
smp: make smp_call_function_many() use logic similar to smp_call_function_single()
I'm testing swapout workload in a two-socket Xeon machine. The workload
has 10 threads, each thread sequentially accesses separate memory
region. TLB flush overhead is very big in the workload. For each page,
page reclaim needs to move it from the active lru list and then unmap it. Both
need a TLB flush. And this is a multithreaded workload, so TLB flushes happen
on 10 CPUs. On x86, TLB flush uses the generic smp_call_function. So this
workload stresses smp_call_function_many heavily.
Without patch, perf shows:
+ 24.49% [k] generic_smp_call_function_interrupt
- 21.72% [k] _raw_spin_lock
- _raw_spin_lock
+ 79.80% __page_check_address
+ 6.42% generic_smp_call_function_interrupt
+ 3.31% get_swap_page
+ 2.37% free_pcppages_bulk
+ 1.75% handle_pte_fault
+ 1.54% put_super
+ 1.41% grab_super_passive
+ 1.36% __swap_duplicate
+ 0.68% blk_flush_plug_list
+ 0.62% swap_info_get
+ 6.55% [k] flush_tlb_func
+ 6.46% [k] smp_call_function_many
+ 5.09% [k] call_function_interrupt
+ 4.75% [k] default_send_IPI_mask_sequence_phys
+ 2.18% [k] find_next_bit
swapout throughput is around 1300M/s.
With the patch, perf shows:
- 27.23% [k] _raw_spin_lock
- _raw_spin_lock
+ 80.53% __page_check_address
+ 8.39% generic_smp_call_function_single_interrupt
+ 2.44% get_swap_page
+ 1.76% free_pcppages_bulk
+ 1.40% handle_pte_fault
+ 1.15% __swap_duplicate
+ 1.05% put_super
+ 0.98% grab_super_passive
+ 0.86% blk_flush_plug_list
+ 0.57% swap_info_get
+ 8.25% [k] default_send_IPI_mask_sequence_phys
+ 7.55% [k] call_function_interrupt
+ 7.47% [k] smp_call_function_many
+ 7.25% [k] flush_tlb_func
+ 3.81% [k] _raw_spin_lock_irqsave
+ 3.78% [k] generic_smp_call_function_single_interrupt
swapout throughput is around 1400M/s. So there is around a 7%
improvement, and total cpu utilization doesn't change.
Without the patch, cfd_data is shared by all CPUs.
generic_smp_call_function_interrupt does read/write cfd_data several times
which will create a lot of cache ping-pong. With the patch, the data
becomes per-cpu. The ping-pong is avoided. And from the perf data, this
doesn't make the call_single_queue lock contended.
Next step is to remove generic_smp_call_function_interrupt() from arch
code.
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-22 04:43:03 +04:00
|
|
|
/*
 * Alias: generic_smp_call_function_interrupt expands to the
 * single-interrupt handler. The commit text states that the next step is
 * to remove this name from arch code.
 */
#define generic_smp_call_function_interrupt \
|
|
|
|
generic_smp_call_function_single_interrupt
|
2006-09-26 10:32:33 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Mark the boot cpu "online" so that it can call console drivers in
|
|
|
|
* printk() and can access its per-cpu storage.
|
|
|
|
*/
|
|
|
|
void smp_prepare_boot_cpu(void);
|
|
|
|
|
2008-01-30 15:33:17 +03:00
|
|
|
extern unsigned int setup_max_cpus;
|
2011-03-23 02:34:06 +03:00
|
|
|
extern void __init setup_nr_cpu_ids(void);
|
|
|
|
extern void __init smp_init(void);
|
2008-01-30 15:33:17 +03:00
|
|
|
|
2017-03-20 14:26:55 +03:00
|
|
|
extern int __boot_cpu_id;
|
|
|
|
|
|
|
|
/* SMP build: return the boot CPU id as recorded in __boot_cpu_id. */
static inline int get_boot_cpu_id(void)
|
|
|
|
{
|
|
|
|
return __boot_cpu_id;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#else /* !SMP */
|
|
|
|
|
2009-03-13 13:14:06 +03:00
|
|
|
/* UP build: no-op stand-in for the SMP "stop all other CPUs" call. */
static inline void smp_send_stop(void)
{
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* These macros fold the SMP functionality into a single CPU system
|
|
|
|
*/
|
2005-06-22 04:14:34 +04:00
|
|
|
/* UP build: the CPU id is the constant 0. */
#define raw_smp_processor_id() 0
|
2010-10-27 20:28:36 +04:00
|
|
|
/*
 * UP build helper behind the smp_call_function() and
 * smp_call_function_many() macros: func and info are accepted but not
 * used, and the result is always 0.
 */
static inline int up_smp_call_function(smp_call_func_t func, void *info)
|
2006-03-26 13:37:19 +04:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2008-06-06 13:18:06 +04:00
|
|
|
/*
 * UP build: forwards func and info to up_smp_call_function(); the 'wait'
 * argument is dropped by the macro and never evaluated.
 */
#define smp_call_function(func, info, wait) \
|
2007-11-10 00:39:38 +03:00
|
|
|
(up_smp_call_function(func, info))
|
2013-09-12 01:19:37 +04:00
|
|
|
|
2005-07-28 12:07:41 +04:00
|
|
|
/* UP build: no-op stand-in; the cpu argument is ignored. */
static inline void smp_send_reschedule(int cpu)
{
}
|
2005-07-28 21:34:47 +04:00
|
|
|
/* UP build: expands to an empty statement. */
#define smp_prepare_boot_cpu() do {} while (0)
|
2008-12-15 11:34:35 +03:00
|
|
|
/*
 * UP build: 'mask' and 'wait' are dropped by the macro and never
 * evaluated; func and info go to up_smp_call_function(), so the
 * expression evaluates to 0.
 */
#define smp_call_function_many(mask, func, info, wait) \
|
|
|
|
(up_smp_call_function(func, info))
|
2011-03-29 20:35:04 +04:00
|
|
|
/* UP build: no-op stand-in for the SMP initialisation hook. */
static inline void call_function_init(void)
{
}
|
2009-11-18 01:27:27 +03:00
|
|
|
|
|
|
|
/*
 * UP build: 'mask' is ignored; the request is forwarded to
 * smp_call_function_single() for CPU 0 and its return value is passed
 * back unchanged.
 */
static inline int
|
2010-10-27 20:28:36 +04:00
|
|
|
smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
|
2009-11-18 01:27:27 +03:00
|
|
|
void *info, int wait)
|
2008-06-26 13:21:34 +04:00
|
|
|
{
|
2009-11-18 01:27:27 +03:00
|
|
|
return smp_call_function_single(0, func, info, wait);
|
2008-06-26 13:21:34 +04:00
|
|
|
}
|
2009-11-18 01:27:27 +03:00
|
|
|
|
2012-05-07 21:59:48 +04:00
|
|
|
/* UP build: no-op stand-in. */
static inline void kick_all_cpus_sync(void)
{
}
|
2014-09-04 11:17:54 +04:00
|
|
|
/* UP build: no-op stand-in. */
static inline void wake_up_all_idle_cpus(void)
{
}
|
2012-05-07 21:59:48 +04:00
|
|
|
|
2015-01-16 00:22:39 +03:00
|
|
|
/*
 * UP build: smp_init() only calls up_late_init() when
 * CONFIG_UP_LATE_INIT is set; otherwise it is an empty function.
 */
#ifdef CONFIG_UP_LATE_INIT
|
|
|
|
extern void __init up_late_init(void);
|
|
|
|
static inline void smp_init(void) { up_late_init(); }
|
|
|
|
#else
|
|
|
|
static inline void smp_init(void) { }
|
|
|
|
#endif
|
|
|
|
|
2017-03-20 14:26:55 +03:00
|
|
|
/* UP build: the boot CPU id is always 0. */
static inline int get_boot_cpu_id(void)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif /* !SMP */
|
|
|
|
|
|
|
|
/*
|
2005-06-22 04:14:34 +04:00
|
|
|
* smp_processor_id(): get the current CPU ID.
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
2010-03-06 00:42:45 +03:00
|
|
|
* if DEBUG_PREEMPT is enabled then we check whether it is
|
2005-06-22 04:14:34 +04:00
|
|
|
* used in a preemption-safe way. (smp_processor_id() is safe
|
|
|
|
* if it's used in a preemption-off critical section, or in
|
|
|
|
* a thread that is bound to the current CPU.)
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
2005-06-22 04:14:34 +04:00
|
|
|
* NOTE: raw_smp_processor_id() is for internal use only
|
|
|
|
* (smp_processor_id() is the preferred variant), but in rare
|
|
|
|
* instances it might also be used to turn off false positives
|
|
|
|
* (i.e. smp_processor_id() use that the debugging code reports but
|
|
|
|
* which use for some reason is legal). Don't use this to hack around
|
|
|
|
* the warning message, as your code might not work under PREEMPT.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2005-06-22 04:14:34 +04:00
|
|
|
#ifdef CONFIG_DEBUG_PREEMPT
|
|
|
|
extern unsigned int debug_smp_processor_id(void);
|
|
|
|
# define smp_processor_id() debug_smp_processor_id()
|
2005-04-17 02:20:36 +04:00
|
|
|
#else
|
2005-06-22 04:14:34 +04:00
|
|
|
# define smp_processor_id() raw_smp_processor_id()
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * get_cpu(): disable preemption, then evaluate to smp_processor_id()
 * (written as a statement expression, so it can be used as a value).
 * put_cpu(): re-enable preemption; the intended counterpart of get_cpu().
 */
#define get_cpu() ({ preempt_disable(); smp_processor_id(); })
|
|
|
|
#define put_cpu() preempt_enable()
|
|
|
|
|
2009-01-31 16:09:06 +03:00
|
|
|
/*
|
|
|
|
* Callback to arch code if there's nosmp or maxcpus=0 on the
|
|
|
|
* boot command line:
|
|
|
|
*/
|
|
|
|
extern void arch_disable_smp_support(void);
|
|
|
|
|
2014-02-11 02:25:49 +04:00
|
|
|
extern void arch_enable_nonboot_cpus_begin(void);
|
|
|
|
extern void arch_enable_nonboot_cpus_end(void);
|
|
|
|
|
2006-06-30 12:55:50 +04:00
|
|
|
void smp_setup_processor_id(void);
|
|
|
|
|
2016-08-29 09:48:44 +03:00
|
|
|
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par,
|
|
|
|
bool phys);
|
|
|
|
|
2016-07-13 20:17:01 +03:00
|
|
|
/* SMP core functions */
|
|
|
|
int smpcfd_prepare_cpu(unsigned int cpu);
|
|
|
|
int smpcfd_dead_cpu(unsigned int cpu);
|
|
|
|
int smpcfd_dying_cpu(unsigned int cpu);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif /* __LINUX_SMP_H */
|