From 75013fb16f8484898eaa8d0b08fed942d790f029 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Mar 2017 01:23:24 +0900 Subject: [PATCH 01/37] kprobes/x86: Fix kernel panic when certain exception-handling addresses are probed Fix to the exception table entry check by using probed address instead of the address of copied instruction. This bug may cause unexpected kernel panic if user probe an address where an exception can happen which should be fixup by __ex_table (e.g. copy_from_user.) Unless user puts a kprobe on such address, this doesn't cause any problem. This bug has been introduced years ago, by commit: 464846888d9a ("x86/kprobes: Fix a bug which can modify kernel code permanently"). Signed-off-by: Masami Hiramatsu Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 464846888d9a ("x86/kprobes: Fix a bug which can modify kernel code permanently") Link: http://lkml.kernel.org/r/148829899399.28855.12581062400757221722.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/common.h | 2 +- arch/x86/kernel/kprobes/core.c | 6 +++--- arch/x86/kernel/kprobes/opt.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c6ee63f927ab..d688826e5736 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -67,7 +67,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(kprobe_opcode_t *instruction); +extern int can_boost(kprobe_opcode_t *instruction, void *addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 520b8dfe1640..88b3c942473d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -166,12 +166,12 @@ NOKPROBE_SYMBOL(skip_prefixes); * Returns non-zero if opcode is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(kprobe_opcode_t *opcodes) +int can_boost(kprobe_opcode_t *opcodes, void *addr) { kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; - if (search_exception_tables((unsigned long)opcodes)) + if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ retry: @@ -416,7 +416,7 @@ static int arch_copy_kprobe(struct kprobe *p) * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - if (can_boost(p->ainsn.insn)) + if (can_boost(p->ainsn.insn, p->addr)) p->ainsn.boostable = 0; else p->ainsn.boostable = -1; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3d1bee9d6a72..3e7c6e5a08ff 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -178,7 +178,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src) while (len < RELATIVEJUMP_SIZE) { ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) + if (!ret || !can_boost(dest + len, src + len)) return -EINVAL; len += ret; } From 6b0b7551428e4caae1e2c023a529465a9a9ae2d4 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 16 Feb 2017 17:00:50 +1100 Subject: [PATCH 02/37] perf/core: Rename CONFIG_[UK]PROBE_EVENT to CONFIG_[UK]PROBE_EVENTS We have uses of CONFIG_UPROBE_EVENT and CONFIG_KPROBE_EVENT as well as CONFIG_UPROBE_EVENTS and CONFIG_KPROBE_EVENTS. Consistently use the plurals. Signed-off-by: Anton Blanchard Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: davem@davemloft.net Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/20170216060050.20866-1-anton@ozlabs.org Signed-off-by: Ingo Molnar --- Documentation/trace/kprobetrace.txt | 2 +- Documentation/trace/uprobetracer.txt | 2 +- arch/powerpc/configs/85xx/kmp204x_defconfig | 2 +- arch/s390/configs/default_defconfig | 2 +- arch/s390/configs/gcov_defconfig | 2 +- arch/s390/configs/performance_defconfig | 2 +- arch/s390/defconfig | 2 +- kernel/trace/Kconfig | 6 +++--- kernel/trace/Makefile | 4 ++-- kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_probe.h | 4 ++-- 11 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index e4991fb1eedc..41ef9d8efe95 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -12,7 +12,7 @@ kprobes can probe (this means, all functions body except for __kprobes functions). Unlike the Tracepoint based event, this can be added and removed dynamically, on the fly. -To enable this feature, build your kernel with CONFIG_KPROBE_EVENT=y. +To enable this feature, build your kernel with CONFIG_KPROBE_EVENTS=y. Similar to the events tracer, this doesn't need to be activated via current_tracer. Instead of that, add probe points via diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt index fa7b680ee8a0..bf526a7c5559 100644 --- a/Documentation/trace/uprobetracer.txt +++ b/Documentation/trace/uprobetracer.txt @@ -7,7 +7,7 @@ Overview -------- Uprobe based trace events are similar to kprobe based trace events. -To enable this feature, build your kernel with CONFIG_UPROBE_EVENT=y. +To enable this feature, build your kernel with CONFIG_UPROBE_EVENTS=y. Similar to the kprobe-event tracer, this doesn't need to be activated via current_tracer. Instead of that, add probe points via diff --git a/arch/powerpc/configs/85xx/kmp204x_defconfig b/arch/powerpc/configs/85xx/kmp204x_defconfig index aaaaa609cd24..34a4da23f000 100644 --- a/arch/powerpc/configs/85xx/kmp204x_defconfig +++ b/arch/powerpc/configs/85xx/kmp204x_defconfig @@ -210,7 +210,7 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_HUNG_TASK=y CONFIG_SCHEDSTATS=y CONFIG_RCU_TRACE=y -CONFIG_UPROBE_EVENT=y +CONFIG_UPROBE_EVENTS=y CONFIG_CRYPTO_NULL=y CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_MD4=y diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 143b1e00b818..4b176fe83da4 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -609,7 +609,7 @@ CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y +CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y CONFIG_TRACE_ENUM_MAP_FILE=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index f05d2d6e1087..0de46cc397f6 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -560,7 +560,7 @@ CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y +CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y CONFIG_TRACE_ENUM_MAP_FILE=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 2358bf33c5ef..e167557b434c 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -558,7 +558,7 @@ CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y +CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y CONFIG_TRACE_ENUM_MAP_FILE=y diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 68bfd09f1b02..97189dbaf34b 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -179,7 +179,7 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_UPROBE_EVENT=y +CONFIG_UPROBE_EVENTS=y CONFIG_FUNCTION_PROFILER=y CONFIG_TRACE_ENUM_MAP_FILE=y CONFIG_KPROBES_SANITY_TEST=y diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d5038005eb5d..d4a06e714645 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -429,7 +429,7 @@ config BLK_DEV_IO_TRACE If unsure, say N. -config KPROBE_EVENT +config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" @@ -447,7 +447,7 @@ config KPROBE_EVENT This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. -config UPROBE_EVENT +config UPROBE_EVENTS bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU @@ -466,7 +466,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS + depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS bool default y help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e57980845549..90f2701d92a7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -57,7 +57,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o -obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o @@ -66,7 +66,7 @@ ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o -obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 707445ceb7ef..f35109514a01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4341,22 +4341,22 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" "\t Format: p|r[:[/]] []\n" "\t -:[/]\n" -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS "\t place: :\n" #endif "\t args: =fetcharg[:type]\n" diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 0c0ae54d44c6..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -248,7 +248,7 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); @@ -278,7 +278,7 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } -#endif /* CONFIG_KPROBE_EVENT */ +#endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { struct fetch_param fetch; From 771ceddaadd0a2b31603034b36dca50943ff6836 Mon Sep 17 00:00:00 2001 From: Karol Wachowski Date: Mon, 20 Feb 2017 12:50:40 +0100 Subject: [PATCH 03/37] perf vendor events: Add mapping for KnightsMill PMU events Reuse events from KnightsLanding for KnightsMill Signed-off-by: Karol Wachowski Cc: Alexander Shishkin Cc: Andi Kleen Cc: Dave Hansen Cc: Kan Liang Cc: Peter Zijlstra Cc: Piotr Luc Cc: Srinivas Pandruvada Link: http://lkml.kernel.org/r/1487591440-25172-1-git-send-email-karol.wachowski@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 12181bb1da2a..d1a12e584c1b 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -17,6 +17,7 @@ GenuineIntel-6-3A,v18,ivybridge,core GenuineIntel-6-3E,v19,ivytown,core GenuineIntel-6-2D,v20,jaketown,core GenuineIntel-6-57,v9,knightslanding,core +GenuineIntel-6-85,v9,knightslanding,core GenuineIntel-6-1E,v2,nehalemep,core GenuineIntel-6-1F,v2,nehalemep,core GenuineIntel-6-1A,v2,nehalemep,core From 02d492e5dcb72c004d213756eb87c9d62a6d76a7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Feb 2017 01:40:05 +0100 Subject: [PATCH 04/37] perf stat: Issue a HW watchdog disable hint When using perf stat on an AMD F15h system with the default hw events attributes, some of the events don't get counted: Performance counter stats for 'sleep 1': 0.749208 task-clock (msec) # 0.001 CPUs utilized 1 context-switches # 0.001 M/sec 0 cpu-migrations # 0.000 K/sec 54 page-faults # 0.072 M/sec 1,122,815 cycles # 1.499 GHz 286,740 stalled-cycles-frontend # 25.54% frontend cycles idle stalled-cycles-backend (0.00%) ^^^^^^^^^^^^ instructions (0.00%) ^^^^^^^^^^^^ branches (0.00%) branch-misses (0.00%) 1.001550070 seconds time elapsed The reason is that we have the HW watchdog consuming one PMU counter and when perf tries to schedule 6 events on 6 counters and some of those counters are constrained to only a specific subset of PMCs by the hardware, the event scheduling fails. So issue a hint to disable the HW watchdog around a perf stat session. Committer note: Testing it... # perf stat -d usleep 1 Performance counter stats for 'usleep 1': 1.180203 task-clock (msec) # 0.490 CPUs utilized 1 context-switches # 0.847 K/sec 0 cpu-migrations # 0.000 K/sec 54 page-faults # 0.046 M/sec 184,754 cycles # 0.157 GHz 714,553 instructions # 3.87 insn per cycle 154,661 branches # 131.046 M/sec 7,247 branch-misses # 4.69% of all branches 219,984 L1-dcache-loads # 186.395 M/sec 17,600 L1-dcache-load-misses # 8.00% of all L1-dcache hits (90.16%) LLC-loads (0.00%) LLC-load-misses (0.00%) 0.002406823 seconds time elapsed Some events weren't counted. Try disabling the NMI watchdog: echo 0 > /proc/sys/kernel/nmi_watchdog perf stat ... echo 1 > /proc/sys/kernel/nmi_watchdog # Signed-off-by: Borislav Petkov Acked-by: Ingo Molnar Tested-by: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Robert Richter Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170211183218.ijnvb5f7ciyuunx4@pd.tnic Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 13b54999ad79..f4f555a67e9b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -146,6 +146,7 @@ static aggr_get_id_t aggr_get_id; static bool append_file; static const char *output_name; static int output_fd; +static int print_free_counters_hint; struct perf_stat { bool record; @@ -1109,6 +1110,9 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, csv_sep); + if (counter->supported) + print_free_counters_hint = 1; + fprintf(stat_config.output, "%-*s%s", csv_output ? 0 : unit_width, counter->unit, csv_sep); @@ -1477,6 +1481,13 @@ static void print_footer(void) avg_stats(&walltime_nsecs_stats)); } fprintf(output, "\n\n"); + + if (print_free_counters_hint) + fprintf(output, +"Some events weren't counted. Try disabling the NMI watchdog:\n" +" echo 0 > /proc/sys/kernel/nmi_watchdog\n" +" perf stat ...\n" +" echo 1 > /proc/sys/kernel/nmi_watchdog\n"); } static void print_counters(struct timespec *ts, int argc, const char **argv) From 4900653829175f60356efc279695bb23c59483c3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Feb 2017 16:48:24 -0300 Subject: [PATCH 05/37] tools include: Adopt __compiletime_error From the kernel, get the gcc one and provide the fallback so that we can continue build with other compilers, such as with clang. Will be used by tools/arch/x86/include/asm/cmpxchg.h. Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-pecgz6efai4a9euuk4rxuotr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/compiler-gcc.h | 4 ++++ tools/include/linux/compiler.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h index 48af2f10a42d..616935f1ff56 100644 --- a/tools/include/linux/compiler-gcc.h +++ b/tools/include/linux/compiler-gcc.h @@ -12,3 +12,7 @@ #if GCC_VERSION >= 70000 && !defined(__CHECKER__) # define __fallthrough __attribute__ ((fallthrough)) #endif + +#if GCC_VERSION >= 40300 +# define __compiletime_error(message) __attribute__((error(message))) +#endif /* GCC_VERSION >= 40300 */ diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 8de163b17c0d..c9e65e8faacd 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -5,6 +5,10 @@ #include #endif +#ifndef __compiletime_error +# define __compiletime_error(message) +#endif + /* Optimization barrier */ /* The "volatile" is due to gcc bugs */ #define barrier() __asm__ __volatile__("": : :"memory") From 3337e682d9f3043bb0b925d976558ed5c41b0a09 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Feb 2017 16:54:53 -0300 Subject: [PATCH 06/37] tools arch x86: Include asm/cmpxchg.h Will be included from atomic.h and used in refcount.h Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-pzrydfee75mhq64kazxmf9it@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cmpxchg.h | 89 ++++++++++++++++++++++++++++ tools/perf/MANIFEST | 1 + tools/scripts/Makefile.include | 9 +++ 3 files changed, 99 insertions(+) create mode 100644 tools/arch/x86/include/asm/cmpxchg.h diff --git a/tools/arch/x86/include/asm/cmpxchg.h b/tools/arch/x86/include/asm/cmpxchg.h new file mode 100644 index 000000000000..f5253260f3cc --- /dev/null +++ b/tools/arch/x86/include/asm/cmpxchg.h @@ -0,0 +1,89 @@ +#ifndef TOOLS_ASM_X86_CMPXCHG_H +#define TOOLS_ASM_X86_CMPXCHG_H + +#include + +/* + * Non-existant functions to indicate usage errors at link time + * (or compile-time if the compiler implements __compiletime_error(). + */ +extern void __cmpxchg_wrong_size(void) + __compiletime_error("Bad argument size for cmpxchg"); + +/* + * Constants for operation sizes. On 32-bit, the 64-bit size it set to + * -1 because sizeof will never return -1, thereby making those switch + * case statements guaranteeed dead code which the compiler will + * eliminate, and allowing the "missing symbol in the default case" to + * indicate a usage error. + */ +#define __X86_CASE_B 1 +#define __X86_CASE_W 2 +#define __X86_CASE_L 4 +#ifdef __x86_64__ +#define __X86_CASE_Q 8 +#else +#define __X86_CASE_Q -1 /* sizeof will never return -1 */ +#endif + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "q" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define cmpxchg(ptr, old, new) \ + __cmpxchg(ptr, old, new, sizeof(*(ptr))) + + +#endif /* TOOLS_ASM_X86_CMPXCHG_H */ diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 8672f835ae4e..e2c52190cf28 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -12,6 +12,7 @@ tools/arch/sparc/include/asm/barrier_32.h tools/arch/sparc/include/asm/barrier_64.h tools/arch/tile/include/asm/barrier.h tools/arch/x86/include/asm/barrier.h +tools/arch/x86/include/asm/cmpxchg.h tools/arch/x86/include/asm/cpufeatures.h tools/arch/x86/include/asm/disabled-features.h tools/arch/x86/include/asm/required-features.h diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 621578aa12d6..fc74db62fef4 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -43,6 +43,15 @@ ifneq ($(CC), clang) EXTRA_WARNINGS += -Wstrict-aliasing=3 endif +# Hack to avoid type-punned warnings on old systems such as RHEL5: +# We should be changing CFLAGS and checking gcc version, but this +# will do for now and keep the above -Wstrict-aliasing=3 in place +# in newer systems. +# Needed for the __raw_cmpxchg in tools/arch/x86/include/asm/cmpxchg.h +ifneq ($(filter 3.%,$(MAKE_VERSION)),) # make-3 +EXTRA_WARNINGS += -fno-strict-aliasing +endif + ifneq ($(findstring $(MAKEFLAGS), w),w) PRINT_DIR = --no-print-directory else From 8a73615df3b8973df2de1455c00e9169522d8257 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Feb 2017 16:55:43 -0300 Subject: [PATCH 07/37] tools arch x86: Introduce atomic_cmpxchg() Will be used by atomic_cmpxchg_relaxed(), in turn used by refcount.h. Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-kdmovd3l4gw5b1w31ypr6ddv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/atomic.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h index 059e33e94260..328eeceec709 100644 --- a/tools/arch/x86/include/asm/atomic.h +++ b/tools/arch/x86/include/asm/atomic.h @@ -7,6 +7,8 @@ #define LOCK_PREFIX "\n\tlock; " +#include + /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. @@ -62,4 +64,9 @@ static inline int atomic_dec_and_test(atomic_t *v) GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + #endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */ From 2bcdeadbc094b4f6511aedea1e5b8052bf0cc89c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Feb 2017 16:57:53 -0300 Subject: [PATCH 08/37] tools include: Introduce atomic_cmpxchg_{relaxed,release}() Will be used by refcnt.h Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jszriruqfqpez1bkivwfj6qb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/atomic.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/include/linux/atomic.h b/tools/include/linux/atomic.h index 4e3d3d18ebab..9f21fc2b092b 100644 --- a/tools/include/linux/atomic.h +++ b/tools/include/linux/atomic.h @@ -3,4 +3,10 @@ #include +/* atomic_cmpxchg_relaxed */ +#ifndef atomic_cmpxchg_relaxed +#define atomic_cmpxchg_relaxed atomic_cmpxchg +#define atomic_cmpxchg_release atomic_cmpxchg +#endif /* atomic_cmpxchg_relaxed */ + #endif /* __TOOLS_LINUX_ATOMIC_H */ From ed4aad50ea0384737034b39f952f29cfb2da52ac Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 23 Feb 2017 15:33:02 -0300 Subject: [PATCH 09/37] tools include: Provide gcc based cmpxchg fallback for !x86 We've been using an atomic_t implementation subset based on the gcc builtin functions for a while, now, with refcount.h we need cmpxchg(), use gcc's __sync_val_compare_and_swap() for that. Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-b9zovyxgpa0c4vi3nm0kjo97@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/asm-generic/atomic-gcc.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h index 2ba78c9f5701..5e9738f97bf3 100644 --- a/tools/include/asm-generic/atomic-gcc.h +++ b/tools/include/asm-generic/atomic-gcc.h @@ -60,4 +60,12 @@ static inline int atomic_dec_and_test(atomic_t *v) return __sync_sub_and_fetch(&v->counter, 1) == 0; } +#define cmpxchg(ptr, oldval, newval) \ + __sync_val_compare_and_swap(ptr, oldval, newval) + +static inline int atomic_cmpxchg(atomic_t *v, int oldval, int newval) +{ + return cmpxchg(&(v)->counter, oldval, newval); +} + #endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */ From eaa75b5117d52adf1efd3c6c3fb4bd8f97de648b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Feb 2017 17:42:40 -0300 Subject: [PATCH 10/37] tools include: Add UINT_MAX def to kernel.h The kernel has it and some files we got from there would require us including the userland header for that, so add it conditionally. Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-gmwyal7c9vzzttlyk6u59rzn@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/kernel.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 28607db02bd3..adb4d0147755 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -5,6 +5,10 @@ #include #include +#ifndef UINT_MAX +#define UINT_MAX (~0U) +#endif + #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) From 73a9bf95ed1c05698ecabe2f28c47aedfa61b52b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Feb 2017 17:00:53 -0300 Subject: [PATCH 11/37] tools include: Adopt kernel's refcount.h To aid in catching bugs when using atomics as a reference count. This is a trimmed down version with just what is used by tools/ at this point. After this, the patches submitted by Elena for tools/ doing the conversion from atomic_ to recount_ methods can be applied and tested. To activate it, buint perf with: make DEBUG=1 -C tools/perf Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-dqtxsumns9ov0l9r5x398f19@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/refcount.h | 151 +++++++++++++++++++++++++++++++++ tools/perf/MANIFEST | 1 + 2 files changed, 152 insertions(+) create mode 100644 tools/include/linux/refcount.h diff --git a/tools/include/linux/refcount.h b/tools/include/linux/refcount.h new file mode 100644 index 000000000000..a0177c1f55b1 --- /dev/null +++ b/tools/include/linux/refcount.h @@ -0,0 +1,151 @@ +#ifndef _TOOLS_LINUX_REFCOUNT_H +#define _TOOLS_LINUX_REFCOUNT_H + +/* + * Variant of atomic_t specialized for reference counts. + * + * The interface matches the atomic_t interface (to aid in porting) but only + * provides the few functions one should use for reference counting. + * + * It differs in that the counter saturates at UINT_MAX and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free issues. + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object we're increasing the + * reference count on will provide the ordering. For locked data structures, + * its the lock acquire, for RCU/lockless data structures its the dependent + * load. + * + * Do note that inc_not_zero() provides a control dependency which will order + * future stores against the inc, this ensures we'll never modify the object + * if we did not in fact acquire a reference. + * + * The decrements will provide release order, such that all the prior loads and + * stores will be issued before, it also provides a control dependency, which + * will order us against the subsequent free(). + * + * The control dependency is against the load of the cmpxchg (ll/sc) that + * succeeded. This means the stores aren't fully ordered, but this is fine + * because the 1->0 transition indicates no concurrency. + * + * Note that the allocator is responsible for ordering things between free() + * and alloc(). + * + */ + +#include +#include + +#ifdef NDEBUG +#define REFCOUNT_WARN(cond, str) (void)(cond) +#define __refcount_check +#else +#define REFCOUNT_WARN(cond, str) BUG_ON(cond) +#define __refcount_check __must_check +#endif + +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } + +static inline void refcount_set(refcount_t *r, unsigned int n) +{ + atomic_set(&r->refs, n); +} + +static inline unsigned int refcount_read(const refcount_t *r) +{ + return atomic_read(&r->refs); +} + +/* + * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + */ +static inline __refcount_check +bool refcount_inc_not_zero(refcount_t *r) +{ + unsigned int old, new, val = atomic_read(&r->refs); + + for (;;) { + new = val + 1; + + if (!val) + return false; + + if (unlikely(!new)) + return true; + + old = atomic_cmpxchg_relaxed(&r->refs, val, new); + if (old == val) + break; + + val = old; + } + + REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); + + return true; +} + +/* + * Similar to atomic_inc(), will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller already has a + * reference on the object, will WARN when this is not so. + */ +static inline void refcount_inc(refcount_t *r) +{ + REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); +} + +/* + * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to + * decrement when saturated at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + */ +static inline __refcount_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + unsigned int old, new, val = atomic_read(&r->refs); + + for (;;) { + if (unlikely(val == UINT_MAX)) + return false; + + new = val - i; + if (new > val) { + REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n"); + return false; + } + + old = atomic_cmpxchg_release(&r->refs, val, new); + if (old == val) + break; + + val = old; + } + + return !new; +} + +static inline __refcount_check +bool refcount_dec_and_test(refcount_t *r) +{ + return refcount_sub_and_test(1, r); +} + + +#endif /* _ATOMIC_LINUX_REFCOUNT_H */ diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index e2c52190cf28..28648c09dcd6 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -79,6 +79,7 @@ tools/include/uapi/linux/perf_event.h tools/include/linux/poison.h tools/include/linux/rbtree.h tools/include/linux/rbtree_augmented.h +tools/include/linux/refcount.h tools/include/linux/string.h tools/include/linux/stringify.h tools/include/linux/types.h From 79c5fe6db8c70558d3a64959f55596d137ccc6e6 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:34:55 +0200 Subject: [PATCH 12/37] perf cgroup: Convert cgroup_sel.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: alsa-devel@alsa-project.org Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1487691303-31858-2-git-send-email-elena.reshetova@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cgroup.c | 6 +++--- tools/perf/util/cgroup.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index eafbf11442b2..86399eda3684 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -127,19 +127,19 @@ static int add_cgroup(struct perf_evlist *evlist, char *str) goto found; n++; } - if (atomic_read(&cgrp->refcnt) == 0) + if (refcount_read(&cgrp->refcnt) == 0) free(cgrp); return -1; found: - atomic_inc(&cgrp->refcnt); + refcount_inc(&cgrp->refcnt); counter->cgrp = cgrp; return 0; } void close_cgroup(struct cgroup_sel *cgrp) { - if (cgrp && atomic_dec_and_test(&cgrp->refcnt)) { + if (cgrp && refcount_dec_and_test(&cgrp->refcnt)) { close(cgrp->fd); zfree(&cgrp->name); free(cgrp); diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h index 31f8dcdbd7ef..d91966b97cbd 100644 --- a/tools/perf/util/cgroup.h +++ b/tools/perf/util/cgroup.h @@ -1,14 +1,14 @@ #ifndef __CGROUP_H__ #define __CGROUP_H__ -#include +#include struct option; struct cgroup_sel { char *name; int fd; - atomic_t refcnt; + refcount_t refcnt; }; From ec09a42a6dbd2afde9b8fd4bb8f98bbd94ca904c Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:34:56 +0200 Subject: [PATCH 13/37] perf cpumap: Convert cpu_map.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-3-git-send-email-elena.reshetova@intel.com [ fixed mixed conversion to refcount in tests/cpumap.c ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/cpumap.c | 2 +- tools/perf/util/cpumap.c | 16 ++++++++-------- tools/perf/util/cpumap.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index f168a85992d0..4478773cdb97 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -66,7 +66,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong nr", map->nr == 2); TEST_ASSERT_VAL("wrong cpu", map->map[0] == 1); TEST_ASSERT_VAL("wrong cpu", map->map[1] == 256); - TEST_ASSERT_VAL("wrong refcnt", atomic_read(&map->refcnt) == 1); + TEST_ASSERT_VAL("wrong refcnt", refcount_read(&map->refcnt) == 1); cpu_map__put(map); return 0; } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 8c7504939113..39ad2caccf56 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -29,7 +29,7 @@ static struct cpu_map *cpu_map__default_new(void) cpus->map[i] = i; cpus->nr = nr_cpus; - atomic_set(&cpus->refcnt, 1); + refcount_set(&cpus->refcnt, 1); } return cpus; @@ -43,7 +43,7 @@ static struct cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus) if (cpus != NULL) { cpus->nr = nr_cpus; memcpy(cpus->map, tmp_cpus, payload_size); - atomic_set(&cpus->refcnt, 1); + refcount_set(&cpus->refcnt, 1); } return cpus; @@ -252,7 +252,7 @@ struct cpu_map *cpu_map__dummy_new(void) if (cpus != NULL) { cpus->nr = 1; cpus->map[0] = -1; - atomic_set(&cpus->refcnt, 1); + refcount_set(&cpus->refcnt, 1); } return cpus; @@ -269,7 +269,7 @@ struct cpu_map *cpu_map__empty_new(int nr) for (i = 0; i < nr; i++) cpus->map[i] = -1; - atomic_set(&cpus->refcnt, 1); + refcount_set(&cpus->refcnt, 1); } return cpus; @@ -278,7 +278,7 @@ struct cpu_map *cpu_map__empty_new(int nr) static void cpu_map__delete(struct cpu_map *map) { if (map) { - WARN_ONCE(atomic_read(&map->refcnt) != 0, + WARN_ONCE(refcount_read(&map->refcnt) != 0, "cpu_map refcnt unbalanced\n"); free(map); } @@ -287,13 +287,13 @@ static void cpu_map__delete(struct cpu_map *map) struct cpu_map *cpu_map__get(struct cpu_map *map) { if (map) - atomic_inc(&map->refcnt); + refcount_inc(&map->refcnt); return map; } void cpu_map__put(struct cpu_map *map) { - if (map && atomic_dec_and_test(&map->refcnt)) + if (map && refcount_dec_and_test(&map->refcnt)) cpu_map__delete(map); } @@ -357,7 +357,7 @@ int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, /* ensure we process id in increasing order */ qsort(c->map, c->nr, sizeof(int), cmp_ids); - atomic_set(&c->refcnt, 1); + refcount_set(&c->refcnt, 1); *res = c; return 0; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 1a0549af8f5c..e84491636c1b 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -3,13 +3,13 @@ #include #include -#include +#include #include "perf.h" #include "util/debug.h" struct cpu_map { - atomic_t refcnt; + refcount_t refcnt; int nr; int map[]; }; From 6df74bc08bc2201f65fb0e81cd5feb787575f7ce Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:34:57 +0200 Subject: [PATCH 14/37] perf comm: Convert comm_str.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-4-git-send-email-elena.reshetova@intel.com [ Reinstated comm_str__get() function, needed when reusing entries in the rbtree ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/comm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index 21b7ff382c3f..32837b6f7879 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -2,12 +2,12 @@ #include "util.h" #include #include -#include +#include struct comm_str { char *str; struct rb_node rb_node; - atomic_t refcnt; + refcount_t refcnt; }; /* Should perhaps be moved to struct machine */ @@ -16,13 +16,13 @@ static struct rb_root comm_str_root; static struct comm_str *comm_str__get(struct comm_str *cs) { if (cs) - atomic_inc(&cs->refcnt); + refcount_inc(&cs->refcnt); return cs; } static void comm_str__put(struct comm_str *cs) { - if (cs && atomic_dec_and_test(&cs->refcnt)) { + if (cs && refcount_dec_and_test(&cs->refcnt)) { rb_erase(&cs->rb_node, &comm_str_root); zfree(&cs->str); free(cs); @@ -43,7 +43,7 @@ static struct comm_str *comm_str__alloc(const char *str) return NULL; } - atomic_set(&cs->refcnt, 0); + refcount_set(&cs->refcnt, 1); return cs; } @@ -61,7 +61,7 @@ static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root) cmp = strcmp(str, iter->str); if (!cmp) - return iter; + return comm_str__get(iter); if (cmp < 0) p = &(*p)->rb_left; @@ -95,8 +95,6 @@ struct comm *comm__new(const char *str, u64 timestamp, bool exec) return NULL; } - comm_str__get(comm->comm_str); - return comm; } @@ -108,7 +106,6 @@ int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec) if (!new) return -ENOMEM; - comm_str__get(new); comm_str__put(old); comm->comm_str = new; comm->start = timestamp; From 7100810a75b9854f1b05550b54500497c5914d4b Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:34:58 +0200 Subject: [PATCH 15/37] perf dso: Convert dso.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-5-git-send-email-elena.reshetova@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dso.c | 6 +++--- tools/perf/util/dso.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index d38b62a700ca..42db00d78573 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1109,7 +1109,7 @@ struct dso *dso__new(const char *name) INIT_LIST_HEAD(&dso->node); INIT_LIST_HEAD(&dso->data.open_entry); pthread_mutex_init(&dso->lock, NULL); - atomic_set(&dso->refcnt, 1); + refcount_set(&dso->refcnt, 1); } return dso; @@ -1147,13 +1147,13 @@ void dso__delete(struct dso *dso) struct dso *dso__get(struct dso *dso) { if (dso) - atomic_inc(&dso->refcnt); + refcount_inc(&dso->refcnt); return dso; } void dso__put(struct dso *dso) { - if (dso && atomic_dec_and_test(&dso->refcnt)) + if (dso && refcount_dec_and_test(&dso->refcnt)) dso__delete(dso); } diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index ecc4bbd3f82e..12350b171727 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -1,7 +1,7 @@ #ifndef __PERF_DSO #define __PERF_DSO -#include +#include #include #include #include @@ -187,7 +187,7 @@ struct dso { void *priv; u64 db_id; }; - atomic_t refcnt; + refcount_t refcnt; char name[0]; }; From e3a42cdd3e35d6c2181d5acfa191eb448aea6ace Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:34:59 +0200 Subject: [PATCH 16/37] perf map: Convert map.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-6-git-send-email-elena.reshetova@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 6 +++--- tools/perf/util/map.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 0a943e7b1ea7..f0e2428efd0b 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -141,7 +141,7 @@ void map__init(struct map *map, enum map_type type, RB_CLEAR_NODE(&map->rb_node); map->groups = NULL; map->erange_warned = false; - atomic_set(&map->refcnt, 1); + refcount_set(&map->refcnt, 1); } struct map *map__new(struct machine *machine, u64 start, u64 len, @@ -255,7 +255,7 @@ void map__delete(struct map *map) void map__put(struct map *map) { - if (map && atomic_dec_and_test(&map->refcnt)) + if (map && refcount_dec_and_test(&map->refcnt)) map__delete(map); } @@ -354,7 +354,7 @@ struct map *map__clone(struct map *from) struct map *map = memdup(from, sizeof(*map)); if (map != NULL) { - atomic_set(&map->refcnt, 1); + refcount_set(&map->refcnt, 1); RB_CLEAR_NODE(&map->rb_node); dso__get(map->dso); map->groups = NULL; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index abdacf800c98..9545ff343ec5 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -1,7 +1,7 @@ #ifndef __PERF_MAP_H #define __PERF_MAP_H -#include +#include #include #include #include @@ -51,7 +51,7 @@ struct map { struct dso *dso; struct map_groups *groups; - atomic_t refcnt; + refcount_t refcnt; }; struct kmap { @@ -150,7 +150,7 @@ struct map *map__clone(struct map *map); static inline struct map *map__get(struct map *map) { if (map) - atomic_inc(&map->refcnt); + refcount_inc(&map->refcnt); return map; } From ead05e8f3fffe2860f0f8d1c23d74c526e9f2a3c Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:35:00 +0200 Subject: [PATCH 17/37] perf map: Convert map_groups.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-7-git-send-email-elena.reshetova@intel.com [ Did the missing conversion of tests/thread-mg-share.c too ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/thread-mg-share.c | 12 ++++++------ tools/perf/util/map.c | 4 ++-- tools/perf/util/map.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/perf/tests/thread-mg-share.c b/tools/perf/tests/thread-mg-share.c index 188b63140fc8..76686dd6f5ec 100644 --- a/tools/perf/tests/thread-mg-share.c +++ b/tools/perf/tests/thread-mg-share.c @@ -43,7 +43,7 @@ int test__thread_mg_share(int subtest __maybe_unused) leader && t1 && t2 && t3 && other); mg = leader->mg; - TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 4); + TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 4); /* test the map groups pointer is shared */ TEST_ASSERT_VAL("map groups don't match", mg == t1->mg); @@ -71,25 +71,25 @@ int test__thread_mg_share(int subtest __maybe_unused) machine__remove_thread(machine, other_leader); other_mg = other->mg; - TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 2); + TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&other_mg->refcnt), 2); TEST_ASSERT_VAL("map groups don't match", other_mg == other_leader->mg); /* release thread group */ thread__put(leader); - TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 3); + TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 3); thread__put(t1); - TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 2); + TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 2); thread__put(t2); - TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 1); + TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 1); thread__put(t3); /* release other group */ thread__put(other_leader); - TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 1); + TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&other_mg->refcnt), 1); thread__put(other); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index f0e2428efd0b..1d9ebcf9e38e 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -485,7 +485,7 @@ void map_groups__init(struct map_groups *mg, struct machine *machine) maps__init(&mg->maps[i]); } mg->machine = machine; - atomic_set(&mg->refcnt, 1); + refcount_set(&mg->refcnt, 1); } static void __maps__purge(struct maps *maps) @@ -547,7 +547,7 @@ void map_groups__delete(struct map_groups *mg) void map_groups__put(struct map_groups *mg) { - if (mg && atomic_dec_and_test(&mg->refcnt)) + if (mg && refcount_dec_and_test(&mg->refcnt)) map_groups__delete(mg); } diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 9545ff343ec5..c8a5a644c0a9 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -67,7 +67,7 @@ struct maps { struct map_groups { struct maps maps[MAP__NR_TYPES]; struct machine *machine; - atomic_t refcnt; + refcount_t refcnt; }; struct map_groups *map_groups__new(struct machine *machine); @@ -77,7 +77,7 @@ bool map_groups__empty(struct map_groups *mg); static inline struct map_groups *map_groups__get(struct map_groups *mg) { if (mg) - atomic_inc(&mg->refcnt); + refcount_inc(&mg->refcnt); return mg; } From 25a3720cf45779900246ec17e238fbb674ce4e67 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:35:01 +0200 Subject: [PATCH 18/37] perf evlist: Convert perf_map.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-8-git-send-email-elena.reshetova@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 18 +++++++++--------- tools/perf/util/evlist.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index b601f2814a30..564b924fb48a 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -777,7 +777,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *md, bool check_messu /* * Check if event was unmapped due to a POLLHUP/POLLERR. */ - if (!atomic_read(&md->refcnt)) + if (!refcount_read(&md->refcnt)) return NULL; head = perf_mmap__read_head(md); @@ -794,7 +794,7 @@ perf_mmap__read_backward(struct perf_mmap *md) /* * Check if event was unmapped due to a POLLHUP/POLLERR. */ - if (!atomic_read(&md->refcnt)) + if (!refcount_read(&md->refcnt)) return NULL; head = perf_mmap__read_head(md); @@ -856,7 +856,7 @@ void perf_mmap__read_catchup(struct perf_mmap *md) { u64 head; - if (!atomic_read(&md->refcnt)) + if (!refcount_read(&md->refcnt)) return; head = perf_mmap__read_head(md); @@ -875,14 +875,14 @@ static bool perf_mmap__empty(struct perf_mmap *md) static void perf_mmap__get(struct perf_mmap *map) { - atomic_inc(&map->refcnt); + refcount_inc(&map->refcnt); } static void perf_mmap__put(struct perf_mmap *md) { - BUG_ON(md->base && atomic_read(&md->refcnt) == 0); + BUG_ON(md->base && refcount_read(&md->refcnt) == 0); - if (atomic_dec_and_test(&md->refcnt)) + if (refcount_dec_and_test(&md->refcnt)) perf_mmap__munmap(md); } @@ -894,7 +894,7 @@ void perf_mmap__consume(struct perf_mmap *md, bool overwrite) perf_mmap__write_tail(md, old); } - if (atomic_read(&md->refcnt) == 1 && perf_mmap__empty(md)) + if (refcount_read(&md->refcnt) == 1 && perf_mmap__empty(md)) perf_mmap__put(md); } @@ -937,7 +937,7 @@ static void perf_mmap__munmap(struct perf_mmap *map) munmap(map->base, perf_mmap__mmap_len(map)); map->base = NULL; map->fd = -1; - atomic_set(&map->refcnt, 0); + refcount_set(&map->refcnt, 0); } auxtrace_mmap__munmap(&map->auxtrace_mmap); } @@ -1001,7 +1001,7 @@ static int perf_mmap__mmap(struct perf_mmap *map, * evlist layer can't just drop it when filtering events in * perf_evlist__filter_pollfd(). */ - atomic_set(&map->refcnt, 2); + refcount_set(&map->refcnt, 2); map->prev = 0; map->mask = mp->mask; map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 389b9ccdf8c7..39942995f537 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -1,7 +1,7 @@ #ifndef __PERF_EVLIST_H #define __PERF_EVLIST_H 1 -#include +#include #include #include #include @@ -29,7 +29,7 @@ struct perf_mmap { void *base; int mask; int fd; - atomic_t refcnt; + refcount_t refcnt; u64 prev; struct auxtrace_mmap auxtrace_mmap; char event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8))); From e34f5b11cd51fbe723e481c1db03a77260be6f4c Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:35:02 +0200 Subject: [PATCH 19/37] perf thread: convert thread.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-9-git-send-email-elena.reshetova@intel.com [ Did missing conversion in __machine__remove_thread() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 2 +- tools/perf/util/thread.c | 6 +++--- tools/perf/util/thread.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 71c9720d4973..b9974fe41bc1 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1439,7 +1439,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, if (machine->last_match == th) machine->last_match = NULL; - BUG_ON(atomic_read(&th->refcnt) == 0); + BUG_ON(refcount_read(&th->refcnt) == 0); if (lock) pthread_rwlock_wrlock(&machine->threads_lock); rb_erase_init(&th->rb_node, &machine->threads); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index f5af87f66663..74e79d26b421 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -53,7 +53,7 @@ struct thread *thread__new(pid_t pid, pid_t tid) goto err_thread; list_add(&comm->list, &thread->comm_list); - atomic_set(&thread->refcnt, 1); + refcount_set(&thread->refcnt, 1); RB_CLEAR_NODE(&thread->rb_node); } @@ -88,13 +88,13 @@ void thread__delete(struct thread *thread) struct thread *thread__get(struct thread *thread) { if (thread) - atomic_inc(&thread->refcnt); + refcount_inc(&thread->refcnt); return thread; } void thread__put(struct thread *thread) { - if (thread && atomic_dec_and_test(&thread->refcnt)) { + if (thread && refcount_dec_and_test(&thread->refcnt)) { /* * Remove it from the dead_threads list, as last reference * is gone. diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 99263cb6e6b6..e57188546465 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -1,7 +1,7 @@ #ifndef __PERF_THREAD_H #define __PERF_THREAD_H -#include +#include #include #include #include @@ -23,7 +23,7 @@ struct thread { pid_t tid; pid_t ppid; int cpu; - atomic_t refcnt; + refcount_t refcnt; char shortname[3]; bool comm_set; int comm_len; From 364fed351369e0193244fa2c78df855724cdddb9 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Tue, 21 Feb 2017 17:35:03 +0200 Subject: [PATCH 20/37] perf thread_map: Convert thread_map.refcnt from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: David Windsor Signed-off-by: Hans Liljestrand Signed-off-by: Kees Kook Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrew Morton Cc: David Windsor Cc: Greg Kroah-Hartman Cc: Hans Liljestrand Cc: Jiri Olsa Cc: Kees Kook Cc: Mark Rutland Cc: Matija Glavinic Pecotic Cc: Peter Zijlstra Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/1487691303-31858-10-git-send-email-elena.reshetova@intel.com [ Did missing tests/thread-map.c conversion ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/thread-map.c | 6 +++--- tools/perf/util/thread_map.c | 20 ++++++++++---------- tools/perf/util/thread_map.h | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index f2d2e542d0ee..a63d6945807b 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -29,7 +29,7 @@ int test__thread_map(int subtest __maybe_unused) thread_map__comm(map, 0) && !strcmp(thread_map__comm(map, 0), NAME)); TEST_ASSERT_VAL("wrong refcnt", - atomic_read(&map->refcnt) == 1); + refcount_read(&map->refcnt) == 1); thread_map__put(map); /* test dummy pid */ @@ -44,7 +44,7 @@ int test__thread_map(int subtest __maybe_unused) thread_map__comm(map, 0) && !strcmp(thread_map__comm(map, 0), "dummy")); TEST_ASSERT_VAL("wrong refcnt", - atomic_read(&map->refcnt) == 1); + refcount_read(&map->refcnt) == 1); thread_map__put(map); return 0; } @@ -71,7 +71,7 @@ static int process_event(struct perf_tool *tool __maybe_unused, thread_map__comm(threads, 0) && !strcmp(thread_map__comm(threads, 0), NAME)); TEST_ASSERT_VAL("wrong refcnt", - atomic_read(&threads->refcnt) == 1); + refcount_read(&threads->refcnt) == 1); thread_map__put(threads); return 0; } diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 7c3fcc538a70..9026408ea55b 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -66,7 +66,7 @@ struct thread_map *thread_map__new_by_pid(pid_t pid) for (i = 0; i < items; i++) thread_map__set_pid(threads, i, atoi(namelist[i]->d_name)); threads->nr = items; - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); } for (i=0; inr = 1; - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); } return threads; @@ -105,7 +105,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) goto out_free_threads; threads->nr = 0; - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); while ((dirent = readdir(proc)) != NULL) { char *end; @@ -235,7 +235,7 @@ static struct thread_map *thread_map__new_by_pid_str(const char *pid_str) out: strlist__delete(slist); if (threads) - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); return threads; out_free_namelist: @@ -255,7 +255,7 @@ struct thread_map *thread_map__new_dummy(void) if (threads != NULL) { thread_map__set_pid(threads, 0, -1); threads->nr = 1; - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); } return threads; } @@ -300,7 +300,7 @@ struct thread_map *thread_map__new_by_tid_str(const char *tid_str) } out: if (threads) - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); return threads; out_free_threads: @@ -326,7 +326,7 @@ static void thread_map__delete(struct thread_map *threads) if (threads) { int i; - WARN_ONCE(atomic_read(&threads->refcnt) != 0, + WARN_ONCE(refcount_read(&threads->refcnt) != 0, "thread map refcnt unbalanced\n"); for (i = 0; i < threads->nr; i++) free(thread_map__comm(threads, i)); @@ -337,13 +337,13 @@ static void thread_map__delete(struct thread_map *threads) struct thread_map *thread_map__get(struct thread_map *map) { if (map) - atomic_inc(&map->refcnt); + refcount_inc(&map->refcnt); return map; } void thread_map__put(struct thread_map *map) { - if (map && atomic_dec_and_test(&map->refcnt)) + if (map && refcount_dec_and_test(&map->refcnt)) thread_map__delete(map); } @@ -423,7 +423,7 @@ static void thread_map__copy_event(struct thread_map *threads, threads->map[i].comm = strndup(event->entries[i].comm, 16); } - atomic_set(&threads->refcnt, 1); + refcount_set(&threads->refcnt, 1); } struct thread_map *thread_map__new_event(struct thread_map_event *event) diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h index ea0ef08c6303..bd34d7a0b9fa 100644 --- a/tools/perf/util/thread_map.h +++ b/tools/perf/util/thread_map.h @@ -3,7 +3,7 @@ #include #include -#include +#include struct thread_map_data { pid_t pid; @@ -11,7 +11,7 @@ struct thread_map_data { }; struct thread_map { - atomic_t refcnt; + refcount_t refcnt; int nr; struct thread_map_data map[]; }; From 4738ca30b4a7a113084d7863846175094f95c62f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 23 Feb 2017 13:24:34 -0300 Subject: [PATCH 21/37] perf evlist: Clarify a bit the use of perf_mmap->refcnt This is an odd refcount use case, so add some more comments to help understand that when it hits zero it really means that the mmap()ed area (on a perf_event_open() returned fd) has been munmap()ed. Cc: Adrian Hunter Cc: David Ahern Cc: Elena Reshetova Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/20170223162344.GD3595@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 564b924fb48a..50420cd35446 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -974,8 +974,19 @@ static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist) if (!map) return NULL; - for (i = 0; i < evlist->nr_mmaps; i++) + for (i = 0; i < evlist->nr_mmaps; i++) { map[i].fd = -1; + /* + * When the perf_mmap() call is made we grab one refcount, plus + * one extra to let perf_evlist__mmap_consume() get the last + * events after all real references (perf_mmap__get()) are + * dropped. + * + * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and + * thus does perf_mmap__get() on it. + */ + refcount_set(&map[i].refcnt, 0); + } return map; } From 7768f8dada66d6052fccbc2ddc375f3e650455b9 Mon Sep 17 00:00:00 2001 From: Charles Baylis Date: Fri, 24 Feb 2017 13:32:56 +0000 Subject: [PATCH 22/37] perf tools: Allow sorting by symbol size Add new sort key 'symbol_size' to allow user to sort by symbol size, or (more usefully) display the symbol size using --fields=...,symbol_size. Committer note: Testing it together with the recently added -q, to remove the headers, and using the '+' sign with -s, to add the symbol_size sort order to the default, which is '-s/--sort comm,dso,symbol': # perf report -q -s +symbol_size | head -10 10.39% swapper [kernel.vmlinux] [k] intel_idle 270 3.45% swapper [kernel.vmlinux] [k] update_blocked_averages 1546 2.61% swapper [kernel.vmlinux] [k] update_load_avg 1292 2.36% swapper [kernel.vmlinux] [k] update_cfs_shares 240 1.83% swapper [kernel.vmlinux] [k] __hrtimer_run_queues 606 1.74% swapper [kernel.vmlinux] [k] update_cfs_rq_load_avg. 1187 1.66% swapper [kernel.vmlinux] [k] apic_timer_interrupt 152 1.60% CPU 0/KVM [kvm] [k] kvm_set_msr_common 3046 1.60% gnome-shell libglib-2.0.so.0 [.] g_slist_find 37 1.46% gnome-termina libglib-2.0.so.0 [.] g_hash_table_lookup 370 # Signed-off-by: Charles Baylis Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Maxim Kuvyrkov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1487943176-13840-1-git-send-email-charles.baylis@linaro.org [ Use symbol__size(), remove needless %lld + (long long) casting ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 41 ++++++++++++++++++++++++ tools/perf/util/sort.h | 1 + 4 files changed, 44 insertions(+) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index c04cc0647c16..33f91906f5dc 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -80,6 +80,7 @@ OPTIONS - pid: command and tid of the task - dso: name of library or module executed at the time of sample - symbol: name of function executed at the time of sample + - symbol_size: size of function executed at the time of sample - parent: name of function matched to the parent regex filter. Unmatched entries are displayed as "[other]". - cpu: cpu number the task ran at the time of sample diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 28c216e3d5b7..2e839bf40bdd 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -57,6 +57,7 @@ enum hist_column { HISTC_SRCLINE_FROM, HISTC_SRCLINE_TO, HISTC_TRACE, + HISTC_SYM_SIZE, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 0ff622288d24..f8f16c0e20b6 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1396,6 +1396,46 @@ struct sort_entry sort_transaction = { .se_width_idx = HISTC_TRANSACTION, }; +/* --sort symbol_size */ + +static int64_t _sort__sym_size_cmp(struct symbol *sym_l, struct symbol *sym_r) +{ + int64_t size_l = sym_l != NULL ? symbol__size(sym_l) : 0; + int64_t size_r = sym_r != NULL ? symbol__size(sym_r) : 0; + + return size_l < size_r ? -1 : + size_l == size_r ? 0 : 1; +} + +static int64_t +sort__sym_size_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return _sort__sym_size_cmp(right->ms.sym, left->ms.sym); +} + +static int _hist_entry__sym_size_snprintf(struct symbol *sym, char *bf, + size_t bf_size, unsigned int width) +{ + if (sym) + return repsep_snprintf(bf, bf_size, "%*d", width, symbol__size(sym)); + + return repsep_snprintf(bf, bf_size, "%*s", width, "unknown"); +} + +static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + return _hist_entry__sym_size_snprintf(he->ms.sym, bf, size, width); +} + +struct sort_entry sort_sym_size = { + .se_header = "Symbol size", + .se_cmp = sort__sym_size_cmp, + .se_snprintf = hist_entry__sym_size_snprintf, + .se_width_idx = HISTC_SYM_SIZE, +}; + + struct sort_dimension { const char *name; struct sort_entry *entry; @@ -1418,6 +1458,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight), DIM(SORT_TRANSACTION, "transaction", sort_transaction), DIM(SORT_TRACE, "trace", sort_trace), + DIM(SORT_SYM_SIZE, "symbol_size", sort_sym_size), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 796c847e2f00..f583325a3743 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -211,6 +211,7 @@ enum sort_type { SORT_GLOBAL_WEIGHT, SORT_TRANSACTION, SORT_TRACE, + SORT_SYM_SIZE, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, From a9af6be5bc25214f7870fef2b6d3490fe8b87bf7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 24 Feb 2017 10:12:48 +0900 Subject: [PATCH 23/37] perf ftrace: Add support for --pid option The -p (--pid) option enables to trace existing process by its pid. Committer notes: Testing it: Using the function_graph tracer on a process that is just waiting for user input and thus will make 'perf ftrace' sit there waiting for that, then press any key on that mutt session and see what happens: # perf ftrace -t function_graph -p `pidof mutt` | head -40 2) 1.038 us | switch_mm_irqs_off(); ------------------------------------------ 2) -0 => mutt-3595 ------------------------------------------ 2) | finish_task_switch() { 2) | smp_irq_work_interrupt() { 2) | irq_enter() { 2) 0.180 us | rcu_irq_enter(); 2) 1.248 us | } 2) | __wake_up() { 2) 0.126 us | _raw_spin_lock_irqsave(); 2) | __wake_up_common() { 2) | pollwake() { 2) | default_wake_function() { 2) | try_to_wake_up() { 2) 0.662 us | _raw_spin_lock_irqsave(); 2) | select_task_rq_fair() { 2) 1.719 us | effective_load.isra.41(); 2) 1.343 us | effective_load.isra.41(); 2) | select_idle_sibling() { 2) 0.331 us | idle_cpu(); 2) 1.458 us | } 2) 8.350 us | } 2) 0.200 us | _raw_spin_lock(); 2) | ttwu_do_activate() { 2) | activate_task() { 2) 0.136 us | update_rq_clock.part.77(); 2) | enqueue_task_fair() { 2) | enqueue_entity() { 2) 0.146 us | update_curr(); 2) 0.330 us | account_entity_enqueue(); 2) 0.280 us | update_cfs_shares(); 2) 0.321 us | place_entity(); 2) 0.206 us | __enqueue_entity(); 2) 6.926 us | } 2) | enqueue_entity() { 2) 0.105 us | update_curr(); 2) 0.175 us | account_entity_enqueue(); 2) 0.531 us | update_cfs_shares(); # Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170224011251.14946-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 4 + tools/perf/builtin-ftrace.c | 93 +++++++++++++++++------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 2d96de6132a9..2d39397f3f30 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -30,6 +30,10 @@ OPTIONS --verbose=:: Verbosity level. +-p:: +--pid=:: + Trace on existing process id (comma separated list). + SEE ALSO -------- diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index c3e643666c72..85eee9c444ae 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -11,6 +11,7 @@ #include #include +#include #include "debug.h" #include @@ -50,11 +51,12 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused, done = true; } -static int write_tracing_file(const char *name, const char *val) +static int __write_tracing_file(const char *name, const char *val, bool append) { char *file; int fd, ret = -1; ssize_t size = strlen(val); + int flags = O_WRONLY; file = get_tracing_file(name); if (!file) { @@ -62,7 +64,12 @@ static int write_tracing_file(const char *name, const char *val) return -1; } - fd = open(file, O_WRONLY); + if (append) + flags |= O_APPEND; + else + flags |= O_TRUNC; + + fd = open(file, flags); if (fd < 0) { pr_debug("cannot open tracing file: %s\n", name); goto out; @@ -79,6 +86,16 @@ out: return ret; } +static int write_tracing_file(const char *name, const char *val) +{ + return __write_tracing_file(name, val, false); +} + +static int append_tracing_file(const char *name, const char *val) +{ + return __write_tracing_file(name, val, true); +} + static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) { if (write_tracing_file("tracing_on", "0") < 0) @@ -93,11 +110,27 @@ static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) return 0; } +static int set_tracing_pid(struct perf_ftrace *ftrace) +{ + int i; + char buf[16]; + + if (target__has_cpu(&ftrace->target)) + return 0; + + for (i = 0; i < thread_map__nr(ftrace->evlist->threads); i++) { + scnprintf(buf, sizeof(buf), "%d", + ftrace->evlist->threads->map[i]); + if (append_tracing_file("set_ftrace_pid", buf) < 0) + return -1; + } + return 0; +} + static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) { char *trace_file; int trace_fd; - char *trace_pid; char buf[4096]; struct pollfd pollfd = { .events = POLLIN, @@ -108,42 +141,37 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) return -1; } - if (argc < 1) - return -1; - signal(SIGINT, sig_handler); signal(SIGUSR1, sig_handler); signal(SIGCHLD, sig_handler); - reset_tracing_files(ftrace); + if (reset_tracing_files(ftrace) < 0) + goto out; /* reset ftrace buffer */ if (write_tracing_file("trace", "0") < 0) goto out; - if (perf_evlist__prepare_workload(ftrace->evlist, &ftrace->target, - argv, false, ftrace__workload_exec_failed_signal) < 0) + if (argc && perf_evlist__prepare_workload(ftrace->evlist, + &ftrace->target, argv, false, + ftrace__workload_exec_failed_signal) < 0) { goto out; + } + + if (set_tracing_pid(ftrace) < 0) { + pr_err("failed to set ftrace pid\n"); + goto out_reset; + } if (write_tracing_file("current_tracer", ftrace->tracer) < 0) { pr_err("failed to set current_tracer to %s\n", ftrace->tracer); - goto out; - } - - if (asprintf(&trace_pid, "%d", thread_map__pid(ftrace->evlist->threads, 0)) < 0) { - pr_err("failed to allocate pid string\n"); - goto out; - } - - if (write_tracing_file("set_ftrace_pid", trace_pid) < 0) { - pr_err("failed to set pid: %s\n", trace_pid); - goto out_free_pid; + goto out_reset; } trace_file = get_tracing_file("trace_pipe"); if (!trace_file) { pr_err("failed to open trace_pipe\n"); - goto out_free_pid; + goto out_reset; } trace_fd = open(trace_file, O_RDONLY); @@ -152,7 +180,7 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) if (trace_fd < 0) { pr_err("failed to open trace_pipe\n"); - goto out_free_pid; + goto out_reset; } fcntl(trace_fd, F_SETFL, O_NONBLOCK); @@ -191,11 +219,9 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) out_close_fd: close(trace_fd); -out_free_pid: - free(trace_pid); -out: +out_reset: reset_tracing_files(ftrace); - +out: return done ? 0 : -1; } @@ -227,13 +253,15 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused) .target = { .uid = UINT_MAX, }, }; const char * const ftrace_usage[] = { - "perf ftrace [] ", + "perf ftrace [] []", "perf ftrace [] -- []", NULL }; const struct option ftrace_options[] = { OPT_STRING('t', "tracer", &ftrace.tracer, "tracer", "tracer to use: function_graph(default) or function"), + OPT_STRING('p', "pid", &ftrace.target.pid, "pid", + "trace on existing process id"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_END() @@ -245,9 +273,18 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options(argc, argv, ftrace_options, ftrace_usage, PARSE_OPT_STOP_AT_NON_OPTION); - if (!argc) + if (!argc && target__none(&ftrace.target)) usage_with_options(ftrace_usage, ftrace_options); + ret = target__validate(&ftrace.target); + if (ret) { + char errbuf[512]; + + target__strerror(&ftrace.target, ret, errbuf, 512); + pr_err("%s\n", errbuf); + return -EINVAL; + } + ftrace.evlist = perf_evlist__new(); if (ftrace.evlist == NULL) return -ENOMEM; From 4400ac8a9a900318f8516dc0fb94075cb3fdb50d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 24 Feb 2017 10:12:49 +0900 Subject: [PATCH 24/37] perf cpumap: Introduce cpu_map__snprint_mask() The cpu_map__snprint_mask() generates a string representation of a cpumask bitmap. For cpu 0 to 11, it'll return "fff". Committer notes: Fix compiler warning on some toolchains: 19 fedora:24-x-ARC-uClibc: FAIL CC /tmp/build/perf/util/cpumap.o util/cpumap.c: In function 'hex_char': util/cpumap.c:679:2: error: comparison is always true due to limited range of data type [-Werror=type-limits] if (0 <= val && val <= 9) ^ cc1: all warnings being treated as errors Applying patch from Namhyung that makes function receive an 'unsigned char', that is what the callers are passing to this function. Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170224011251.14946-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 46 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/cpumap.h | 1 + 2 files changed, 47 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 39ad2caccf56..061018b42393 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -673,3 +673,49 @@ size_t cpu_map__snprint(struct cpu_map *map, char *buf, size_t size) pr_debug("cpumask list: %s\n", buf); return ret; } + +static char hex_char(unsigned char val) +{ + if (val < 10) + return val + '0'; + if (val < 16) + return val - 10 + 'a'; + return '?'; +} + +size_t cpu_map__snprint_mask(struct cpu_map *map, char *buf, size_t size) +{ + int i, cpu; + char *ptr = buf; + unsigned char *bitmap; + int last_cpu = cpu_map__cpu(map, map->nr - 1); + + bitmap = zalloc((last_cpu + 7) / 8); + if (bitmap == NULL) { + buf[0] = '\0'; + return 0; + } + + for (i = 0; i < map->nr; i++) { + cpu = cpu_map__cpu(map, i); + bitmap[cpu / 8] |= 1 << (cpu % 8); + } + + for (cpu = last_cpu / 4 * 4; cpu >= 0; cpu -= 4) { + unsigned char bits = bitmap[cpu / 8]; + + if (cpu % 8) + bits >>= 4; + else + bits &= 0xf; + + *ptr++ = hex_char(bits); + if ((cpu % 32) == 0 && cpu > 0) + *ptr++ = ','; + } + *ptr = '\0'; + free(bitmap); + + buf[size - 1] = '\0'; + return ptr - buf; +} diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index e84491636c1b..6b8bff87481d 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -20,6 +20,7 @@ struct cpu_map *cpu_map__dummy_new(void); struct cpu_map *cpu_map__new_data(struct cpu_map_data *data); struct cpu_map *cpu_map__read(FILE *file); size_t cpu_map__snprint(struct cpu_map *map, char *buf, size_t size); +size_t cpu_map__snprint_mask(struct cpu_map *map, char *buf, size_t size); size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp); int cpu_map__get_socket_id(int cpu); int cpu_map__get_socket(struct cpu_map *map, int idx, void *data); From dc23103278c5ad53c177a25e209ef687e6d5d293 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 24 Feb 2017 10:12:50 +0900 Subject: [PATCH 25/37] perf ftrace: Add support for -a and -C option The -a/--all-cpus and -C/--cpu option is for controlling tracing cpus. Signed-off-by: Namhyung Kim Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170224011251.14946-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-ftrace.txt | 14 ++++++ tools/perf/builtin-ftrace.c | 60 ++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 2d39397f3f30..6e6a8b22c859 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -34,6 +34,20 @@ OPTIONS --pid=:: Trace on existing process id (comma separated list). +-a:: +--all-cpus:: + Force system-wide collection. Scripts run without a + normally use -a by default, while scripts run with a + normally don't - this option allows the latter to be run in + system-wide mode. + +-C:: +--cpu=:: + Only trace for the list of CPUs provided. Multiple CPUs can + be provided as a comma separated list with no space like: 0,1. + Ranges of CPUs are specified with -: 0-2. + Default is to trace on all online CPUs. + SEE ALSO -------- diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 85eee9c444ae..d5b566ed7178 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -17,6 +17,7 @@ #include #include "evlist.h" #include "target.h" +#include "cpumap.h" #include "thread_map.h" #include "util/config.h" @@ -96,6 +97,8 @@ static int append_tracing_file(const char *name, const char *val) return __write_tracing_file(name, val, true); } +static int reset_tracing_cpu(void); + static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) { if (write_tracing_file("tracing_on", "0") < 0) @@ -107,6 +110,9 @@ static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused) if (write_tracing_file("set_ftrace_pid", " ") < 0) return -1; + if (reset_tracing_cpu() < 0) + return -1; + return 0; } @@ -127,6 +133,51 @@ static int set_tracing_pid(struct perf_ftrace *ftrace) return 0; } +static int set_tracing_cpumask(struct cpu_map *cpumap) +{ + char *cpumask; + size_t mask_size; + int ret; + int last_cpu; + + last_cpu = cpu_map__cpu(cpumap, cpumap->nr - 1); + mask_size = (last_cpu + 3) / 4 + 1; + mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */ + + cpumask = malloc(mask_size); + if (cpumask == NULL) { + pr_debug("failed to allocate cpu mask\n"); + return -1; + } + + cpu_map__snprint_mask(cpumap, cpumask, mask_size); + + ret = write_tracing_file("tracing_cpumask", cpumask); + + free(cpumask); + return ret; +} + +static int set_tracing_cpu(struct perf_ftrace *ftrace) +{ + struct cpu_map *cpumap = ftrace->evlist->cpus; + + if (!target__has_cpu(&ftrace->target)) + return 0; + + return set_tracing_cpumask(cpumap); +} + +static int reset_tracing_cpu(void) +{ + struct cpu_map *cpumap = cpu_map__new(NULL); + int ret; + + ret = set_tracing_cpumask(cpumap); + cpu_map__put(cpumap); + return ret; +} + static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) { char *trace_file; @@ -163,6 +214,11 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) goto out_reset; } + if (set_tracing_cpu(ftrace) < 0) { + pr_err("failed to set tracing cpumask\n"); + goto out_reset; + } + if (write_tracing_file("current_tracer", ftrace->tracer) < 0) { pr_err("failed to set current_tracer to %s\n", ftrace->tracer); goto out_reset; @@ -264,6 +320,10 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused) "trace on existing process id"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), + OPT_BOOLEAN('a', "all-cpus", &ftrace.target.system_wide, + "system-wide collection from all CPUs"), + OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu", + "list of cpus to monitor"), OPT_END() }; From 583359646fde8526ea9456618cc24dc359b34094 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 24 Feb 2017 10:12:51 +0900 Subject: [PATCH 26/37] perf ftrace: Use pager for displaying result It's convenient to use the pager when seeing many lines of result. Note that setup_pager() should be called after perf_evlist__prepare_workload() since they can interfere each other regarding shared stdio streams. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/20170224011251.14946-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index d5b566ed7178..6087295f8827 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -195,6 +195,7 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) signal(SIGINT, sig_handler); signal(SIGUSR1, sig_handler); signal(SIGCHLD, sig_handler); + signal(SIGPIPE, sig_handler); if (reset_tracing_files(ftrace) < 0) goto out; @@ -247,6 +248,8 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) goto out_close_fd; } + setup_pager(); + perf_evlist__start_workload(ftrace->evlist); while (!done) { From 90ec5e89e393c76e19afc845d8f88a5dc8315919 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 22 Feb 2017 19:23:37 +0530 Subject: [PATCH 27/37] kretprobes: Ensure probe location is at function entry kretprobes can be registered by specifying an absolute address or by specifying offset to a symbol. However, we need to ensure this falls at function entry so as to be able to determine the return address. Validate the same during kretprobe registration. By default, there should not be any offset from a function entry, as determined through a kallsyms_lookup(). Introduce arch_function_offset_within_entry() as a way for architectures to override this. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Michael Ellerman Cc: Steven Rostedt Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/f1583bc4839a3862cfc2acefcc56f9c8837fa2ba.1487770934.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/kprobes.h | 1 + kernel/kprobes.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index c328e4f7dcad..177bdf6c6aeb 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,6 +267,7 @@ extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern bool arch_function_offset_within_entry(unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 699c5bc51a92..448759d4a263 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1875,12 +1875,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +bool __weak arch_function_offset_within_entry(unsigned long offset) +{ + return !offset; +} + int register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; int i; void *addr; + unsigned long offset; + + addr = kprobe_addr(&rp->kp); + if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset)) + return -EINVAL; + + if (!arch_function_offset_within_entry(offset)) + return -EINVAL; if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); From 35b6f55aa9ba65141f2def0997e23aab13715d3f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 22 Feb 2017 19:23:39 +0530 Subject: [PATCH 28/37] trace/kprobes: Allow return probes with offsets and absolute addresses Since the kernel includes many non-global functions with same names, we will need to use offsets from other symbols (typically _text/_stext) or absolute addresses to place return probes on specific functions. Also, the core register_kretprobe() API never forbid use of offsets or absolute addresses with kretprobes. Allow its use with the trace infrastructure. To distinguish kernels that support this, update ftrace README to explicitly call this out. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Michael Ellerman Cc: Steven Rostedt Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/183e7ce2921a08c9c755ee9a5da3134febc6695b.1487770934.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace.c | 1 + kernel/trace/trace_kprobe.c | 8 -------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f35109514a01..0ed834d6beb0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4355,6 +4355,7 @@ static const char readme_msg[] = "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" + "place (kretprobe): [:][+]|\n" #endif #ifdef CONFIG_UPROBE_EVENTS "\t place: :\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eadd96ef772f..18775ef182f8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -680,10 +680,6 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } /* an address specified */ ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { @@ -699,10 +695,6 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse symbol.\n"); return ret; } - if (offset && is_return) { - pr_info("Return probe must be used without offset.\n"); - return -EINVAL; - } } argc -= 2; argv += 2; From e491bc2f0dd9f1b4a23ba6f3da88f6b695c4a4c9 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 23 Feb 2017 17:07:23 +0530 Subject: [PATCH 29/37] perf probe: Generalize probe event file open routine Generalize probe event file open routine into a generic function for opening trace files. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Michael Ellerman Cc: Steven Rostedt Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/b580465c7a4dcd5d3b40fdf8568e6be45d0a6333.1487849577.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-file.c | 20 +++++++++++--------- tools/perf/util/probe-file.h | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index 436b64731f65..1a62daceb028 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -70,7 +70,7 @@ static void print_both_open_warning(int kerr, int uerr) } } -static int open_probe_events(const char *trace_file, bool readwrite) +int open_trace_file(const char *trace_file, bool readwrite) { char buf[PATH_MAX]; int ret; @@ -92,12 +92,12 @@ static int open_probe_events(const char *trace_file, bool readwrite) static int open_kprobe_events(bool readwrite) { - return open_probe_events("kprobe_events", readwrite); + return open_trace_file("kprobe_events", readwrite); } static int open_uprobe_events(bool readwrite) { - return open_probe_events("uprobe_events", readwrite); + return open_trace_file("uprobe_events", readwrite); } int probe_file__open(int flag) @@ -899,6 +899,7 @@ bool probe_type_is_available(enum probe_type type) size_t len = 0; bool target_line = false; bool ret = probe_type_table[type].avail; + int fd; if (type >= PROBE_TYPE_END) return false; @@ -906,14 +907,16 @@ bool probe_type_is_available(enum probe_type type) if (ret || probe_type_table[type].checked) return ret; - if (asprintf(&buf, "%s/README", tracing_path) < 0) + fd = open_trace_file("README", false); + if (fd < 0) return ret; - fp = fopen(buf, "r"); - if (!fp) - goto end; + fp = fdopen(fd, "r"); + if (!fp) { + close(fd); + return ret; + } - zfree(&buf); while (getline(&buf, &len, fp) > 0 && !ret) { if (!target_line) { target_line = !!strstr(buf, " type: "); @@ -928,7 +931,6 @@ bool probe_type_is_available(enum probe_type type) probe_type_table[type].avail = ret; fclose(fp); -end: free(buf); return ret; diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h index eba44c3e9dca..a17a82eff8a0 100644 --- a/tools/perf/util/probe-file.h +++ b/tools/perf/util/probe-file.h @@ -35,6 +35,7 @@ enum probe_type { /* probe-file.c depends on libelf */ #ifdef HAVE_LIBELF_SUPPORT +int open_trace_file(const char *trace_file, bool readwrite); int probe_file__open(int flag); int probe_file__open_both(int *kfd, int *ufd, int flag); struct strlist *probe_file__get_namelist(int fd); From f1c4d1ad39b5f7c617572f93658bb7159ec9c686 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 28 Feb 2017 12:02:36 +0200 Subject: [PATCH 30/37] perf intel-PT/BTS: Add missing initialization $ perf test decoder 57: x86 instruction decoder - new instructions : FAILED! $ Failed to decode 'rel' value (0xfffffffc vs expected 0): 0f 1b 80 78 56 34 12 bndstx %bnd0,0x12345678(%rax) Failed to decode 'rel' value (0xfffffffc vs expected 0): 0f 1b 85 78 56 34 12 bndstx %bnd0,0x12345678(%rbp) Failed to decode 'rel' value (0xfffffffc vs expected 0): 0f 1b 84 01 78 56 34 12 bndstx %bnd0,0x12345678(%rcx,%rax,1) Failed to decode 'rel' value (0xfffffffc vs expected 0): 0f 1b 84 05 78 56 34 12 bndstx %bnd0,0x12345678(%rbp,%rax,1) Failed to decode 'rel' value (0xfffffffc vs expected 0): 0f 1b 84 08 78 56 34 12 bndstx %bnd0,0x12345678(%rax,%rcx,1) There is missing initialization. It only affects the test because it is checking 'rel' even in cases where there is no value. Fix it. Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/08c6ad07-7994-3e56-b20e-d75727ca7765@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 7913363bde5c..55b6250350d7 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -39,6 +39,8 @@ static void intel_pt_insn_decoder(struct insn *insn, enum intel_pt_insn_branch branch = INTEL_PT_BR_NO_BRANCH; int ext; + intel_pt_insn->rel = 0; + if (insn_is_avx(insn)) { intel_pt_insn->op = INTEL_PT_OP_OTHER; intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; From d0e02579c282ccf34c79818045ec2d2934b56c19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 27 Feb 2017 11:52:04 -0500 Subject: [PATCH 31/37] trace/kprobes: Add back warning about offset in return probes Let's not remove the warning about offsets and return probes when the offset is invalid. Signed-off-by: Steven Rostedt Acked-by: Masami Hiramatsu Acked-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170227115204.00f92846@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 18775ef182f8..2b7d0dd938ba 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -695,6 +695,11 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse symbol.\n"); return ret; } + if (offset && is_return && + !arch_function_offset_within_entry(offset)) { + pr_info("Given offset is not valid for return probe.\n"); + return -EINVAL; + } } argc -= 2; argv += 2; From e3ba76deef23064fc272424b86b506cd80b04fc5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 27 Feb 2017 10:48:18 +0100 Subject: [PATCH 32/37] perf tools: Force uncore events to system wide monitoring Make system wide (-a) the default option if no target was specified and one of following conditions is met: - there's no workload specified (current behaviour) - there is workload specified but all requested events are system wide ones Mixed events core/uncore with workload: $ perf stat -e 'uncore_cbox_0/clockticks/,cycles' sleep 1 Performance counter stats for 'sleep 1': uncore_cbox_0/clockticks/ 980,489 cycles 1.000897406 seconds time elapsed Uncore event with workload: $ perf stat -e 'uncore_cbox_0/clockticks/' sleep 1 Performance counter stats for 'system wide': 281,473,897,192,670 uncore_cbox_0/clockticks/ 1.000833784 seconds time elapsed Committer note: When testing I realized the default case for !root, i.e. no events passed via -e, was broke by v2 of this patch, reported and after a patch provided by Jiri it is back working: [acme@jouet linux]$ perf stat usleep 1 Performance counter stats for 'usleep 1': 0.401335 task-clock:u (msec) # 0.297 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 48 page-faults:u # 0.120 M/sec 458,146 cycles:u # 1.142 GHz 245,113 instructions:u # 0.54 insn per cycle 47,991 branches:u # 119.578 M/sec 4,022 branch-misses:u # 8.38% of all branches 0.001350029 seconds time elapsed [acme@jouet linux]$ Suggested-and-Tested-by: Borislav Petkov Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170227094818.GA12764@krava Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 33 ++++++++++++++++++++++++++++++--- tools/perf/util/parse-events.c | 5 +++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f4f555a67e9b..f53f449d864d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2350,6 +2350,35 @@ static int __cmd_report(int argc, const char **argv) return 0; } +static void setup_system_wide(int forks) +{ + /* + * Make system wide (-a) the default target if + * no target was specified and one of following + * conditions is met: + * + * - there's no workload specified + * - there is workload specified but all requested + * events are system wide events + */ + if (!target__none(&target)) + return; + + if (!forks) + target.system_wide = true; + else { + struct perf_evsel *counter; + + evlist__for_each_entry(evsel_list, counter) { + if (!counter->system_wide) + return; + } + + if (evsel_list->nr_entries) + target.system_wide = true; + } +} + int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) { const char * const stat_usage[] = { @@ -2456,9 +2485,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) } else if (big_num_opt == 0) /* User passed --no-big-num */ big_num = false; - /* Make system wide (-a) the default target. */ - if (!argc && target__none(&target)) - target.system_wide = true; + setup_system_wide(argc); if (run_count < 0) { pr_err("Run count must be a positive number\n"); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 67a8aebc67ab..54355d3caf09 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -316,8 +316,9 @@ __add_event(struct list_head *list, int *idx, return NULL; (*idx)++; - evsel->cpus = cpu_map__get(cpus); - evsel->own_cpus = cpu_map__get(cpus); + evsel->cpus = cpu_map__get(cpus); + evsel->own_cpus = cpu_map__get(cpus); + evsel->system_wide = !!cpus; if (name) evsel->name = strdup(name); From 120010cb1eea151d38a3e66f5ffc79a0c3110292 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Mar 2017 12:55:49 -0300 Subject: [PATCH 33/37] tools build: Add test for sched_getcpu() Instead of trying to go on adding more ifdef conditions, do a feature test and define HAVE_SCHED_GETCPU_SUPPORT instead, then use it to provide the prototype. No need to change the stub, as it is already a __weak symbol. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-yge89er9g90sc0v6k0a0r5tr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 + tools/build/feature/Makefile | 6 +++++- tools/build/feature/test-all.c | 5 +++++ tools/build/feature/test-sched_getcpu.c | 7 +++++++ tools/perf/Makefile.config | 4 ++++ tools/perf/util/cloexec.h | 6 ------ tools/perf/util/util.h | 4 ++-- 7 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 tools/build/feature/test-sched_getcpu.c diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index e3fb5ecbdcb6..523911f316ce 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -63,6 +63,7 @@ FEATURE_TESTS_BASIC := \ lzma \ get_cpuid \ bpf \ + sched_getcpu \ sdt # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index b564a2eea039..ab1e2bbc2e96 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -48,7 +48,8 @@ FILES= \ test-get_cpuid.bin \ test-sdt.bin \ test-cxx.bin \ - test-jvmti.bin + test-jvmti.bin \ + test-sched_getcpu.bin FILES := $(addprefix $(OUTPUT),$(FILES)) @@ -91,6 +92,9 @@ $(OUTPUT)test-libelf.bin: $(OUTPUT)test-glibc.bin: $(BUILD) +$(OUTPUT)test-sched_getcpu.bin: + $(BUILD) + DWARFLIBS := -ldw ifeq ($(findstring -static,${LDFLAGS}),-static) DWARFLIBS += -lelf -lebl -lz -llzma -lbz2 diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index 699e43627397..cc6c7c01f4ca 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -117,6 +117,10 @@ # include "test-pthread-attr-setaffinity-np.c" #undef main +#define main main_test_sched_getcpu +# include "test-sched_getcpu.c" +#undef main + # if 0 /* * Disable libbabeltrace check for test-all, because the requested @@ -182,6 +186,7 @@ int main(int argc, char *argv[]) main_test_get_cpuid(); main_test_bpf(); main_test_libcrypto(); + main_test_sched_getcpu(); main_test_sdt(); return 0; diff --git a/tools/build/feature/test-sched_getcpu.c b/tools/build/feature/test-sched_getcpu.c new file mode 100644 index 000000000000..c4a148dd7104 --- /dev/null +++ b/tools/build/feature/test-sched_getcpu.c @@ -0,0 +1,7 @@ +#define _GNU_SOURCE +#include + +int main(void) +{ + return sched_getcpu(); +} diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 27c9fbca7bd9..2b656de99495 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -317,6 +317,10 @@ ifdef NO_DWARF NO_LIBDW_DWARF_UNWIND := 1 endif +ifeq ($(feature-sched_getcpu), 1) + CFLAGS += -DHAVE_SCHED_GETCPU_SUPPORT +endif + ifndef NO_LIBELF CFLAGS += -DHAVE_LIBELF_SUPPORT EXTLIBS += -lelf diff --git a/tools/perf/util/cloexec.h b/tools/perf/util/cloexec.h index d0d465953d36..94a5a7d829d5 100644 --- a/tools/perf/util/cloexec.h +++ b/tools/perf/util/cloexec.h @@ -3,10 +3,4 @@ unsigned long perf_event_open_cloexec_flag(void); -#ifdef __GLIBC_PREREQ -#if !__GLIBC_PREREQ(2, 6) && !defined(__UCLIBC__) -int sched_getcpu(void) __THROW; -#endif -#endif - #endif /* __PERF_CLOEXEC_H */ diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index c74708da8571..b2cfa47990dc 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -355,8 +355,8 @@ void print_binary(unsigned char *data, size_t len, size_t bytes_per_line, print_binary_t printer, void *extra); -#if !defined(__GLIBC__) && !defined(__ANDROID__) -extern int sched_getcpu(void); +#ifndef HAVE_SCHED_GETCPU_SUPPORT +int sched_getcpu(void); #endif int is_printable_array(char *p, unsigned int len); From b8d1fd7ec661f5ccb9facd57589a563f627df230 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Mar 2017 15:31:43 -0300 Subject: [PATCH 34/37] perf bench futex: Use __maybe_unused Instead of attributing a variable to itself to silence the compiler, use the attribute designed for that, avoiding this: In file included from bench/futex-hash.c:24: bench/futex.h:95:7: error: explicitly assigning value of variable of type 'pthread_attr_t *' to itself [-Werror,-Wself-assign] attr = attr; ~~~~ ^ ~~~~ bench/futex.h:96:13: error: explicitly assigning value of variable of type 'size_t' (aka 'unsigned long') to itself [-Werror,-Wself-assign] cpusetsize = cpusetsize; ~~~~~~~~~~ ^ ~~~~~~~~~~ bench/futex.h:97:9: error: explicitly assigning value of variable of type 'cpu_set_t *' (aka 'struct cpu_set_t *') to itself [-Werror,-Wself-assign] cpuset = cpuset; ~~~~~~ ^ ~~~~~~ That is only triggered when HAVE_PTHREAD_ATTR_SETAFFINITY_NP isn't set. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-14ws1d1elj2d5ej8g7cwdqau@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index b2e06d1190d0..e44fd3239530 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -88,13 +88,11 @@ futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wak #ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP #include -static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr, - size_t cpusetsize, - cpu_set_t *cpuset) +#include +static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr __maybe_unused, + size_t cpusetsize __maybe_unused, + cpu_set_t *cpuset __maybe_unused) { - attr = attr; - cpusetsize = cpusetsize; - cpuset = cpuset; return 0; } #endif From a0f213e14bb5cf4f190809b5811e1292bd614899 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Mar 2017 15:44:19 -0300 Subject: [PATCH 35/37] perf bench futex: Fix build on musl + clang When building with clang on a musl libc system, Alpine Linux, we end up hitting a problem where memset() is used but its prototype is not present, add it to avoid this: bench/futex-wake.c:99:3: error: implicitly declaring library function 'memset' with type 'void *(void *, int, unsigned long)' [-Werror,-Wimplicit-function-declaration] CPU_ZERO(&cpu); ^ /usr/include/sched.h:127:23: note: expanded from macro 'CPU_ZERO' #define CPU_ZERO(set) CPU_ZERO_S(sizeof(cpu_set_t),set) ^ /usr/include/sched.h:110:30: note: expanded from macro 'CPU_ZERO_S' #define CPU_ZERO_S(size,set) memset(set,0,size) ^ bench/futex-wake.c:99:3: note: include the header or explicitly provide a declaration for 'memset' Found while updating my test build containers to build perf with clang in more systems. Cc: Adrian Hunter Cc: David Ahern Cc: Davidlohr Bueso Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jh10vaz2r98zl6gm5iau8prr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-hash.c | 1 + tools/perf/bench/futex-lock-pi.c | 1 + tools/perf/bench/futex-requeue.c | 1 + tools/perf/bench/futex-wake-parallel.c | 1 + tools/perf/bench/futex-wake.c | 1 + 5 files changed, 5 insertions(+) diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index da04b8c5568a..2499e1b0c6fb 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -9,6 +9,7 @@ */ /* For the CLR_() macros */ +#include #include #include diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 91877777ec6e..a20814d94af1 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -3,6 +3,7 @@ */ /* For the CLR_() macros */ +#include #include #include diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index 2b9705a8734c..9fad1e4fcd3e 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -9,6 +9,7 @@ */ /* For the CLR_() macros */ +#include #include #include diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 2c8fa67ad537..40f5fcf1d120 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -8,6 +8,7 @@ */ /* For the CLR_() macros */ +#include #include #include diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index e246b1b8388a..789490281ae3 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -9,6 +9,7 @@ */ /* For the CLR_() macros */ +#include #include #include From c8c188679ccfc86d9c7bac57ecf4b8205a061a06 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Mar 2017 16:00:26 -0300 Subject: [PATCH 36/37] tools build: Use the same CC for feature detection and actual build When build with: 'make CC=clang' we were not using that CC to do feature detection, which resulted in features being detected with gcc and then the actual tools being built with clang. Most of the time these compilers are compatible enough, so no problem was being noticed. As soon as a system with an old enough clang, one that hasn't the cpuid.h header is used, and a gcc with it, the "get_cpuid" feature will be found available but then code that will use can't be compiled. Noticed with this combination: / $ gcc --version | head -1 gcc (Alpine 6.3.0) 6.3.0 / $ clang --version | head -1 clang version 3.8.1 (tags/RELEASE_381/final) / $ cat /etc/alpine-release 3.5.0 / $ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-60q18nvlvgpyfv7e2qqgx4ou@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index ab1e2bbc2e96..09c9626ea666 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -53,8 +53,8 @@ FILES= \ FILES := $(addprefix $(OUTPUT),$(FILES)) -CC := $(CROSS_COMPILE)gcc -MD -CXX := $(CROSS_COMPILE)g++ -MD +CC ?= $(CROSS_COMPILE)gcc -MD +CXX ?= $(CROSS_COMPILE)g++ -MD PKG_CONFIG := $(CROSS_COMPILE)pkg-config LLVM_CONFIG ?= llvm-config From 001916b94a04809a94abb07daba6f9ace01906ba Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 5 Mar 2017 17:40:11 +0100 Subject: [PATCH 37/37] perf bench numa: Add more comment for -c option Adding more commentary for -c/--show_convergence option, to explain how the convergence is defined. Before: -c, --show_convergence show convergence details Now: -c, --show_convergence convergence is reached when each process \ (all its threads) is running on a single NUMA node. Suggested--by: Jiri Hladky Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Jiri Hladky Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1488732011-27384-1-git-send-email-jolsa@kernel.org [ Rephrased a bit based on a IRC conversation with Jiri ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/numa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 3083fc36282b..6bd0581de298 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -187,7 +187,8 @@ static const struct option options[] = { OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), - OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), + OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, " + "convergence is reached when each process (all its threads) is running on a single NUMA node."), OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"), OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),