From 6fe7c745f2acb73e4cc961d7f91125eef5a8861f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:14 +0900 Subject: [PATCH 01/35] tracing/boot: Fix a hist trigger dependency for boot time tracing Fixes a build error when CONFIG_HIST_TRIGGERS=n with boot-time tracing. Since the trigger_process_regex() is defined only when CONFIG_HIST_TRIGGERS=y, if it is disabled, the 'actions' event option also must be disabled. Link: https://lkml.kernel.org/r/162856123376.203126.582144262622247352.stgit@devnote2 Fixes: 81a59555ff15 ("tracing/boot: Add per-event settings") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 94ef2d099e32..d713714cba67 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -205,12 +205,15 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, pr_err("Failed to apply filter: %s\n", buf); } - xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) - pr_err("action string is too long: %s\n", p); - else if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply an action: %s\n", buf); - } + if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { + xbc_node_for_each_array_value(enode, "actions", anode, p) { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + pr_err("action string is too long: %s\n", p); + else if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply an action: %s\n", buf); + } + } else if (xbc_node_find_value(enode, "actions", NULL)) + pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); if (xbc_node_find_value(enode, "enable", NULL)) { if (trace_event_enable_disable(file, 1, 0) < 0) From de9a48a360b70d5318061cf1237431d1869555e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Jul 2021 17:36:24 -0400 Subject: [PATCH 02/35] tracing: Add linear buckets to histogram logic There's been several times I wished the histogram logic had a "grouping" feature for the buckets. Currently, each bucket has a size of one. That is, if you trace the amount of requested allocations, each allocation is its own bucket, even if you are interested in what allocates 100 bytes or less, 100 to 200, 200 to 300, etc. Also, without grouping, it fills up the allocated histogram buckets quickly. If you are tracking latency, and don't care if something is 200 microseconds off, or 201 microseconds off, but want to track them by say 10 microseconds each. This can not currently be done. There is a log2 but that grouping get's too big too fast for a lot of cases. Introduce a "buckets=SIZE" command to each field where it will record in a rounded number. For example: ># echo 'hist:keys=bytes_req.buckets=100:sort=bytes_req' > events/kmem/kmalloc/trigger ># cat events/kmem/kmalloc/hist # event histogram # # trigger info: hist:keys=bytes_req.buckets=100:vals=hitcount:sort=bytes_req.buckets=100:size=2048 [active] # { bytes_req: ~ 0-99 } hitcount: 3149 { bytes_req: ~ 100-199 } hitcount: 1468 { bytes_req: ~ 200-299 } hitcount: 39 { bytes_req: ~ 300-399 } hitcount: 306 { bytes_req: ~ 400-499 } hitcount: 364 { bytes_req: ~ 500-599 } hitcount: 32 { bytes_req: ~ 600-699 } hitcount: 69 { bytes_req: ~ 700-799 } hitcount: 37 { bytes_req: ~ 1200-1299 } hitcount: 16 { bytes_req: ~ 1400-1499 } hitcount: 30 { bytes_req: ~ 2000-2099 } hitcount: 6 { bytes_req: ~ 4000-4099 } hitcount: 2168 { bytes_req: ~ 5000-5099 } hitcount: 6 Totals: Hits: 7690 Entries: 13 Dropped: 0 Link: https://lkml.kernel.org/r/20210707213921.980359719@goodmis.org Acked-by: Namhyung Kim Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Tested-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 65 ++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a48aa2a2875b..8e87c4a429fd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -121,6 +121,7 @@ struct hist_field { unsigned int size; unsigned int offset; unsigned int is_signed; + unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; @@ -219,6 +220,27 @@ static u64 hist_field_log2(struct hist_field *hist_field, return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_bucket(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + unsigned long buckets = hist_field->buckets; + + u64 val = operand->fn(operand, elt, buffer, rbe, event); + + if (WARN_ON_ONCE(!buckets)) + return val; + + if (val >= LONG_MAX) + val = div64_ul(val, buckets); + else + val = (u64)((unsigned long)val / buckets); + return val * buckets; +} + static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -318,6 +340,7 @@ enum hist_field_flags { HIST_FIELD_FL_VAR_REF = 1 << 14, HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, + HIST_FIELD_FL_BUCKET = 1 << 17, }; struct var_defs { @@ -1109,7 +1132,8 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else if (field->flags & HIST_FIELD_FL_LOG2 || - field->flags & HIST_FIELD_FL_ALIAS) + field->flags & HIST_FIELD_FL_ALIAS || + field->flags & HIST_FIELD_FL_BUCKET) field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; @@ -1470,6 +1494,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "syscall"; else if (hist_field->flags & HIST_FIELD_FL_LOG2) flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_BUCKET) + flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; @@ -1658,9 +1684,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } - if (flags & HIST_FIELD_FL_LOG2) { - unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; - hist_field->fn = hist_field_log2; + if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { + unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); + hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : + hist_field_bucket; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); @@ -1953,7 +1980,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, static struct ftrace_event_field * parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, - char *field_str, unsigned long *flags) + char *field_str, unsigned long *flags, unsigned long *buckets) { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; @@ -1980,7 +2007,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_LOG2; else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; - else { + else if (strncmp(modifier, "bucket", 6) == 0) { + int ret; + + modifier += 6; + + if (*modifier == 's') + modifier++; + if (*modifier != '=') + goto error; + modifier++; + ret = kstrtoul(modifier, 0, buckets); + if (ret || !(*buckets)) + goto error; + *flags |= HIST_FIELD_FL_BUCKET; + } else { + error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; @@ -2049,6 +2091,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; struct ftrace_event_field *field = NULL; struct hist_field *hist_field = NULL; + unsigned long buckets = 0; int ret = 0; s = strchr(str, '.'); @@ -2086,7 +2129,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, } else str = s; - field = parse_field(hist_data, file, str, flags); + field = parse_field(hist_data, file, str, flags, &buckets); if (IS_ERR(field)) { ret = PTR_ERR(field); goto out; @@ -2097,6 +2140,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, ret = -ENOMEM; goto out; } + hist_field->buckets = buckets; return hist_field; out: @@ -4698,6 +4742,11 @@ static void hist_trigger_print_key(struct seq_file *m, } else if (key_field->flags & HIST_FIELD_FL_LOG2) { seq_printf(m, "%s: ~ 2^%-2llu", field_name, *(u64 *)(key + key_field->offset)); + } else if (key_field->flags & HIST_FIELD_FL_BUCKET) { + unsigned long buckets = key_field->buckets; + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: ~ %llu-%llu", field_name, + uval, uval + buckets -1); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", field_name, (char *)(key + key_field->offset)); @@ -5137,6 +5186,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, ".%s", flags); } } + if (hist_field->buckets) + seq_printf(m, "=%ld", hist_field->buckets); } static int event_hist_trigger_print(struct seq_file *m, From 370364351926e4fcc7c1a486901bfaae0172b7d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Jul 2021 17:36:25 -0400 Subject: [PATCH 03/35] tracing/histogram: Update the documentation for the buckets modifier Update both the tracefs README file as well as the histogram.rst to include an explanation of what the buckets modifier is and how to use it. Include an example with the wakeup_latency example for both log2 and the buckets modifiers as there was no existing log2 example. Link: https://lkml.kernel.org/r/20210707213922.167218794@goodmis.org Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.rst | 92 +++++++++++++++++++++++++++++-- kernel/trace/trace.c | 1 + 2 files changed, 87 insertions(+), 6 deletions(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index f99be8062bc8..4e650671f245 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -77,6 +77,7 @@ Documentation written by Tom Zanussi .syscall display a syscall id as a system call name .execname display a common_pid as a program name .log2 display log2 value rather than raw number + .buckets=size display grouping of values rather than raw number .usecs display a common_timestamp in microseconds =========== ========================================== @@ -228,7 +229,7 @@ Extended error information that lists the total number of bytes requested for each function in the kernel that made one or more calls to kmalloc:: - # echo 'hist:key=call_site:val=bytes_req' > \ + # echo 'hist:key=call_site:val=bytes_req.buckets=32' > \ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger This tells the tracing system to create a 'hist' trigger using the @@ -1823,20 +1824,99 @@ and variables defined on other events (see Section 2.2.3 below on how that is done using hist trigger 'onmatch' action). Once that is done, the 'wakeup_latency' synthetic event instance is created. -A histogram can now be defined for the new synthetic event:: - - # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ - /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger - The new event is created under the tracing/events/synthetic/ directory and looks and behaves just like any other event:: # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency enable filter format hist id trigger +A histogram can now be defined for the new synthetic event:: + + # echo 'hist:keys=pid,prio,lat.log2:sort=lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + +The above shows the latency "lat" in a power of 2 grouping. + Like any other event, once a histogram is enabled for the event, the output can be displayed by reading the event's 'hist' file. + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist + + # event histogram + # + # trigger info: hist:keys=pid,prio,lat.log2:vals=hitcount:sort=lat.log2:size=2048 [active] + # + + { pid: 2035, prio: 9, lat: ~ 2^2 } hitcount: 43 + { pid: 2034, prio: 9, lat: ~ 2^2 } hitcount: 60 + { pid: 2029, prio: 9, lat: ~ 2^2 } hitcount: 965 + { pid: 2034, prio: 120, lat: ~ 2^2 } hitcount: 9 + { pid: 2033, prio: 120, lat: ~ 2^2 } hitcount: 5 + { pid: 2030, prio: 9, lat: ~ 2^2 } hitcount: 335 + { pid: 2030, prio: 120, lat: ~ 2^2 } hitcount: 10 + { pid: 2032, prio: 120, lat: ~ 2^2 } hitcount: 1 + { pid: 2035, prio: 120, lat: ~ 2^2 } hitcount: 2 + { pid: 2031, prio: 9, lat: ~ 2^2 } hitcount: 176 + { pid: 2028, prio: 120, lat: ~ 2^2 } hitcount: 15 + { pid: 2033, prio: 9, lat: ~ 2^2 } hitcount: 91 + { pid: 2032, prio: 9, lat: ~ 2^2 } hitcount: 125 + { pid: 2029, prio: 120, lat: ~ 2^2 } hitcount: 4 + { pid: 2031, prio: 120, lat: ~ 2^2 } hitcount: 3 + { pid: 2029, prio: 120, lat: ~ 2^3 } hitcount: 2 + { pid: 2035, prio: 9, lat: ~ 2^3 } hitcount: 41 + { pid: 2030, prio: 120, lat: ~ 2^3 } hitcount: 1 + { pid: 2032, prio: 9, lat: ~ 2^3 } hitcount: 32 + { pid: 2031, prio: 9, lat: ~ 2^3 } hitcount: 44 + { pid: 2034, prio: 9, lat: ~ 2^3 } hitcount: 40 + { pid: 2030, prio: 9, lat: ~ 2^3 } hitcount: 29 + { pid: 2033, prio: 9, lat: ~ 2^3 } hitcount: 31 + { pid: 2029, prio: 9, lat: ~ 2^3 } hitcount: 31 + { pid: 2028, prio: 120, lat: ~ 2^3 } hitcount: 18 + { pid: 2031, prio: 120, lat: ~ 2^3 } hitcount: 2 + { pid: 2028, prio: 120, lat: ~ 2^4 } hitcount: 1 + { pid: 2029, prio: 9, lat: ~ 2^4 } hitcount: 4 + { pid: 2031, prio: 120, lat: ~ 2^7 } hitcount: 1 + { pid: 2032, prio: 120, lat: ~ 2^7 } hitcount: 1 + + Totals: + Hits: 2122 + Entries: 30 + Dropped: 0 + + +The latency values can also be grouped linearly by a given size with +the ".buckets" modifier and specify a size (in this case groups of 10). + + # echo 'hist:keys=pid,prio,lat.buckets=10:sort=lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + + # event histogram + # + # trigger info: hist:keys=pid,prio,lat.buckets=10:vals=hitcount:sort=lat.buckets=10:size=2048 [active] + # + + { pid: 2067, prio: 9, lat: ~ 0-9 } hitcount: 220 + { pid: 2068, prio: 9, lat: ~ 0-9 } hitcount: 157 + { pid: 2070, prio: 9, lat: ~ 0-9 } hitcount: 100 + { pid: 2067, prio: 120, lat: ~ 0-9 } hitcount: 6 + { pid: 2065, prio: 120, lat: ~ 0-9 } hitcount: 2 + { pid: 2066, prio: 120, lat: ~ 0-9 } hitcount: 2 + { pid: 2069, prio: 9, lat: ~ 0-9 } hitcount: 122 + { pid: 2069, prio: 120, lat: ~ 0-9 } hitcount: 8 + { pid: 2070, prio: 120, lat: ~ 0-9 } hitcount: 1 + { pid: 2068, prio: 120, lat: ~ 0-9 } hitcount: 7 + { pid: 2066, prio: 9, lat: ~ 0-9 } hitcount: 365 + { pid: 2064, prio: 120, lat: ~ 0-9 } hitcount: 35 + { pid: 2065, prio: 9, lat: ~ 0-9 } hitcount: 998 + { pid: 2071, prio: 9, lat: ~ 0-9 } hitcount: 85 + { pid: 2065, prio: 9, lat: ~ 10-19 } hitcount: 2 + { pid: 2064, prio: 120, lat: ~ 10-19 } hitcount: 2 + + Totals: + Hits: 2112 + Entries: 16 + Dropped: 0 + 2.2.3 Hist trigger 'handlers' and 'actions' ------------------------------------------- diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a1adb29ef5c1..be0169594de5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5654,6 +5654,7 @@ static const char readme_msg[] = "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" From 3347d80baa41c357cf263923f60aa8051a753d76 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 10:27:06 -0400 Subject: [PATCH 04/35] tracing: Have histogram types be constant when possible Instead of kstrdup("const", GFP_KERNEL), have the hist_field type simply assign the constant hist_field->type = "const"; And when the value passed to it is a variable, use "kstrdup_const(var, GFP_KERNEL);" which will just copy the value if the variable is already a constant. This saves on having to allocate when not needed. All frees of the hist_field->type will need to use kfree_const(). Link: https://lkml.kernel.org/r/20210722142837.280718447@goodmis.org Suggested-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e87c4a429fd..bb466a82b938 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1616,7 +1616,9 @@ static void __destroy_hist_field(struct hist_field *hist_field) kfree(hist_field->var.name); kfree(hist_field->name); - kfree(hist_field->type); + + /* Can likely be a const */ + kfree_const(hist_field->type); kfree(hist_field->system); kfree(hist_field->event_name); @@ -1673,9 +1675,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; hist_field->size = sizeof(u64); - hist_field->type = kstrdup("u64", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "u64"; goto out; } @@ -1690,7 +1690,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field_bucket; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; - hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); + hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); if (!hist_field->type) goto free; goto out; @@ -1699,18 +1699,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_TIMESTAMP) { hist_field->fn = hist_field_timestamp; hist_field->size = sizeof(u64); - hist_field->type = kstrdup("u64", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { hist_field->fn = hist_field_cpu; hist_field->size = sizeof(int); - hist_field->type = kstrdup("unsigned int", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "unsigned int"; goto out; } @@ -1723,7 +1719,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_STRING; hist_field->size = MAX_FILTER_STR_VAL; - hist_field->type = kstrdup(field->type, GFP_KERNEL); + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); if (!hist_field->type) goto free; @@ -1736,7 +1732,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; - hist_field->type = kstrdup(field->type, GFP_KERNEL); + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); if (!hist_field->type) goto free; @@ -1822,7 +1818,7 @@ static int init_var_ref(struct hist_field *ref_field, } } - ref_field->type = kstrdup(var_field->type, GFP_KERNEL); + ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL); if (!ref_field->type) { err = -ENOMEM; goto free; @@ -2215,7 +2211,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); - expr->type = kstrdup(operand1->type, GFP_KERNEL); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; goto free; @@ -2355,7 +2351,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operator = field_op; expr->name = expr_str(expr, 0); - expr->type = kstrdup(operand1->type, GFP_KERNEL); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; goto free; @@ -2743,10 +2739,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data, var->var.hist_data = var->hist_data = hist_data; var->size = size; var->var.name = kstrdup(name, GFP_KERNEL); - var->type = kstrdup(type, GFP_KERNEL); + var->type = kstrdup_const(type, GFP_KERNEL); if (!var->var.name || !var->type) { + kfree_const(var->type); kfree(var->var.name); - kfree(var->type); kfree(var); var = ERR_PTR(-ENOMEM); } From ed2cf90735daf40ab8d938b4b6d3ca43c0f84466 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 10:27:07 -0400 Subject: [PATCH 05/35] tracing: Allow execnames to be passed as args for synthetic events Allow common_pid.execname to be saved in a variable in one histogram to be passed to another histogram that can pass it as a parameter to a synthetic event. ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs:arg2=common_pid.execname' \ > events/sched/sched_waking/trigger ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1,exec=$arg2'\ ':onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,$exec)' \ > events/sched/sched_switch/trigger The above is a wake up latency synthetic event setup that passes the execname of the common_pid that woke the task to the scheduling of that task, which triggers a synthetic event that passes the original execname as a parameter to display it. ># echo 1 > events/synthetic/enable ># cat trace -0 [006] d..4 186.863801: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3 -0 [000] d..4 186.863858: wakeup_lat: pid=163 delta=27 wake_comm= -0 [001] d..4 186.863903: wakeup_lat: pid=1307 delta=36 wake_comm=kworker/u16:4 -0 [000] d..4 186.863927: wakeup_lat: pid=163 delta=5 wake_comm= -0 [006] d..4 186.863957: wakeup_lat: pid=1306 delta=24 wake_comm=kworker/u16:3 sshd-1306 [006] d..4 186.864051: wakeup_lat: pid=61 delta=62 wake_comm= -0 [000] d..4 186.965030: wakeup_lat: pid=609 delta=18 wake_comm= -0 [006] d..4 186.987582: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3 -0 [000] d..4 186.987639: wakeup_lat: pid=163 delta=27 wake_comm= Link: https://lkml.kernel.org/r/20210722142837.458596338@goodmis.org Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 46 +++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bb466a82b938..9d91b1c06957 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1420,17 +1420,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) struct hist_trigger_data *hist_data = elt->map->private_data; unsigned int size = TASK_COMM_LEN; struct hist_elt_data *elt_data; - struct hist_field *key_field; + struct hist_field *hist_field; unsigned int i, n_str; elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); if (!elt_data) return -ENOMEM; - for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; - if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + if (hist_field->flags & HIST_FIELD_FL_EXECNAME) { elt_data->comm = kzalloc(size, GFP_KERNEL); if (!elt_data->comm) { kfree(elt_data); @@ -3771,6 +3771,41 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } +static const char *no_comm = "(no comm)"; + +static u64 hist_field_execname(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + + if (WARN_ON_ONCE(!elt)) + return (u64)(unsigned long)no_comm; + + elt_data = elt->private_data; + + if (WARN_ON_ONCE(!elt_data->comm)) + return (u64)(unsigned long)no_comm; + + return (u64)(unsigned long)(elt_data->comm); +} + +/* Convert a var that points to common_pid.execname to a string */ +static void update_var_execname(struct hist_field *hist_field) +{ + hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR | + HIST_FIELD_FL_EXECNAME; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->is_signed = 0; + + kfree_const(hist_field->type); + hist_field->type = "char[]"; + + hist_field->fn = hist_field_execname; +} + static int create_var_field(struct hist_trigger_data *hist_data, unsigned int val_idx, struct trace_event_file *file, @@ -3795,6 +3830,9 @@ static int create_var_field(struct hist_trigger_data *hist_data, ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) + update_var_execname(hist_data->fields[val_idx]); + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; From de32951b29be3d6cc7a92bfbf366f48a9f4c4407 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 31 Jul 2021 14:22:31 +0900 Subject: [PATCH 06/35] tracing: Simplify the Kconfig dependency of FTRACE The entire FTRACE block is surrounded by 'if TRACING_SUPPORT' ... 'endif'. Using 'depends on' is a simpler way to guard FTRACE. Link: https://lkml.kernel.org/r/20210731052233.4703-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee23f4d437f..420ff4bc67fd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -135,10 +135,9 @@ config TRACING_SUPPORT depends on STACKTRACE_SUPPORT default y -if TRACING_SUPPORT - menuconfig FTRACE bool "Tracers" + depends on TRACING_SUPPORT default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. @@ -1037,6 +1036,3 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. endif # FTRACE - -endif # TRACING_SUPPORT - From 4aae683f132777a92b80d6971d56174d5f4cb3f3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 31 Jul 2021 14:22:32 +0900 Subject: [PATCH 07/35] tracing: Refactor TRACE_IRQFLAGS_SUPPORT in Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make architectures select TRACE_IRQFLAGS_SUPPORT instead of having many defines. Link: https://lkml.kernel.org/r/20210731052233.4703-2-masahiroy@kernel.org Acked-by: Heiko Carstens Acked-by: Vineet Gupta    #arch/arc Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Acked-by: Max Filippov Signed-off-by: Masahiro Yamada Signed-off-by: Steven Rostedt (VMware) --- arch/Kconfig | 3 +++ arch/arc/Kconfig | 4 +--- arch/arm/Kconfig | 5 +---- arch/arm64/Kconfig | 4 +--- arch/csky/Kconfig | 4 +--- arch/hexagon/Kconfig | 4 +--- arch/microblaze/Kconfig | 1 + arch/microblaze/Kconfig.debug | 5 ----- arch/mips/Kconfig | 1 + arch/mips/Kconfig.debug | 4 ---- arch/nds32/Kconfig | 4 +--- arch/nios2/Kconfig | 3 --- arch/openrisc/Kconfig | 4 +--- arch/parisc/Kconfig | 1 + arch/parisc/Kconfig.debug | 3 --- arch/powerpc/Kconfig | 5 +---- arch/riscv/Kconfig | 4 +--- arch/s390/Kconfig | 1 + arch/s390/Kconfig.debug | 3 --- arch/sh/Kconfig | 1 + arch/sh/Kconfig.debug | 3 --- arch/sparc/Kconfig | 1 + arch/sparc/Kconfig.debug | 4 ---- arch/um/Kconfig | 5 +---- arch/x86/Kconfig | 1 + arch/x86/Kconfig.debug | 3 --- arch/xtensa/Kconfig | 4 +--- 27 files changed, 21 insertions(+), 64 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 129df498a8e1..9471a0feecaf 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -197,6 +197,9 @@ config HAVE_FUNCTION_ERROR_INJECTION config HAVE_NMI bool +config TRACE_IRQFLAGS_SUPPORT + bool + # # An arch should select this if it provides all these things: # diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index d8f51eb8963b..0c81df3a5c7a 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -49,9 +49,7 @@ config ARC select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 select SET_FS - -config TRACE_IRQFLAGS_SUPPORT - def_bool y + select TRACE_IRQFLAGS_SUPPORT config LOCKDEP_SUPPORT def_bool y diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 82f908fa5676..3564647283e1 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -128,6 +128,7 @@ config ARM select RTC_LIB select SET_FS select SYS_SUPPORTS_APM_EMULATION + select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M # Above selects are sorted alphabetically; please add new ones # according to that. Thanks. help @@ -191,10 +192,6 @@ config LOCKDEP_SUPPORT bool default y -config TRACE_IRQFLAGS_SUPPORT - bool - default !CPU_V7M - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b5b13a932561..67b04ae5d010 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -221,6 +221,7 @@ config ARM64 select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD + select TRACE_IRQFLAGS_SUPPORT help ARM 64-bit (AArch64) Linux support. @@ -288,9 +289,6 @@ config ILLEGAL_POINTER_VALUE config LOCKDEP_SUPPORT def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 2716f6395ba7..9d4d898df76b 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -82,6 +82,7 @@ config CSKY select PCI_SYSCALL if PCI select PCI_MSI if PCI select SET_FS + select TRACE_IRQFLAGS_SUPPORT config LOCKDEP_SUPPORT def_bool y @@ -139,9 +140,6 @@ config STACKTRACE_SUPPORT config TIME_LOW_RES def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config CPU_TLB_SIZE int default "128" if (CPU_CK610 || CPU_CK807 || CPU_CK810) diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index e5a852080730..f993c4deaf23 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,6 +31,7 @@ config HEXAGON select GENERIC_CPU_DEVICES select SET_FS select ARCH_WANT_LD_ORPHAN_WARN + select TRACE_IRQFLAGS_SUPPORT help Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. @@ -52,9 +53,6 @@ config EARLY_PRINTK config MMU def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config GENERIC_CSUM def_bool y diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 14a67a42fcae..59798e43cdb0 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -44,6 +44,7 @@ config MICROBLAZE select SPARSE_IRQ select SET_FS select ZONE_DMA + select TRACE_IRQFLAGS_SUPPORT # Endianness selection choice diff --git a/arch/microblaze/Kconfig.debug b/arch/microblaze/Kconfig.debug index 865527ac332a..a4e40e534e6a 100644 --- a/arch/microblaze/Kconfig.debug +++ b/arch/microblaze/Kconfig.debug @@ -1,6 +1 @@ # SPDX-License-Identifier: GPL-2.0-only -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.rst. - -config TRACE_IRQFLAGS_SUPPORT - def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cee6087cd686..3017c9e8bb07 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -97,6 +97,7 @@ config MIPS select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select RTC_LIB select SYSCTL_EXCEPTION_TRACE + select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS select ARCH_HAS_ELFCORE_COMPAT diff --git a/arch/mips/Kconfig.debug b/arch/mips/Kconfig.debug index 43dbf5930796..f4ae7900fcd3 100644 --- a/arch/mips/Kconfig.debug +++ b/arch/mips/Kconfig.debug @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config EARLY_PRINTK bool "Early printk" if EXPERT depends on SYS_HAS_EARLY_PRINTK diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 62313902d75d..dd50123335d8 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -48,6 +48,7 @@ config NDS32 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select SET_FS + select TRACE_IRQFLAGS_SUPPORT help Andes(nds32) Linux support. @@ -64,9 +65,6 @@ config GENERIC_LOCKBREAK def_bool y depends on PREEMPTION -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index c24955c81c92..eabdc41f67f6 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -43,9 +43,6 @@ config NO_IOPORT_MAP config FPU def_bool n -config TRACE_IRQFLAGS_SUPPORT - def_bool n - menu "Kernel features" source "kernel/Kconfig.hz" diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 591acc5990dc..ebc5e7eafc5c 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -39,6 +39,7 @@ config OPENRISC select GENERIC_IRQ_MULTI_HANDLER select MMU_GATHER_NO_RANGE if MMU select SET_FS + select TRACE_IRQFLAGS_SUPPORT config CPU_BIG_ENDIAN def_bool y @@ -52,9 +53,6 @@ config GENERIC_HWEIGHT config NO_IOPORT_MAP def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - # For now, use generic checksum functions #These can be reimplemented in assembly later if so inclined config GENERIC_CSUM diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index bde9907bc5b2..57a0d0896ef6 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -67,6 +67,7 @@ config PARISC select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS select SET_FS + select TRACE_IRQFLAGS_SUPPORT help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index 1478ded0e247..f66554cd5c45 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -1,4 +1 @@ # SPDX-License-Identifier: GPL-2.0 - -config TRACE_IRQFLAGS_SUPPORT - def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d01e3401581d..76a28452c042 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -94,10 +94,6 @@ config STACKTRACE_SUPPORT bool default y -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config LOCKDEP_SUPPORT bool default y @@ -271,6 +267,7 @@ config PPC select STRICT_KERNEL_RWX if STRICT_MODULE_RWX select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS if !PPC64 # # Please keep this list sorted alphabetically. diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8fcceb8eda07..15f012b9c6d7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -109,6 +109,7 @@ config RISCV select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU select ZONE_DMA32 if 64BIT @@ -178,9 +179,6 @@ config ARCH_SUPPORTS_UPROBES config STACKTRACE_SUPPORT def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a0e2130f0100..8bce206e93a3 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -207,6 +207,7 @@ config S390 select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select TTY select VIRT_CPU_ACCOUNTING select ZONE_DMA diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index 9ea6e61d5858..e94a2a7f6bf4 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config EARLY_PRINTK def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 45a0549421cd..8efeffcdf4cf 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -72,6 +72,7 @@ config SUPERH select RTC_LIB select SET_FS select SPARSE_IRQ + select TRACE_IRQFLAGS_SUPPORT help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index 28a43d63bde1..958f790273ab 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config SH_STANDARD_BIOS bool "Use LinuxSH standard BIOS" help diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c5fa7932b550..fb8bdd40c72a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -50,6 +50,7 @@ config SPARC select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH select SET_FS + select TRACE_IRQFLAGS_SUPPORT config SPARC32 def_bool !64BIT diff --git a/arch/sparc/Kconfig.debug b/arch/sparc/Kconfig.debug index 50a918d496c8..6b2bec1888b3 100644 --- a/arch/sparc/Kconfig.debug +++ b/arch/sparc/Kconfig.debug @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config DEBUG_DCFLUSH bool "D-cache flush debugging" depends on SPARC64 && DEBUG_KERNEL diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 0561b73cfd9a..0a699440c4c5 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -20,6 +20,7 @@ config UML select GENERIC_CPU_DEVICES select HAVE_GCC_PLUGINS select SET_FS + select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c config MMU @@ -50,10 +51,6 @@ config ISA config SBUS bool -config TRACE_IRQFLAGS_SUPPORT - bool - default y - config LOCKDEP_SUPPORT bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 49270655e827..171faee05035 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,6 +261,7 @@ config X86 select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE) select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select TRACE_IRQFLAGS_SUPPORT select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select HAVE_ARCH_KCSAN if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 80b57e7f4947..d3a6f74a94bd 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -1,8 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config TRACE_IRQFLAGS_NMI_SUPPORT def_bool y diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2332b2156993..90aabb27f7d7 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -41,6 +41,7 @@ config XTENSA select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC select SET_FS + select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS help Xtensa processors are 32-bit RISC machines designed by Tensilica @@ -72,9 +73,6 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config TRACE_IRQFLAGS_SUPPORT - def_bool y - config MMU def_bool n From c3b1c377f0102e88dea6354d9cdb34a9d1c90971 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 2 Aug 2021 14:02:34 +0000 Subject: [PATCH 08/35] tracing: Fix a typo in tracepoint.h It should be @prev_pid, not @prev_prid. Link: https://lkml.kernel.org/r/20210802140234.5383-1-shijie@os.amperecomputing.com Signed-off-by: Huang Shijie Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index ab58696d0ddd..28031b15f878 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -475,7 +475,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * * * The declared 'local variable' is called '__entry' * * - * * __field(pid_t, prev_prid) is equivalent to a standard declaration: + * * __field(pid_t, prev_pid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * From e66ed86ca6c52488249e95f7b3a6a3d7d6ab5e1e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:21 +0900 Subject: [PATCH 09/35] tracing/boot: Add per-event histogram action options Add a hist-trigger action syntax support to boot-time tracing. Currently, boot-time tracing supports per-event actions as option strings. However, for the histogram action, it has a special syntax and usually needs a long action definition. To make it readable and fit to the bootconfig syntax, this introduces a new options for histogram. Here are the histogram action options for boot-time tracing. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { keys = [,...] values = [,...] sort = [,...] size = name = var { = ... } pause|continue|clear onmax|onchange { var = ; [= ] } onmatch { event = ; [= ] } filter = } Where is one of below; trace = , [, ...] save = [, ...] snapshot Link: https://lkml.kernel.org/r/162856124106.203126.10501871028479029087.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 231 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d713714cba67..3d0e51368f51 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -171,6 +171,231 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event) } #endif +#ifdef CONFIG_HIST_TRIGGERS +static int __init __printf(3, 4) +append_printf(char **bufp, char *end, const char *fmt, ...) +{ + va_list args; + int ret; + + if (*bufp == end) + return -ENOSPC; + + va_start(args, fmt); + ret = vsnprintf(*bufp, end - *bufp, fmt, args); + if (ret < end - *bufp) { + *bufp += ret; + } else { + *bufp = end; + ret = -ERANGE; + } + va_end(args); + + return ret; +} + +static int __init +append_str_nospace(char **bufp, char *end, const char *str) +{ + char *p = *bufp; + int len; + + while (p < end - 1 && *str != '\0') { + if (!isspace(*str)) + *(p++) = *str; + str++; + } + *p = '\0'; + if (p == end - 1) { + *bufp = end; + return -ENOSPC; + } + len = p - *bufp; + *bufp = p; + return (int)len; +} + +static int __init +trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, + char *end, const char *key) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + knode = xbc_node_find_child(hnode, key); + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode) { + pr_err("hist.%s requires value(s).\n", key); + return -EINVAL; + } + + append_printf(bufp, end, ":%s", key); + sep = '='; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '=') + sep = ','; + } + } else + return -ENOENT; + + return 0; +} + +static int __init +trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + /* Compose 'handler' parameter */ + p = xbc_node_find_value(hnode, param, NULL); + if (!p) { + pr_err("hist.%s requires '%s' option.\n", + xbc_node_get_data(hnode), param); + return -EINVAL; + } + append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p); + + /* Compose 'action' parameter */ + knode = xbc_node_find_child(hnode, "trace"); + if (!knode) + knode = xbc_node_find_child(hnode, "save"); + + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode || !xbc_node_is_value(anode)) { + pr_err("hist.%s.%s requires value(s).\n", + xbc_node_get_data(hnode), + xbc_node_get_data(knode)); + return -EINVAL; + } + + append_printf(bufp, end, ".%s", xbc_node_get_data(knode)); + sep = '('; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '(') + sep = ','; + } + append_printf(bufp, end, ")"); + } else if (xbc_node_find_child(hnode, "snapshot")) { + append_printf(bufp, end, ".snapshot()"); + } else { + pr_err("hist.%s requires an action.\n", + xbc_node_get_data(hnode)); + return -EINVAL; + } + + return 0; +} + +/* + * Histogram boottime tracing syntax. + * + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { + * keys = [,...] + * values = [,...] + * sort = [,...] + * size = + * name = + * var { = ... } + * pause|continue|clear + * onmax|onchange { var = ; [= ] } + * onmatch { event = ; [= ] } + * filter = + * } + * + * Where are; + * + * trace = , [, ...] + * save = [, ...] + * snapshot + */ +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node, *knode; + char *end = buf + size; + const char *p; + int ret = 0; + + append_printf(&buf, end, "hist"); + + ret = trace_boot_hist_add_array(hnode, &buf, end, "keys"); + if (ret < 0) { + if (ret == -ENOENT) + pr_err("hist requires keys.\n"); + return -EINVAL; + } + + ret = trace_boot_hist_add_array(hnode, &buf, end, "values"); + if (ret == -EINVAL) + return ret; + ret = trace_boot_hist_add_array(hnode, &buf, end, "sort"); + if (ret == -EINVAL) + return ret; + + p = xbc_node_find_value(hnode, "size", NULL); + if (p) + append_printf(&buf, end, ":size=%s", p); + + p = xbc_node_find_value(hnode, "name", NULL); + if (p) + append_printf(&buf, end, ":name=%s", p); + + node = xbc_node_find_child(hnode, "var"); + if (node) { + xbc_node_for_each_key_value(node, knode, p) { + /* Expression must not include spaces. */ + append_printf(&buf, end, ":%s=", + xbc_node_get_data(knode)); + append_str_nospace(&buf, end, p); + } + } + + /* Histogram control attributes (mutual exclusive) */ + if (xbc_node_find_child(hnode, "pause")) + append_printf(&buf, end, ":pause"); + else if (xbc_node_find_child(hnode, "continue")) + append_printf(&buf, end, ":continue"); + else if (xbc_node_find_child(hnode, "clear")) + append_printf(&buf, end, ":clear"); + + /* Histogram handler and actions */ + node = xbc_node_find_child(hnode, "onmax"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_child(hnode, "onchange"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_child(hnode, "onmatch"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0) + return -EINVAL; + + p = xbc_node_find_value(hnode, "filter", NULL); + if (p) + append_printf(&buf, end, " if %s", p); + + if (buf == end) { + pr_err("hist exceeds the max command length.\n"); + return -E2BIG; + } + + return 0; +} +#else +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + return -EOPNOTSUPP; +} +#endif + static void __init trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, struct xbc_node *enode) @@ -212,6 +437,12 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", buf); } + anode = xbc_node_find_child(enode, "hist"); + if (anode && + trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) { + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } } else if (xbc_node_find_value(enode, "actions", NULL)) pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); From 8993665abcce793f00815b3504a486dce70cc2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:29 +0900 Subject: [PATCH 10/35] tracing/boot: Support multiple handlers for per-event histogram Support multiple handlers for per-event histogram in boot-time tracing. Since the histogram can register multiple same handler-actions with different parameters, this expands the syntax to support such cases. With this update, the 'onmax', 'onchange' and 'onmatch' handler subkeys under per-event histogram option will take a number subkeys optionally as below. (see [.N]) ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { onmax|onchange[.N] { var = ; [= ] } onmatch[.N] { event = ; [= ] } } The 'N' must be a digit (or digit started word). Thus user can add several handler-actions to the histogram, for example, ftrace.event.SOMEGROUP.SOMEEVENT.hist { keys = SOME_ID; lat = common_timestamp.usecs-$ts0 onmatch.1 { event = GROUP1.STARTEVENT1 trace = latency_event, SOME_ID, $lat } onmatch.2 { event = GROUP2.STARTEVENT2 trace = latency_event, SOME_ID, $lat } } Then, it can trace the elapsed time from GROUP1.STARTEVENT1 to SOMEGROUP.SOMEEVENT, and from GROUP2.STARTEVENT2 to SOMEGROUP.SOMEEVENT with SOME_ID key. Link: https://lkml.kernel.org/r/162856124905.203126.14913731908137885922.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 43 +++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3d0e51368f51..f024f27b3602 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -245,8 +245,9 @@ trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, } static int __init -trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, - char *end, const char *param) +trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *handler, + const char *param) { struct xbc_node *knode, *anode; const char *p; @@ -259,7 +260,7 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, xbc_node_get_data(hnode), param); return -EINVAL; } - append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p); + append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ knode = xbc_node_find_child(hnode, "trace"); @@ -294,6 +295,32 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, return 0; } +static int __init +trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *node; + const char *p, *handler; + int ret; + + handler = xbc_node_get_data(hnode); + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param); + if (ret < 0) + break; + } + + if (xbc_node_find_child(hnode, param)) + ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); + + return ret; +} + /* * Histogram boottime tracing syntax. * @@ -305,8 +332,8 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, * name = * var { = ... } * pause|continue|clear - * onmax|onchange { var = ; [= ] } - * onmatch { event = ; [= ] } + * onmax|onchange[.N] { var = ; [= ] } + * onmatch[.N] { event = ; [= ] } * filter = * } * @@ -368,13 +395,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) /* Histogram handler and actions */ node = xbc_node_find_child(hnode, "onmax"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; node = xbc_node_find_child(hnode, "onchange"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; node = xbc_node_find_child(hnode, "onmatch"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; p = xbc_node_find_value(hnode, "filter", NULL); From 17abd7c36c77c393fa65cde462059395d6437dba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:36 +0900 Subject: [PATCH 11/35] tracing/boot: Support multiple histograms for each event Add multiple histograms support for each event. This allows user to set multiple histograms to an event. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { ... } The 'N' is a digit started string and it can be omitted for the default histogram. For example, multiple hist triggers example in the Documentation/trace/histogram.rst can be written as below; ftrace.event.net.netif_receive_skb.hist { 1 { keys = skbaddr.hex values = len filter = len < 0 } 2 { keys = skbaddr.hex values = len filter = len > 4096 } 3 { keys = skbaddr.hex values = len filter = len == 256 } 4 { keys = skbaddr.hex values = len } 5 { keys = len values = common_preempt_count } } Link: https://lkml.kernel.org/r/162856125628.203126.15846930277378572120.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 43 ++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index f024f27b3602..1a2b270e9cda 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -324,7 +324,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, /* * Histogram boottime tracing syntax. * - * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { * keys = [,...] * values = [,...] * sort = [,...] @@ -415,11 +415,37 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) return 0; } -#else -static int __init -trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) + +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) { - return -EOPNOTSUPP; + struct xbc_node *node; + const char *p; + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } + } + + if (xbc_node_find_child(hnode, "keys")) { + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } +} +#else +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) +{ + /* do nothing */ } #endif @@ -465,11 +491,8 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, pr_err("Failed to apply an action: %s\n", buf); } anode = xbc_node_find_child(enode, "hist"); - if (anode && - trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) { - if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); - } + if (anode) + trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); From 64dc7f6958ef56512137ed6ec228127cef7724e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:44 +0900 Subject: [PATCH 12/35] tracing/boot: Show correct histogram error command Since trigger_process_regex() modifies given trigger actions while parsing, the error message couldn't show what command was passed to the trigger_process_regex() when it returns an error. To fix that, show the backed up trigger action command instead of parsed buffer. Link: https://lkml.kernel.org/r/162856126413.203126.9465564928450701424.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1a2b270e9cda..1060b0446032 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -422,6 +422,7 @@ trace_boot_init_histograms(struct trace_event_file *file, { struct xbc_node *node; const char *p; + char *tmp; xbc_node_for_each_subkey(hnode, node) { p = xbc_node_get_data(node); @@ -429,15 +430,20 @@ trace_boot_init_histograms(struct trace_event_file *file, continue; /* All digit started node should be instances. */ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); } } if (xbc_node_find_child(hnode, "keys")) { - if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); + } } } #else @@ -488,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply an action: %s\n", buf); + pr_err("Failed to apply an action: %s\n", p); } anode = xbc_node_find_child(enode, "hist"); if (anode) From 559789539255bf17dfe2983b0f4d03b9d37a3395 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:51 +0900 Subject: [PATCH 13/35] Documentation: tracing: Add histogram syntax to boot-time tracing Add the documentation about histogram syntax in boot-time tracing. This will allow user to write the histogram setting in a structured parameters. Link: https://lkml.kernel.org/r/162856127129.203126.15551542847575916525.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/boottime-trace.rst | 85 ++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/Documentation/trace/boottime-trace.rst b/Documentation/trace/boottime-trace.rst index 8053898cfeb4..6dcfbc64014d 100644 --- a/Documentation/trace/boottime-trace.rst +++ b/Documentation/trace/boottime-trace.rst @@ -125,6 +125,71 @@ Note that kprobe and synthetic event definitions can be written under instance node, but those are also visible from other instances. So please take care for event name conflict. +Ftrace Histogram Options +------------------------ + +Since it is too long to write a histogram action as a string for per-event +action option, there are tree-style options under per-event 'hist' subkey +for the histogram actions. For the detail of the each parameter, +please read the event histogram document [3]_. + +.. [3] See :ref:`Documentation/trace/histogram.rst ` + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]keys = KEY1[, KEY2[...]] + Set histogram key parameters. (Mandatory) + The 'N' is a digit string for the multiple histogram. You can omit it + if there is one histogram on the event. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]values = VAL1[, VAL2[...]] + Set histogram value parameters. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]sort = SORT1[, SORT2[...]] + Set histogram sort parameter options. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]size = NR_ENTRIES + Set histogram size (number of entries). + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]name = NAME + Set histogram name. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]var.VARIABLE = EXPR + Define a new VARIABLE by EXPR expression. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.] + Set histogram control parameter. You can set one of them. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmatch.[M.]event = GROUP.EVENT + Set histogram 'onmatch' handler matching event parameter. + The 'M' is a digit string for the multiple 'onmatch' handler. You can omit it + if there is one 'onmatch' handler on this histogram. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmatch.[M.]trace = EVENT[, ARG1[...]] + Set histogram 'trace' action for 'onmatch'. + EVENT must be a synthetic event name, and ARG1... are parameters + for that event. Mandatory if 'onmatch.event' option is set. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onmax.[M.]var = VAR + Set histogram 'onmax' handler variable parameter. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.]onchange.[M.]var = VAR + Set histogram 'onchange' handler variable parameter. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.].[M.]save = ARG1[, ARG2[...]] + Set histogram 'save' action parameters for 'onmax' or 'onchange' handler. + This option or below 'snapshot' option is mandatory if 'onmax.var' or + 'onchange.var' option is set. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.[N.].[M.]snapshot + Set histogram 'snapshot' action for 'onmax' or 'onchange' handler. + This option or above 'save' option is mandatory if 'onmax.var' or + 'onchange.var' option is set. + +ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist.filter = FILTER_EXPR + Set histogram filter expression. You don't need 'if' in the FILTER_EXPR. + +Note that this 'hist' option can conflict with the per-event 'actions' +option if the 'actions' option has a histogram action. + When to Start ============= @@ -159,13 +224,23 @@ below:: } synthetic.initcall_latency { fields = "unsigned long func", "u64 lat" - actions = "hist:keys=func.sym,lat:vals=lat:sort=lat" + hist { + keys = func.sym, lat + values = lat + sort = lat + } } - initcall.initcall_start { - actions = "hist:keys=func:ts0=common_timestamp.usecs" + initcall.initcall_start.hist { + keys = func + var.ts0 = common_timestamp.usecs } - initcall.initcall_finish { - actions = "hist:keys=func:lat=common_timestamp.usecs-$ts0:onmatch(initcall.initcall_start).initcall_latency(func,$lat)" + initcall.initcall_finish.hist { + keys = func + var.lat = common_timestamp.usecs - $ts0 + onmatch { + event = initcall.initcall_start + trace = initcall_latency, func, $lat + } } } From 1d8365a553a7caeb75246682b0901ce24a335602 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:58 +0900 Subject: [PATCH 14/35] tools/bootconfig: Support per-group/all event enabling option Add group or all event enabling syntax support to bconf2ftrace.sh. User can pass a bootconfig file which includes ftrace[.instance.INSTANCE].event.enable and ftrace[.instance.INSTANCE].event.GROUP.enable correctly. Link: https://lkml.kernel.org/r/162856127850.203126.16694505101982548237.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/scripts/bconf2ftrace.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/bootconfig/scripts/bconf2ftrace.sh b/tools/bootconfig/scripts/bconf2ftrace.sh index feb30c2c7881..651049c782c0 100755 --- a/tools/bootconfig/scripts/bconf2ftrace.sh +++ b/tools/bootconfig/scripts/bconf2ftrace.sh @@ -101,6 +101,12 @@ setup_event() { # prefix group event [instance] else eventdir="$TRACEFS/events/$2/$3" fi + # group enable + if [ "$3" = "enable" ]; then + run_cmd "echo 1 > ${eventdir}" + return + fi + case $2 in kprobes) xbc_get_val ${branch}.probes | while read line; do @@ -127,6 +133,13 @@ setup_events() { # prefix("ftrace" or "ftrace.instance.INSTANCE") [instance] setup_event $prefix ${grpev%.*} ${grpev#*.} $2 done fi + if xbc_has_branch ${1}.event.enable; then + if [ "$2" ]; then + run_cmd "echo 1 > $TRACEFS/instances/$2/events/enable" + else + run_cmd "echo 1 > $TRACEFS/events/enable" + fi + fi } size2kb() { # size[KB|MB] From f134ebb28126d702c8d99e378fb3fb753e54b8f6 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:08:06 +0900 Subject: [PATCH 15/35] tools/bootconfig: Add histogram syntax support to bconf2ftrace.sh Add histogram syntax support to bconf2ftrace.sh script. Link: https://lkml.kernel.org/r/162856128672.203126.8240335908303312607.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/scripts/bconf2ftrace.sh | 88 ++++++++++++++++++++++++ tools/bootconfig/scripts/xbc.sh | 4 +- 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/tools/bootconfig/scripts/bconf2ftrace.sh b/tools/bootconfig/scripts/bconf2ftrace.sh index 651049c782c0..850c2073433e 100755 --- a/tools/bootconfig/scripts/bconf2ftrace.sh +++ b/tools/bootconfig/scripts/bconf2ftrace.sh @@ -94,6 +94,92 @@ compose_synth() { # event_name branch xbc_get_val $2 | while read field; do echo -n "$field; "; done } +print_hist_array() { # prefix key + __sep="=" + if xbc_has_key ${1}.${2}; then + echo -n ":$2" + xbc_get_val ${1}.${2} | while read field; do + echo -n "$__sep$field"; __sep="," + done + fi +} + +print_hist_action_array() { # prefix key + __sep="(" + echo -n ".$2" + xbc_get_val ${1}.${2} | while read field; do + echo -n "$__sep$field"; __sep="," + done + echo -n ")" +} + +print_hist_one_action() { # prefix handler param + echo -n ":${2}("`xbc_get_val ${1}.${3}`")" + if xbc_has_key "${1}.trace"; then + print_hist_action_array ${1} "trace" + elif xbc_has_key "${1}.save"; then + print_hist_action_array ${1} "save" + elif xbc_has_key "${1}.snapshot"; then + echo -n ".snapshot()" + fi +} + +print_hist_actions() { # prefix handler param + for __hdr in `xbc_subkeys ${1}.${2} 1 ".[0-9]"`; do + print_hist_one_action ${1}.${2}.$__hdr ${2} ${3} + done + if xbc_has_key ${1}.${2}.${3} ; then + print_hist_one_action ${1}.${2} ${2} ${3} + fi +} + +print_hist_var() { # prefix varname + echo -n ":${2}="`xbc_get_val ${1}.var.${2} | tr -d [:space:]` +} + +print_one_histogram() { # prefix + echo -n "hist" + print_hist_array $1 "keys" + print_hist_array $1 "values" + print_hist_array $1 "sort" + if xbc_has_key "${1}.size"; then + echo -n ":size="`xbc_get_val ${1}.size` + fi + if xbc_has_key "${1}.name"; then + echo -n ":name="`xbc_get_val ${1}.name` + fi + for __var in `xbc_subkeys "${1}.var" 1`; do + print_hist_var ${1} ${__var} + done + if xbc_has_key "${1}.pause"; then + echo -n ":pause" + elif xbc_has_key "${1}.continue"; then + echo -n ":continue" + elif xbc_has_key "${1}.clear"; then + echo -n ":clear" + fi + print_hist_actions ${1} "onmax" "var" + print_hist_actions ${1} "onchange" "var" + print_hist_actions ${1} "onmatch" "event" + + if xbc_has_key "${1}.filter"; then + echo -n " if "`xbc_get_val ${1}.filter` + fi +} + +setup_one_histogram() { # prefix trigger-file + run_cmd "echo '`print_one_histogram ${1}`' >> ${2}" +} + +setup_histograms() { # prefix trigger-file + for __hist in `xbc_subkeys ${1} 1 ".[0-9]"`; do + setup_one_histogram ${1}.$__hist ${2} + done + if xbc_has_key ${1}.keys; then + setup_one_histogram ${1} ${2} + fi +} + setup_event() { # prefix group event [instance] branch=$1.$2.$3 if [ "$4" ]; then @@ -121,6 +207,8 @@ setup_event() { # prefix group event [instance] set_value_of ${branch}.filter ${eventdir}/filter set_array_of ${branch}.actions ${eventdir}/trigger + setup_histograms ${branch}.hist ${eventdir}/trigger + if xbc_has_key ${branch}.enable; then run_cmd "echo 1 > ${eventdir}/enable" fi diff --git a/tools/bootconfig/scripts/xbc.sh b/tools/bootconfig/scripts/xbc.sh index b8c84e654556..1f0ebf50dd2d 100644 --- a/tools/bootconfig/scripts/xbc.sh +++ b/tools/bootconfig/scripts/xbc.sh @@ -49,8 +49,8 @@ xbc_has_branch() { # prefix-key grep -q "^$1" $XBC_TMPFILE } -xbc_subkeys() { # prefix-key depth +xbc_subkeys() { # prefix-key depth [subkey-pattern] __keys=`echo $1 | sed "s/\./ /g"` __s=`nr_args $__keys` - grep "^$1" $XBC_TMPFILE | cut -d= -f1| cut -d. -f$((__s + 1))-$((__s + $2)) | uniq + grep "^$1$3" $XBC_TMPFILE | cut -d= -f1| cut -d. -f$((__s + 1))-$((__s + $2)) | uniq } From 1eaad3ac3f399434cc50416455dda108aa4ea32e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:08:14 +0900 Subject: [PATCH 16/35] tools/bootconfig: Use per-group/all enable option in ftrace2bconf script Use per-group/all enable option instead of ftrace.events option. This will make the bootconfig file more readable. Link: https://lkml.kernel.org/r/162856129436.203126.12462564671412940618.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/scripts/ftrace2bconf.sh | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/bootconfig/scripts/ftrace2bconf.sh b/tools/bootconfig/scripts/ftrace2bconf.sh index a0c3bcc6da4f..fbaf07dc91bf 100755 --- a/tools/bootconfig/scripts/ftrace2bconf.sh +++ b/tools/bootconfig/scripts/ftrace2bconf.sh @@ -92,6 +92,10 @@ referred_vars() { grep "^hist" $1/trigger | grep -o '$[a-zA-Z0-9]*' } +event_is_enabled() { # enable-file + test -f $1 & grep -q "1" $1 +} + per_event_options() { # event-dir evdir=$1 # Check the special event which has no filter and no trigger @@ -113,7 +117,9 @@ per_event_options() { # event-dir emit_kv $PREFIX.event.$group.$event.actions += \'$action\' done - # enable is not checked; this is done by set_event in the instance. + if [ $GROUP_ENABLED -eq 0 ] && event_is_enabled $evdir/enable; then + emit_kv $PREFIX.event.$group.$event.enable + fi val=`cat $evdir/filter` if [ "$val" != "none" ]; then emit_kv $PREFIX.event.$group.$event.filter = "$val" @@ -137,8 +143,19 @@ event_options() { kprobe_event_options synth_event_options fi + ALL_ENABLED=0 + if event_is_enabled $INSTANCE/events/enable; then + emit_kv $PREFIX.event.enable + ALL_ENABLED=1 + fi for group in `ls $INSTANCE/events/` ; do [ ! -d $INSTANCE/events/$group ] && continue + GROUP_ENABLED=$ALL_ENABLED + if [ $ALL_ENABLED -eq 0 ] && \ + event_is_enabled $INSTANCE/events/$group/enable ;then + emit_kv $PREFIX.event.$group.enable + GROUP_ENABLED=1 + fi for event in `ls $INSTANCE/events/$group/` ;do [ ! -d $INSTANCE/events/$group/$event ] && continue per_event_options $INSTANCE/events/$group/$event @@ -226,11 +243,6 @@ instance_options() { # [instance-name] emit_kv $PREFIX.tracing_on = $val fi - val= - for i in `cat $INSTANCE/set_event`; do - val="$val, $i" - done - [ "$val" ] && emit_kv $PREFIX.events = "${val#,}" val=`cat $INSTANCE/current_tracer` [ $val != nop ] && emit_kv $PREFIX.tracer = $val if grep -qv "^#" $INSTANCE/set_ftrace_filter $INSTANCE/set_ftrace_notrace; then From 54b3498d71ae03f70cb7c366d269d0af49d29ec1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:08:22 +0900 Subject: [PATCH 17/35] bootconfig/tracing/ktest: Update ktest example for boot-time tracing Update ktest example for the boot-time tracing with histogram options. Note that since the histogram option uses "trace()" action instead of "EVENT()", this updates the matching pattern too. Link: https://lkml.kernel.org/r/162856130208.203126.4458319094852152589.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../examples/bootconfigs/boottrace.bconf | 20 ++++++++++++++----- .../examples/bootconfigs/verify-boottrace.sh | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/testing/ktest/examples/bootconfigs/boottrace.bconf b/tools/testing/ktest/examples/bootconfigs/boottrace.bconf index 9db64ec589d5..7aa706cccb3b 100644 --- a/tools/testing/ktest/examples/bootconfigs/boottrace.bconf +++ b/tools/testing/ktest/examples/bootconfigs/boottrace.bconf @@ -10,13 +10,23 @@ ftrace.event { } synthetic.initcall_latency { fields = "unsigned long func", "u64 lat" - actions = "hist:keys=func.sym,lat:vals=lat:sort=lat" + hist { + keys = func.sym,lat + values = lat + sort = lat + } } - initcall.initcall_start { - actions = "hist:keys=func:ts0=common_timestamp.usecs" + initcall.initcall_start.hist { + keys = func; + var.ts0 = common_timestamp.usecs } - initcall.initcall_finish { - actions = "hist:keys=func:lat=common_timestamp.usecs-$ts0:onmatch(initcall.initcall_start).initcall_latency(func,$lat)" + initcall.initcall_finish.hist { + keys = func + var.lat = common_timestamp.usecs - $ts0 + onmatch { + event = initcall.initcall_start + trace = initcall_latency, func, $lat + } } } diff --git a/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh b/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh index f271940ce7fb..233e95cfcf20 100755 --- a/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh +++ b/tools/testing/ktest/examples/bootconfigs/verify-boottrace.sh @@ -58,7 +58,7 @@ compare_file_partial "events/synthetic/initcall_latency/enable" "0" compare_file_partial "events/initcall/initcall_start/trigger" "hist:keys=func:vals=hitcount:ts0=common_timestamp.usecs" compare_file_partial "events/initcall/initcall_start/enable" "1" -compare_file_partial "events/initcall/initcall_finish/trigger" 'hist:keys=func:vals=hitcount:lat=common_timestamp.usecs-\$ts0:sort=hitcount:size=2048:clock=global:onmatch(initcall.initcall_start).initcall_latency(func,\$lat)' +compare_file_partial "events/initcall/initcall_finish/trigger" 'hist:keys=func:vals=hitcount:lat=common_timestamp.usecs-\$ts0:sort=hitcount:size=2048:clock=global:onmatch(initcall.initcall_start).trace(initcall_latency,func,\$lat)' compare_file_partial "events/initcall/initcall_finish/enable" "1" compare_file "instances/foo/current_tracer" "function" From bd74095389b378f2292ae32397b23f4b97ff23c8 Mon Sep 17 00:00:00 2001 From: zhaoxiao Date: Mon, 16 Aug 2021 13:24:30 +0800 Subject: [PATCH 18/35] tracepoint: Fix kerneldoc comments Fix function name in tracepoint.c kernel-doc comment to remove a warning found by clang_w1. kernel/tracepoint.c:589: warning: expecting prototype for register_tracepoint_notifier(). Prototype was for register_tracepoint_module_notifier() instead kernel/tracepoint.c:613: warning: expecting prototype for unregister_tracepoint_notifier(). Prototype was for unregister_tracepoint_module_notifier() instead Link: https://lkml.kernel.org/r/20210816052430.16539-1-zhaoxiao@uniontech.com Acked-by: Mathieu Desnoyers Signed-off-by: zhaoxiao Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index efd14c79fab4..64ea283f2f86 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -577,7 +577,7 @@ bool trace_module_has_bad_taint(struct module *mod) static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** - * register_tracepoint_notifier - register tracepoint coming/going notifier + * register_tracepoint_module_notifier - register tracepoint coming/going notifier * @nb: notifier block * * Notifiers registered with this function are called on module @@ -603,7 +603,7 @@ end: EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** - * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier + * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier * @nb: notifier block * * The notifier block callback should expect a "struct tp_module" data From db396be6ddc4b953db3a725e19945fd4f151cfe9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 13:47:48 -0400 Subject: [PATCH 19/35] MAINTAINERS: Add an entry for os noise/latency The "latency" tracers have some different requirements than normal tracing, and also includes Daniel as a maintainer. Add a section in the MAINTAINERS file to help direct patches and bug reports to these tracers to the right people. Signed-off-by: Steven Rostedt (VMware) --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6c8be735cc91..4268644cf86a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18798,6 +18798,20 @@ F: arch/x86/mm/testmmiotrace.c F: include/linux/mmiotrace.h F: kernel/trace/trace_mmiotrace.c +TRACING OS NOISE / LATENCY TRACERS +M: Steven Rostedt +M: Daniel Bristot de Oliveira +S: Maintained +F: kernel/trace/trace_osnoise.c +F: include/trace/events/osnoise.h +F: kernel/trace/trace_hwlat.c +F: kernel/trace/trace_irqsoff.c +F: kernel/trace/trace_sched_wakeup.c +F: Documentation/trace/osnoise-tracer.rst +F: Documentation/trace/timerlat-tracer.rst +F: Documentation/trace/hwlat_detector.rst +F: arch/*/kernel/trace.c + TRIVIAL PATCHES M: Jiri Kosina S: Maintained From 99c37d1a63eafcd3673302a7953df760b46d0f6f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:19 +0200 Subject: [PATCH 20/35] tracing: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Link: https://lkml.kernel.org/r/20210803141621.780504-37-bigeasy@linutronix.de Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: Daniel Bristot de Oliveira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace_hwlat.c | 28 ++++++++++++++-------------- kernel/trace/trace_osnoise.c | 16 ++++++++-------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e592d1df6f88..c5a3fbf19617 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2111,7 +2111,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } } - get_online_cpus(); + cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2143,7 +2143,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - put_online_cpus(); + cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2171,7 +2171,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - get_online_cpus(); + cpus_read_lock(); /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) @@ -2183,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - put_online_cpus(); + cpus_read_unlock(); } out: diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 14f46aae1981..1b83d75eb103 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,10 +325,10 @@ static void move_to_next_cpu(void) if (!cpumask_equal(current_mask, current->cpus_ptr)) goto change_mode; - get_online_cpus(); + cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); - put_online_cpus(); + cpus_read_unlock(); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); @@ -398,7 +398,7 @@ static void stop_single_kthread(void) struct hwlat_kthread_data *kdata = get_cpu_data(); struct task_struct *kthread; - get_online_cpus(); + cpus_read_lock(); kthread = kdata->kthread; if (!kthread) @@ -408,7 +408,7 @@ static void stop_single_kthread(void) kdata->kthread = NULL; out_put_cpus: - put_online_cpus(); + cpus_read_unlock(); } @@ -425,14 +425,14 @@ static int start_single_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - get_online_cpus(); + cpus_read_lock(); if (kdata->kthread) goto out_put_cpus; kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - put_online_cpus(); + cpus_read_unlock(); return -ENOMEM; } @@ -452,7 +452,7 @@ static int start_single_kthread(struct trace_array *tr) wake_up_process(kthread); out_put_cpus: - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -479,10 +479,10 @@ static void stop_per_cpu_kthreads(void) { unsigned int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) stop_cpu_kthread(cpu); - put_online_cpus(); + cpus_read_unlock(); } /* @@ -515,7 +515,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) mutex_lock(&trace_types_lock); mutex_lock(&hwlat_data.lock); - get_online_cpus(); + cpus_read_lock(); if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; @@ -526,7 +526,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) start_cpu_kthread(cpu); out_unlock: - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&hwlat_data.lock); mutex_unlock(&trace_types_lock); } @@ -582,7 +582,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) unsigned int cpu; int retval; - get_online_cpus(); + cpus_read_lock(); /* * Run only on CPUs in which hwlat is allowed to run. */ @@ -596,12 +596,12 @@ static int start_per_cpu_kthreads(struct trace_array *tr) if (retval) goto out_error; } - put_online_cpus(); + cpus_read_unlock(); return 0; out_error: - put_online_cpus(); + cpus_read_unlock(); stop_per_cpu_kthreads(); return retval; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b61eefe5ccf5..65b08b8e5bf8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1498,12 +1498,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) stop_kthread(cpu); - put_online_cpus(); + cpus_read_unlock(); } /* @@ -1551,7 +1551,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) int retval; int cpu; - get_online_cpus(); + cpus_read_lock(); /* * Run only on CPUs in which trace and osnoise are allowed to run. */ @@ -1572,7 +1572,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) } } - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1590,7 +1590,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) goto out_unlock_trace; mutex_lock(&interface_lock); - get_online_cpus(); + cpus_read_lock(); if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; @@ -1601,7 +1601,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) start_kthread(cpu); out_unlock: - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&interface_lock); out_unlock_trace: mutex_unlock(&trace_types_lock); @@ -1743,11 +1743,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, /* * osnoise_cpumask is read by CPU hotplug operations. */ - get_online_cpus(); + cpus_read_lock(); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&interface_lock); if (running) From 8b0e6c744fef6462382041b30878c91f15069fc6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:56 -0400 Subject: [PATCH 21/35] tracing: Add DYNAMIC flag for dynamic events To differentiate between static and dynamic events, add a new flag DYNAMIC to the event flags that all dynamic events have set. This will allow to differentiate when attaching to a dynamic event from a static event. Static events have a mod pointer that references the module they were created in (or NULL for core kernel). This can be incremented when the event has something attached to it. But there exists no such mechanism for dynamic events. This is dangerous as the dynamic events may now disappear without the "attachment" knowing that it no longer exists. To enforce the dynamic flag, change dyn_event_add() to pass the event that is being created such that it can set the DYNAMIC flag of the event. This helps make sure that no location that creates a dynamic event misses setting this flag. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035026.936958254@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 3 +++ kernel/trace/trace_dynevent.h | 4 +++- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..53c9dffd87fd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -310,6 +310,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, }; @@ -321,6 +322,7 @@ enum { * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint + * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe */ @@ -330,6 +332,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 7754936b57ee..936477a111d3 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops) return 0; } -static inline int dyn_event_add(struct dyn_event *ev) +static inline int dyn_event_add(struct dyn_event *ev, + struct trace_event_call *call) { lockdep_assert_held(&event_mutex); if (!ev || !ev->ops) return -EINVAL; + call->flags |= TRACE_EVENT_FL_DYNAMIC; list_add_tail(&ev->list, &dyn_event_list); return 0; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 9315fc03e303..f4f5489e1e28 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1298,7 +1298,7 @@ static int __create_synth_event(const char *name, const char *raw_fields) } ret = register_synth_event(event); if (!ret) - dyn_event_add(&event->devent); + dyn_event_add(&event->devent, &event->call); else free_synth_event(event); out: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ea6178cb5e33..bfef43bfce37 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -618,7 +618,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) if (ret) trace_probe_unlink(&tk->tp); else - dyn_event_add(&tk->devent); + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); return ret; } @@ -661,7 +661,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (ret < 0) unregister_kprobe_event(tk); else - dyn_event_add(&tk->devent); + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); end: mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9b50869a5ddb..50eca53b8d22 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -455,7 +455,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) /* Append to existing event */ ret = trace_probe_append(&tu->tp, &to->tp); if (!ret) - dyn_event_add(&tu->devent); + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); return ret; } @@ -518,7 +518,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) goto end; } - dyn_event_add(&tu->devent); + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); end: mutex_unlock(&event_mutex); From 1d18538e6a09265003a0a94ca779d7a6127cb76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:57 -0400 Subject: [PATCH 22/35] tracing: Have dynamic events have a ref counter As dynamic events are not created by modules, if something is attached to one, calling "try_module_get()" on its "mod" field, is not going to keep the dynamic event from going away. Since dynamic events do not need the "mod" pointer of the event structure, make a union out of it in order to save memory (there's one structure for each of the thousand+ events in the kernel), and have any event with the DYNAMIC flag set to use a ref counter instead. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.174869074@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 45 ++++++++++++++++++++++++++++- kernel/trace/trace.c | 4 +-- kernel/trace/trace_dynevent.c | 38 ++++++++++++++++++++++++ kernel/trace/trace_event_perf.c | 6 ++-- kernel/trace/trace_events.c | 22 +++++++++----- kernel/trace/trace_events_synth.c | 19 +++++++----- kernel/trace/trace_events_trigger.c | 6 ++-- kernel/trace/trace_kprobe.c | 4 +++ kernel/trace/trace_uprobe.c | 4 +++ 9 files changed, 124 insertions(+), 24 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 53c9dffd87fd..9564c4d9a3b6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -350,7 +350,14 @@ struct trace_event_call { struct trace_event event; char *print_fmt; struct event_filter *filter; - void *mod; + /* + * Static events can disappear with modules, + * where as dynamic ones need their own ref count. + */ + union { + void *module; + atomic_t refcnt; + }; void *data; /* See the TRACE_EVENT_FL_* flags above */ @@ -366,6 +373,42 @@ struct trace_event_call { #endif }; +#ifdef CONFIG_DYNAMIC_EVENTS +bool trace_event_dyn_try_get_ref(struct trace_event_call *call); +void trace_event_dyn_put_ref(struct trace_event_call *call); +bool trace_event_dyn_busy(struct trace_event_call *call); +#else +static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call) +{ + /* Without DYNAMIC_EVENTS configured, nothing should be calling this */ + return false; +} +static inline void trace_event_dyn_put_ref(struct trace_event_call *call) +{ +} +static inline bool trace_event_dyn_busy(struct trace_event_call *call) +{ + /* Nothing should call this without DYNAIMIC_EVENTS configured. */ + return true; +} +#endif + +static inline bool trace_event_try_get_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + return trace_event_dyn_try_get_ref(call); + else + return try_module_get(call->module); +} + +static inline void trace_event_put_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + trace_event_dyn_put_ref(call); + else + module_put(call->module); +} + #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be0169594de5..8425c3d70895 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,11 +3697,11 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str) return false; event = container_of(trace_event, struct trace_event_call, event); - if (!event->mod) + if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) return false; /* Would rather have rodata, but this will suffice */ - if (within_module_core(addr, event->mod)) + if (within_module_core(addr, event->module)) return true; return false; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e57cc0870892..1110112e55bd 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -13,11 +13,49 @@ #include #include "trace.h" +#include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" static DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); +bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) +{ + struct trace_event_call *call; + bool ret = false; + + if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC))) + return false; + + down_read(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { + if (call == dyn_call) { + atomic_inc(&dyn_call->refcnt); + ret = true; + } + } + up_read(&trace_event_sem); + return ret; +} + +void trace_event_dyn_put_ref(struct trace_event_call *call) +{ + if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC))) + return; + + if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) { + atomic_set(&call->refcnt, 0); + return; + } + + atomic_dec(&call->refcnt); +} + +bool trace_event_dyn_busy(struct trace_event_call *call) +{ + return atomic_read(&call->refcnt) != 0; +} + int dyn_event_register(struct dyn_event_operations *ops) { if (!ops || !ops->create || !ops->show || !ops->is_busy || diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 03be4435d103..6aed10e2f7ce 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) } } out: - module_put(tp_event->mod); + trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event) list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && tp_event->class && tp_event->class->reg && - try_module_get(tp_event->mod)) { + trace_event_try_get_ref(tp_event)) { ret = perf_trace_event_init(tp_event, p_event); if (ret) - module_put(tp_event->mod); + trace_event_put_ref(tp_event); break; } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 80e96989770e..1349b6de5eeb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2525,7 +2525,10 @@ __register_event(struct trace_event_call *call, struct module *mod) return ret; list_add(&call->list, &ftrace_events); - call->mod = mod; + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + atomic_set(&call->refcnt, 0); + else + call->module = mod; return 0; } @@ -2839,7 +2842,9 @@ static void trace_module_remove_events(struct module *mod) down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) + if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) + continue; + if (call->module == mod) __trace_remove_event_call(call); } up_write(&trace_event_sem); @@ -2982,7 +2987,7 @@ struct trace_event_file *trace_get_event_file(const char *instance, } /* Don't let event modules unload while in use */ - ret = try_module_get(file->event_call->mod); + ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); ret = -EBUSY; @@ -3012,7 +3017,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file); void trace_put_event_file(struct trace_event_file *file) { mutex_lock(&event_mutex); - module_put(file->event_call->mod); + trace_event_put_ref(file->event_call); mutex_unlock(&event_mutex); trace_array_put(file->tr); @@ -3147,7 +3152,7 @@ static int free_probe_data(void *data) if (!edata->ref) { /* Remove the SOFT_MODE flag */ __ftrace_event_enable_disable(edata->file, 0, 1); - module_put(edata->file->event_call->mod); + trace_event_put_ref(edata->file->event_call); kfree(edata); } return 0; @@ -3280,7 +3285,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ - ret = try_module_get(file->event_call->mod); + ret = trace_event_try_get_ref(file->event_call); if (!ret) { ret = -EBUSY; goto out_free; @@ -3310,7 +3315,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: - module_put(file->event_call->mod); + trace_event_put_ref(file->event_call); out_free: kfree(data); goto out; @@ -3376,7 +3381,8 @@ void __trace_early_add_events(struct trace_array *tr) list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ - if (WARN_ON_ONCE(call->mod)) + if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && + WARN_ON_ONCE(call->module)) continue; ret = __trace_early_add_new_event(call, tr); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f4f5489e1e28..d54094b7a9d7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1369,13 +1369,15 @@ static int destroy_synth_event(struct synth_event *se) int ret; if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } + return -EBUSY; + + if (trace_event_dyn_busy(&se->call)) + return -EBUSY; + + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); } return ret; @@ -2102,6 +2104,9 @@ static int synth_event_release(struct dyn_event *ev) if (event->ref) return -EBUSY; + if (trace_event_dyn_busy(&event->call)) + return -EBUSY; + ret = unregister_synth_event(event); if (ret) return ret; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cf84d0f6583a..6b11e335a62e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1334,7 +1334,7 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, if (!data->ref) { /* Remove the SOFT_MODE flag */ trace_event_enable_disable(enable_data->file, 0, 1); - module_put(enable_data->file->event_call->mod); + trace_event_put_ref(enable_data->file->event_call); trigger_data_free(data); kfree(enable_data); } @@ -1481,7 +1481,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_reg: /* Don't let event modules unload while probe registered */ - ret = try_module_get(event_enable_file->event_call->mod); + ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { ret = -EBUSY; goto out_free; @@ -1510,7 +1510,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: - module_put(event_enable_file->event_call->mod); + trace_event_put_ref(event_enable_file->event_call); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bfef43bfce37..82c3b86013b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -543,6 +543,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp))) + return -EBUSY; + /* Will fail if probe is being used by ftrace or perf */ if (unregister_kprobe_event(tk)) return -EBUSY; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 50eca53b8d22..1e2a92e7607d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -393,6 +393,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) if (trace_probe_has_sibling(&tu->tp)) goto unreg; + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp))) + return -EBUSY; + ret = unregister_uprobe_event(tu); if (ret) return ret; From fcd9db51df8e219e3a61b14e9b8c5ee67d39d37c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:58 -0400 Subject: [PATCH 23/35] tracing/probe: Have traceprobe_parse_probe_arg() take a const arg The two places that call traceprobe_parse_probe_arg() allocate a temporary buffer to copy the argv[i] into, because argv[i] is constant and the traceprobe_parse_probe_arg() will modify it to do the parsing. These two places allocate this buffer and then free it right after calling this function, leaving the onus of this allocation to the caller. As there's about to be a third user of this function that will have to do the same thing, instead of having the caller allocate the temporary buffer, simply move that allocation into the traceprobe_parse_probe_arg() itself, which will simplify the code of the callers. Link: https://lkml.kernel.org/r/20210817035027.385422828@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +------ kernel/trace/trace_probe.c | 47 ++++++++++++++++++++++--------------- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_uprobe.c | 9 +------ 4 files changed, 31 insertions(+), 36 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 82c3b86013b2..ed1e3c2087ab 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -873,15 +873,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); - kfree(tmp); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 15413ad7cef2..ef717b373443 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -540,26 +540,34 @@ static int __parse_bitfield_probe_arg(const char *bf, } /* String length checking wrapper */ -static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; + char *arg; int ret, len; + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { trace_probe_log_err(offset, ARG_TOO_LONG); - return -EINVAL; + goto out; } else if (len == 0) { trace_probe_log_err(offset, NO_ARG_BODY); - return -EINVAL; + goto out; } + ret = -ENOMEM; parg->comm = kstrdup(arg, GFP_KERNEL); if (!parg->comm) - return -ENOMEM; + goto out; + ret = -EINVAL; t = strchr(arg, ':'); if (t) { *t = '\0'; @@ -571,22 +579,22 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, offset += t2 + strlen(t2) - arg; trace_probe_log_err(offset, ARRAY_NO_CLOSE); - return -EINVAL; + goto out; } else if (t3[1] != '\0') { trace_probe_log_err(offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); - return -EINVAL; + goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { trace_probe_log_err(offset + t2 - arg, BAD_ARRAY_NUM); - return -EINVAL; + goto out; } if (parg->count > MAX_ARRAY_LEN) { trace_probe_log_err(offset + t2 - arg, ARRAY_TOO_BIG); - return -EINVAL; + goto out; } } } @@ -598,29 +606,30 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) - return -EINVAL; + goto out; parg->type = find_fetch_type("string"); } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); - return -EINVAL; + goto out; } parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + ret = -ENOMEM; if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); if (!parg->fmt) - return -ENOMEM; + goto out; snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, parg->count); } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) - return -ENOMEM; + goto out; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], @@ -628,6 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (ret) goto fail; + ret = -EINVAL; /* Store operation */ if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { @@ -636,7 +646,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || @@ -650,7 +659,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } } @@ -672,7 +680,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -687,6 +694,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; } } + ret = -EINVAL; /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && @@ -694,13 +702,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, scode->op != FETCH_OP_ST_USTRING) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -709,6 +715,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; code->op = FETCH_OP_END; + ret = 0; /* Shrink down the code buffer */ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) @@ -724,6 +731,8 @@ fail: kfree(code->data); } kfree(tmp); +out: + kfree(arg); return ret; } @@ -745,11 +754,11 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } -int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, unsigned int flags) { struct probe_arg *parg = &tp->args[i]; - char *body; + const char *body; /* Increment count for freeing args in error case */ tp->nr_args++; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 227d518e5ba5..42aa084902fa 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -354,7 +354,7 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_MASK GENMASK(2, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - char *arg, unsigned int flags); + const char *argv, unsigned int flags); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1e2a92e7607d..93ff96541971 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -684,16 +684,9 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], is_return ? TPARG_FL_RETURN : 0); - kfree(tmp); if (ret) goto error; } From bc1b973455fd5d84dac4a094da44202f2d8a98ef Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:59 -0400 Subject: [PATCH 24/35] tracing/probes: Allow for dot delimiter as well as slash for system names Kprobe and uprobe events can add a "system" to the events that are created via the kprobe_events and uprobe_events files respectively. If they do not include a "system" in the name, then the default "kprobes" or "uprobes" is used. The current notation to specify a system for one of these probe events is to add a '/' delimiter in the name, where the content before the '/' will be the system to use, and the content after will be the event name. echo 'p:my_system/my_event' > kprobe_events But this is inconsistent with the way histogram triggers separate their system / event names. The histogram triggers use a '.' delimiter, which can be confusing. To allow this to be more consistent, as well as keep backward compatibility, allow the kprobe and uprobe events to denote a system name with either a '/' or a '.'. That is: echo 'p:my_system/my_event' > kprobe_events is equivalent to: echo 'p:my_system.my_event' > kprobe_events Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.580493202@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ef717b373443..0916a9964719 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -233,6 +233,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, int len; slash = strchr(event, '/'); + if (!slash) + slash = strchr(event, '.'); + if (slash) { if (slash == event) { trace_probe_log_err(offset, NO_GROUP_NAME); From 845cbf3e11acc263ec7a46a89097d88e7e50a9ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:43:00 -0400 Subject: [PATCH 25/35] tracing/probes: Use struct_size() instead of defining custom macros Remove SIZEOF_TRACE_KPROBE() and SIZEOF_TRACE_UPROBE() and use struct_size() as that's what it is made for. No need to have custom macros. Especially since struct_size() has some extra memory checks for correctness. Link: https://lkml.kernel.org/r/20210817035027.795000217@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +----- kernel/trace/trace_uprobe.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ed1e3c2087ab..ca726c9d0859 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -80,10 +80,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) -#define SIZEOF_TRACE_KPROBE(n) \ - (offsetof(struct trace_kprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -265,7 +261,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, struct trace_kprobe *tk; int ret = -ENOMEM; - tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); if (!tk) return ERR_PTR(ret); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 93ff96541971..590bb9a02f8d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -83,10 +83,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); @@ -340,7 +336,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) struct trace_uprobe *tu; int ret; - tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); From 007517a01995fb24f2f4effc9cf34814361a9d10 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:27 -0400 Subject: [PATCH 26/35] tracing/probe: Change traceprobe_set_print_fmt() to take a type Instead of a boolean "is_return" have traceprobe_set_print_fmt() take a type (currently just PROBE_PRINT_NORMAL and PROBE_PRINT_RETURN). This will simplify adding different types. For example, the development of the event_probe, will need its own type as it prints an event, and not an IP. Link: https://lkml.kernel.org/r/20210819041842.104626301@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++-- kernel/trace/trace_probe.c | 18 ++++++++++++------ kernel/trace/trace_probe.h | 7 ++++++- kernel/trace/trace_uprobe.c | 8 ++++++-- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ca726c9d0859..c6fe7a6e3f35 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -742,6 +742,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) bool is_return = false; char *symbol = NULL, *tmp = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + enum probe_print_type ptype; int maxactive = 0; long offset = 0; void *addr = NULL; @@ -875,7 +876,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } - ret = traceprobe_set_print_fmt(&tk->tp, is_return); + ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) goto error; @@ -1799,6 +1801,7 @@ struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { + enum probe_print_type ptype; struct trace_kprobe *tk; int ret; char *event; @@ -1822,7 +1825,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk); - if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ptype = trace_kprobe_is_return(tk) ? + PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 0916a9964719..9c9c83a063b2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -851,19 +851,25 @@ int traceprobe_update_arg(struct probe_arg *arg) /* When len=0, we just calculate the needed length */ #define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, - bool is_return) + enum probe_print_type ptype) { struct probe_arg *parg; int i, j; int pos = 0; const char *fmt, *arg; - if (!is_return) { + switch (ptype) { + case PROBE_PRINT_NORMAL: fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; - } else { + break; + case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + break; + default: + WARN_ON_ONCE(1); + return 0; } pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); @@ -912,20 +918,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, } #undef LEN_OR_ZERO -int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype) { struct trace_event_call *call = trace_probe_event_call(tp); int len; char *print_fmt; /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0, is_return); + len = __set_print_fmt(tp, NULL, 0, ptype); print_fmt = kmalloc(len + 1, GFP_KERNEL); if (!print_fmt) return -ENOMEM; /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1, is_return); + __set_print_fmt(tp, print_fmt, len + 1, ptype); call->print_fmt = print_fmt; return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 42aa084902fa..8adf5f3542a6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -363,7 +363,12 @@ extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset); -extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); +enum probe_print_type { + PROBE_PRINT_NORMAL, + PROBE_PRINT_RETURN, +}; + +extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 590bb9a02f8d..09f8ca7f7ba0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -536,6 +536,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; bool is_return = false; @@ -687,7 +688,8 @@ static int __trace_uprobe_create(int argc, const char **argv) goto error; } - ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)); + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) goto error; @@ -1578,6 +1580,7 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, unsigned long ref_ctr_offset, bool is_return) { + enum probe_print_type ptype; struct trace_uprobe *tu; struct path path; int ret; @@ -1612,7 +1615,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu); - if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) { ret = -ENOMEM; goto error; } From 8565a45d0858078b63c7d84074a21a42ba9ebf01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:28 -0400 Subject: [PATCH 27/35] tracing/probes: Have process_fetch_insn() take a void * instead of pt_regs In preparation to allow event probes to use the process_fetch_insn() callback in trace_probe_tmpl.h, change the data passed to it from a pointer to pt_regs, as the event probe will not be using regs, and make it a void pointer instead. Update the process_fetch_insn() callers for kprobe and uprobe events to have the regs defined in the function and just typecast the void pointer parameter. Link: https://lkml.kernel.org/r/20210819041842.291622924@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 3 ++- kernel/trace/trace_probe_tmpl.h | 6 +++--- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6fe7a6e3f35..4b013d24f5a9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1325,9 +1325,10 @@ probe_mem_read(void *dest, void *src, size_t size) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; retry: diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index f003c5d02a3a..b3bdb8ddb862 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) * If dest is NULL, don't store result and return required dynamic data size. */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base); static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int @@ -188,7 +188,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, +store_trace_args(void *data, struct trace_probe *tp, void *rec, int header_size, int maxlen) { struct probe_arg *arg; @@ -203,7 +203,7 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, /* Point the dynamic data area if needed */ if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); - ret = process_fetch_insn(arg->code, regs, dl, base); + ret = process_fetch_insn(arg->code, rec, dl, base); if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); } else { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 09f8ca7f7ba0..d219ba50efbd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -213,9 +213,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; /* 1st stage: get value from context */ From 8e242060c6a4947e8ae7d29794af6a581db08841 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Aug 2021 19:26:02 +0900 Subject: [PATCH 28/35] tracing/probes: Reject events which have the same name of existing one Since kprobe_events and uprobe_events only check whether the other same-type probe event has the same name or not, if the user gives the same name of the existing tracepoint event (or the other type of probe events), it silently fails to create the tracefs entry (but registered.) as below. /sys/kernel/tracing # ls events/task/task_rename enable filter format hist id trigger /sys/kernel/tracing # echo p:task/task_rename vfs_read >> kprobe_events [ 113.048508] Could not create tracefs 'task_rename' directory /sys/kernel/tracing # cat kprobe_events p:task/task_rename vfs_read To fix this issue, check whether the existing events have the same name or not in trace_probe_register_event_call(). If exists, it rejects to register the new event. Link: https://lkml.kernel.org/r/162936876189.187130.17558311387542061930.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +++++- kernel/trace/trace_probe.c | 25 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 6 +++++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4b013d24f5a9..882c27044029 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -647,7 +647,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9c9c83a063b2..782c00eb6859 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1047,11 +1047,36 @@ error: return ret; } +static struct trace_event_call * +find_trace_event_call(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + return tp_event; + } + + return NULL; +} + int trace_probe_register_event_call(struct trace_probe *tp) { struct trace_event_call *call = trace_probe_event_call(tp); int ret; + lockdep_assert_held(&event_mutex); + + if (find_trace_event_call(trace_probe_group_name(tp), + trace_probe_name(tp))) + return -EEXIST; + ret = register_trace_event(&call->event); if (!ret) return -ENODEV; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8adf5f3542a6..66701a92d186 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -404,6 +404,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_NAME, "Event name is not specified"), \ C(EVENT_TOO_LONG, "Event name is too long"), \ C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d219ba50efbd..225ce569bf8f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -515,7 +515,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } From 7491e2c442781a1860181adb5ab472a52075f393 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Aug 2021 11:26:06 -0400 Subject: [PATCH 29/35] tracing: Add a probe that attaches to trace events A new dynamic event is introduced: event probe. The event is attached to an existing tracepoint and uses its fields as arguments. The user can specify custom format string of the new event, select what tracepoint arguments will be printed and how to print them. An event probe is created by writing configuration string in 'dynamic_events' ftrace file: e[:[SNAME/]ENAME] SYSTEM/EVENT [FETCHARGS] - Set an event probe -:SNAME/ENAME - Delete an event probe Where: SNAME - System name, if omitted 'eprobes' is used. ENAME - Name of the new event in SNAME, if omitted the SYSTEM_EVENT is used. SYSTEM - Name of the system, where the tracepoint is defined, mandatory. EVENT - Name of the tracepoint event in SYSTEM, mandatory. FETCHARGS - Arguments: =$[:TYPE] - Fetch given filed of the tracepoint and print it as given TYPE with given name. Supported types are: (u8/u16/u32/u64/s8/s16/s32/s64), basic type (x8/x16/x32/x64), hexadecimal types "string", "ustring" and bitfield. Example, attach an event probe on openat system call and print name of the file that will be opened: echo "e:esys/eopen syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events A new dynamic event is created in events/esys/eopen/ directory. It can be deleted with: echo "-:esys/eopen" >> dynamic_events Filters, triggers and histograms can be attached to the new event, it can be matched in synthetic events. There is one limitation - an event probe can not be attached to kprobe, uprobe or another event probe. Link: https://lkml.kernel.org/r/20210812145805.2292326-1-tz.stoyanov@gmail.com Link: https://lkml.kernel.org/r/20210819152825.142428383@goodmis.org Acked-by: Masami Hiramatsu Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 4 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 5 +- kernel/trace/trace.h | 18 + kernel/trace/trace_eprobe.c | 903 ++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 14 +- kernel/trace/trace_kprobe.c | 8 - kernel/trace/trace_probe.c | 16 +- kernel/trace/trace_probe.h | 6 +- 9 files changed, 962 insertions(+), 13 deletions(-) create mode 100644 kernel/trace/trace_eprobe.c diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9564c4d9a3b6..0a0144580bbd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -313,6 +313,7 @@ enum { TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, + TRACE_EVENT_FL_EPROBE_BIT, }; /* @@ -325,6 +326,7 @@ enum { * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe + * EPROBE - Event is an event probe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -335,6 +337,7 @@ enum { TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), + TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) @@ -680,6 +683,7 @@ enum event_trigger_type { ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), + ETT_EVENT_EPROBE = (1 << 6), }; extern int filter_match_preds(struct event_filter *filter, void *rec); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b1c47ccf4f73..6de5d4d63165 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8425c3d70895..489924cde4f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5543,6 +5543,7 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif + "\t e[:[/]] . []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" @@ -5552,7 +5553,7 @@ static const char readme_msg[] = " place (uprobe): :[%return][(ref_ctr_offset)]\n" #endif "\t args: =fetcharg[:type]\n" - "\t fetcharg: %, @
, @[+|-],\n" + "\t fetcharg: (%|$), @
, @[+|-],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack, $stack, $retval, $comm, $arg,\n" #else @@ -5567,6 +5568,8 @@ static const char readme_msg[] = "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif + "\t efield: For event probes ('e' types), the field is on of the fields\n" + "\t of the /.\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a0e693000c6..b7c0f8e160fb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -126,6 +126,11 @@ struct kprobe_trace_entry_head { unsigned long ip; }; +struct eprobe_trace_entry_head { + struct trace_entry ent; + unsigned int type; +}; + struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; @@ -1508,9 +1513,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); +enum { + EVENT_TRIGGER_FL_PROBE = BIT(0), +}; + struct event_trigger_data { unsigned long count; int ref; + int flags; struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; @@ -1918,6 +1928,14 @@ static inline bool is_good_name(const char *name) return true; } +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + /* * This is a generic way to read and write a u64 value from a file in tracefs. * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c new file mode 100644 index 000000000000..56a96e9750cf --- /dev/null +++ b/kernel/trace/trace_eprobe.c @@ -0,0 +1,903 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event probes + * + * Part of this code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu + * + * Copyright (C) 2021, VMware Inc, Steven Rostedt + * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> + * + */ +#include +#include +#include + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" + +#define EPROBE_EVENT_SYSTEM "eprobes" + +struct trace_eprobe { + /* tracepoint system */ + const char *event_system; + + /* tracepoint event */ + const char *event_name; + + struct trace_event_call *event; + + struct dyn_event devent; + struct trace_probe tp; +}; + +struct eprobe_data { + struct trace_event_file *file; + struct trace_eprobe *ep; +}; + +static int __trace_eprobe_create(int argc, const char *argv[]); + +static void trace_event_probe_cleanup(struct trace_eprobe *ep) +{ + if (!ep) + return; + trace_probe_cleanup(&ep->tp); + kfree(ep->event_name); + kfree(ep->event_system); + if (ep->event) + trace_event_put_ref(ep->event); + kfree(ep); +} + +static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_eprobe, devent); +} + +static int eprobe_dyn_event_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_eprobe_create); +} + +static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int i; + + seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp), + trace_probe_name(&ep->tp)); + seq_printf(m, " %s.%s", ep->event_system, ep->event_name); + + for (i = 0; i < ep->tp.nr_args; i++) + seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static int unregister_trace_eprobe(struct trace_eprobe *ep) +{ + /* If other probes are on the event, just unregister eprobe */ + if (trace_probe_has_sibling(&ep->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&ep->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (trace_probe_unregister_event_call(&ep->tp)) + return -EBUSY; + +unreg: + dyn_event_remove(&ep->devent); + trace_probe_unlink(&ep->tp); + + return 0; +} + +static int eprobe_dyn_event_release(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int ret = unregister_trace_eprobe(ep); + + if (!ret) + trace_event_probe_cleanup(ep); + return ret; +} + +static bool eprobe_dyn_event_is_busy(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return trace_probe_is_enabled(&ep->tp); +} + +static bool eprobe_dyn_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return strcmp(trace_probe_name(&ep->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && + trace_probe_match_command_args(&ep->tp, argc, argv); +} + +static struct dyn_event_operations eprobe_dyn_event_ops = { + .create = eprobe_dyn_event_create, + .show = eprobe_dyn_event_show, + .is_busy = eprobe_dyn_event_is_busy, + .free = eprobe_dyn_event_release, + .match = eprobe_dyn_event_match, +}; + +static struct trace_eprobe *alloc_event_probe(const char *group, + const char *this_event, + struct trace_event_call *event, + int nargs) +{ + struct trace_eprobe *ep; + const char *event_name; + const char *sys_name; + int ret = -ENOMEM; + + if (!event) + return ERR_PTR(-ENODEV); + + sys_name = event->class->system; + event_name = trace_event_name(event); + + ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); + if (!ep) { + trace_event_put_ref(ep->event); + goto error; + } + ep->event = event; + ep->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ep->event_name) + goto error; + ep->event_system = kstrdup(sys_name, GFP_KERNEL); + if (!ep->event_system) + goto error; + + ret = trace_probe_init(&ep->tp, this_event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); + return ep; +error: + trace_event_probe_cleanup(ep); + return ERR_PTR(ret); +} + +static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) +{ + struct probe_arg *parg = &ep->tp.args[i]; + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ep->event); + list_for_each_entry(field, head, link) { + if (!strcmp(parg->code->data, field->name)) { + kfree(parg->code->data); + parg->code->data = field; + return 0; + } + } + kfree(parg->code->data); + parg->code->data = NULL; + return -ENOENT; +} + +static int eprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct eprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_fields eprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = eprobe_event_define_fields }, + {} +}; + +/* Event entry printers */ +static enum print_line_t +print_eprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct eprobe_trace_entry_head *field; + struct trace_event_call *pevent; + struct trace_event *probed_event; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct eprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + probed_event = ftrace_find_event(field->type); + if (probed_event) { + pevent = container_of(probed_event, struct trace_event_call, event); + trace_seq_printf(s, "%s.%s", pevent->class->system, + trace_event_name(pevent)); + } else { + trace_seq_printf(s, "%u", field->type); + } + + trace_seq_putc(s, ')'); + + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static unsigned long get_event_field(struct fetch_insn *code, void *rec) +{ + struct ftrace_event_field *field = code->data; + unsigned long val; + void *addr; + + addr = rec + field->offset; + + switch (field->size) { + case 1: + if (field->is_signed) + val = *(char *)addr; + else + val = *(unsigned char *)addr; + break; + case 2: + if (field->is_signed) + val = *(short *)addr; + else + val = *(unsigned short *)addr; + break; + case 4: + if (field->is_signed) + val = *(int *)addr; + else + val = *(unsigned int *)addr; + break; + default: + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + return val; +} + +static int get_eprobe_size(struct trace_probe *tp, void *rec) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + unsigned long val; + + val = get_event_field(arg->code, rec); + len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Kprobe specific fetch functions */ + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + unsigned long val; + + val = get_event_field(code, rec); + return process_fetch_insn_bottom(code + 1, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); +} + +/* eprobe handler */ +static inline void +__eprobe_trace_func(struct eprobe_data *edata, void *rec) +{ + struct eprobe_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != edata->file->event_call)) + return; + + if (trace_trigger_soft_disabled(edata->file)) + return; + + fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = edata->file; + + dsize = get_eprobe_size(&edata->ep->tp, rec); + fbuffer.regs = NULL; + + fbuffer.event = + trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, + call->event.type, + sizeof(*entry) + edata->ep->tp.size + dsize, + fbuffer.trace_ctx); + if (!fbuffer.event) + return; + + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + if (edata->ep->event) + entry->type = edata->ep->event->event.type; + else + entry->type = 0; + store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +/* + * The event probe implementation uses event triggers to get access to + * the event it is attached to, but is not an actual trigger. The below + * functions are just stubs to fulfill what is needed to use the trigger + * infrastructure. + */ +static int eprobe_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return 0; +} + +static void eprobe_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + +} + +static int eprobe_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + /* Do not print eprobe event triggers */ + return 0; +} + +static void eprobe_trigger_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe) +{ + struct eprobe_data *edata = data->private_data; + + __eprobe_trace_func(edata, rec); +} + +static struct event_trigger_ops eprobe_trigger_ops = { + .func = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, +}; + +static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + return -1; +} + +static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + return -1; +} + +static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + +} + +static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) +{ + return &eprobe_trigger_ops; +} + +static struct event_command event_trigger_cmd = { + .name = "eprobe", + .trigger_type = ETT_EVENT_EPROBE, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = eprobe_trigger_cmd_func, + .reg = eprobe_trigger_reg_func, + .unreg = eprobe_trigger_unreg_func, + .unreg_all = NULL, + .get_trigger_ops = eprobe_trigger_get_ops, + .set_filter = NULL, +}; + +static struct event_trigger_data * +new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) +{ + struct event_trigger_data *trigger; + struct eprobe_data *edata; + + edata = kzalloc(sizeof(*edata), GFP_KERNEL); + trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); + if (!trigger || !edata) { + kfree(edata); + kfree(trigger); + return ERR_PTR(-ENOMEM); + } + + trigger->flags = EVENT_TRIGGER_FL_PROBE; + trigger->count = -1; + trigger->ops = &eprobe_trigger_ops; + + /* + * EVENT PROBE triggers are not registered as commands with + * register_event_command(), as they are not controlled by the user + * from the trigger file + */ + trigger->cmd_ops = &event_trigger_cmd; + + INIT_LIST_HEAD(&trigger->list); + RCU_INIT_POINTER(trigger->filter, NULL); + + edata->file = file; + edata->ep = ep; + trigger->private_data = edata; + + return trigger; +} + +static int enable_eprobe(struct trace_eprobe *ep, + struct trace_event_file *eprobe_file) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct trace_array *tr = eprobe_file->tr; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + trigger = new_eprobe_trigger(ep, eprobe_file); + if (IS_ERR(trigger)) + return PTR_ERR(trigger); + + list_add_tail_rcu(&trigger->list, &file->triggers); + + trace_event_trigger_enable_disable(file, 1); + update_cond_flag(file); + + return 0; +} + +static struct trace_event_functions eprobe_funcs = { + .trace = print_eprobe_event +}; + +static int disable_eprobe(struct trace_eprobe *ep, + struct trace_array *tr) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct eprobe_data *edata; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + + list_for_each_entry(trigger, &file->triggers, list) { + if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + continue; + edata = trigger->private_data; + if (edata->ep == ep) + break; + } + if (list_entry_is_head(trigger, &file->triggers, list)) + return -ENODEV; + + list_del_rcu(&trigger->list); + + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + return 0; +} + +static int enable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (enabled) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + ret = enable_eprobe(ep, file); + if (ret) + break; + enabled = true; + } + + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + disable_eprobe(ep, file->tr); + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + } + + return ret; +} + +static int disable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +static int eprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_eprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_eprobe(event, file); +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static inline void init_trace_eprobe_call(struct trace_eprobe *ep) +{ + struct trace_event_call *call = trace_probe_event_call(&ep->tp); + + call->flags = TRACE_EVENT_FL_EPROBE; + call->event.funcs = &eprobe_funcs; + call->class->fields_array = eprobe_fields_array; + call->class->reg = eprobe_register; +} + +static struct trace_event_call * +find_and_get_event(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + /* Skip other probes and ftrace events */ + if (tp_event->flags & + (TRACE_EVENT_FL_IGNORE_ENABLE | + TRACE_EVENT_FL_KPROBE | + TRACE_EVENT_FL_UPROBE | + TRACE_EVENT_FL_EPROBE)) + continue; + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + if (!trace_event_try_get_ref(tp_event)) { + return NULL; + break; + } + return tp_event; + break; + } + return NULL; +} + +static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) +{ + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT; + int ret; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + if (ret) + return ret; + + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + ret = trace_eprobe_tp_arg_update(ep, i); + + return ret; +} + +static int __trace_eprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * Fetch args: + * =$[:TYPE] + */ + const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; + const char *sys_event = NULL, *sys_name = NULL; + struct trace_event_call *event_call; + struct trace_eprobe *ep = NULL; + char buf1[MAX_EVENT_NAME_LEN]; + char buf2[MAX_EVENT_NAME_LEN]; + int ret = 0; + int i; + + if (argc < 2 || argv[0][0] != 'e') + return -ECANCELED; + + trace_probe_log_init("event_probe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) { + event++; + ret = traceprobe_parse_event_name(&event, &group, buf1, + event - argv[0]); + if (ret) + goto parse_error; + } else { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + if (!is_good_name(event) || !is_good_name(group)) + goto parse_error; + + sys_event = argv[1]; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, + sys_event - argv[1]); + if (ret || !sys_name) + goto parse_error; + if (!is_good_name(sys_event) || !is_good_name(sys_name)) + goto parse_error; + + mutex_lock(&event_mutex); + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + mutex_unlock(&event_mutex); + + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto error; /* We know ep is not allocated */ + } + + argc -= 2; argv += 2; + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = trace_eprobe_tp_update_arg(ep, argv, i); + if (ret) + goto error; + } + ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); + if (ret < 0) + goto error; + init_trace_eprobe_call(ep); + mutex_lock(&event_mutex); + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + mutex_unlock(&event_mutex); + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + mutex_unlock(&event_mutex); + return ret; +parse_error: + ret = -EINVAL; +error: + trace_event_probe_cleanup(ep); + return ret; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup eprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_eprobe_init_early(void) +{ + int err = 0; + + err = dyn_event_register(&eprobe_dyn_event_ops); + if (err) + pr_warn("Could not register eprobe_dyn_event_ops\n"); + + return err; +} +core_initcall(trace_events_eprobe_init_early); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6b11e335a62e..3d5c07239a2a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -124,6 +124,18 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) return seq_list_next(t, &event_file->triggers, pos); } +static bool check_user_trigger(struct trace_event_file *file) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->flags & EVENT_TRIGGER_FL_PROBE) + continue; + return true; + } + return false; +} + static void *trigger_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *event_file; @@ -134,7 +146,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) if (unlikely(!event_file)) return ERR_PTR(-ENODEV); - if (list_empty(&event_file->triggers)) + if (list_empty(&event_file->triggers) || !check_user_trigger(event_file)) return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; return seq_list_start(&event_file->triggers, *pos); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 882c27044029..3a64ba4bbad6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,14 +707,6 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; -/* Convert certain expected symbols into '_' when generating event names */ -static inline void sanitize_event_name(char *name) -{ - while (*name++ != '\0') - if (*name == ':' || *name == '.') - *name = '_'; -} - static int __trace_kprobe_create(int argc, const char *argv[]) { /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 782c00eb6859..3ed2a3f37297 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -319,6 +319,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif + } else if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -646,13 +653,14 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && - code->op != FETCH_OP_DATA) { + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || - code->op == FETCH_OP_DATA) || parg->count) { + code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || + parg->count) { /* * IMM, DATA and COMM is pointing actual address, those * must be kept, and if parg->count != 0, this is an @@ -867,6 +875,10 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; + case PROBE_PRINT_EVENT: + fmt = "(%u)"; + arg = "REC->" FIELD_STRING_TYPE; + break; default: WARN_ON_ONCE(1); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 66701a92d186..99e7a5df025e 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" +#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ @@ -102,6 +103,7 @@ enum fetch_op { FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_TP_ARG, /* Trace Point argument */ FETCH_OP_END, FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; @@ -351,7 +353,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_MASK GENMASK(2, 0) +#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_MASK GENMASK(3, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -366,6 +369,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, enum probe_print_type { PROBE_PRINT_NORMAL, PROBE_PRINT_RETURN, + PROBE_PRINT_EVENT, }; extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); From aaac2820a36707a8f44adbd2774c623b4e21031c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 11:26:07 -0400 Subject: [PATCH 30/35] selftests/ftrace: Add clear_dynamic_events() to test cases Add a function to remove all dynamic events from the tracing directory. It requires a loop as some of the dynamic events may depend on others being removed first. Also add a safety that prevents it from looping infinitely due to a bug where an event never gets removed. Link: https://lkml.kernel.org/r/20210819152825.348941368@goodmis.org Cc: "Tzvetomir Stoyanov" Cc: Tom Zanussi Cc: Shuah Khan Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../testing/selftests/ftrace/test.d/functions | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index a6fac927ee82..f68d336b961b 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -83,6 +83,27 @@ clear_synthetic_events() { # reset all current synthetic events done } +clear_dynamic_events() { # reset all current dynamic events + again=1 + stop=1 + # loop mulitple times as some events require other to be removed first + while [ $again -eq 1 ]; do + stop=$((stop+1)) + # Prevent infinite loops + if [ $stop -gt 10 ]; then + break; + fi + again=2 + grep -v '^#' dynamic_events| + while read line; do + del=`echo $line | sed -e 's/^.\([^ ]*\).*/-\1/'` + if ! echo "$del" >> dynamic_events; then + again=1 + fi + done + done +} + initialize_ftrace() { # Reset ftrace to initial-state # As the initial state, ftrace will be set to nop tracer, # no events, no triggers, no filters, no function filters, @@ -93,6 +114,7 @@ initialize_ftrace() { # Reset ftrace to initial-state reset_events_filter reset_ftrace_filter disable_events + clear_dynamic_events [ -f set_event_pid ] && echo > set_event_pid [ -f set_ftrace_pid ] && echo > set_ftrace_pid [ -f set_ftrace_notrace ] && echo > set_ftrace_notrace From 210f9df02611cbe641ced3239122b270fd907d86 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 20 Aug 2021 16:46:47 -0400 Subject: [PATCH 31/35] selftests/ftrace: Fix requirement check of README file The selftest for ftrace checks some features by checking if the README has text that states the feature is supported by that kernel. Unfortunately, this check gives false positives because it many not be checked if there's spaces in the string to check. This is due to the compare between the required variable with the ":README" string stripped, because neither has quotes around them. Link: https://lkml.kernel.org/r/20210820204742.087177341@goodmis.org Cc: "Tzvetomir Stoyanov" Cc: Tom Zanussi Cc: Shuah Khan Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 1b8eec510ba64 ("selftests/ftrace: Support ":README" suffix for requires") Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/test.d/functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index f68d336b961b..000fd05e84b1 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -137,7 +137,7 @@ check_requires() { # Check required files and tracers echo "Required tracer $t is not configured." exit_unsupported fi - elif [ $r != $i ]; then + elif [ "$r" != "$i" ]; then if ! grep -Fq "$r" README ; then echo "Required feature pattern \"$r\" is not in README." exit_unsupported From 079db70794ec5494a9e1091a1788e671d30b9103 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 20 Aug 2021 16:46:48 -0400 Subject: [PATCH 32/35] selftests/ftrace: Add test case to test adding and removing of event probe Add a test case that adds an event probe, makes sure that it works, and then removes it. Link: https://lore.kernel.org/linux-kselftest/20210819152825.526931866@goodmis.org/ Link: https://lkml.kernel.org/r/20210820204742.274591200@goodmis.org Cc: "Tzvetomir Stoyanov" Cc: Tom Zanussi Cc: Shuah Khan Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../test.d/dynevent/add_remove_eprobe.tc | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc new file mode 100644 index 000000000000..25a3da4eaa44 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - add/remove eprobe events +# requires: dynamic_events events/syscalls/sys_enter_openat "e[:[/]] . []":README + +echo 0 > events/enable + +clear_dynamic_events + +SYSTEM="syscalls" +EVENT="sys_enter_openat" +FIELD="filename" +EPROBE="eprobe_open" + +echo "e:$EPROBE $SYSTEM/$EVENT file=+0(\$filename):ustring" >> dynamic_events + +grep -q "$EPROBE" dynamic_events +test -d events/eprobes/$EPROBE + +echo 1 > events/eprobes/$EPROBE/enable +ls +echo 0 > events/eprobes/$EPROBE/enable + +content=`grep '^ *ls-' trace | grep 'file='` +nocontent=`grep '^ *ls-' trace | grep 'file=' | grep -v -e '"/' -e '"."'` || true + +if [ -z "$content" ]; then + exit_fail +fi + +if [ ! -z "$nocontent" ]; then + exit_fail +fi + +echo "-:$EPROBE" >> dynamic_events + +! grep -q "$EPROBE" dynamic_events +! test -d events/eprobes/$EPROBE + +clear_trace From 8f022d3a769c5460c325d3444cf5f0c948d5bb52 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 20 Aug 2021 16:46:49 -0400 Subject: [PATCH 33/35] selftests/ftrace: Add selftest for testing eprobe events on synthetic events Add a test to test event probes, by creating a synthetic event across sys_enter_openat and sys_exit_openat that passes the filename pointer from the enter of the system call to the exit, and then add an event probe to the synthetic event to make sure that the file name is seen. Link: https://lore.kernel.org/linux-kselftest/20210819152825.526931866@goodmis.org/ Link: https://lkml.kernel.org/r/20210820204742.463259900@goodmis.org Cc: "Tzvetomir Stoyanov" Cc: Tom Zanussi Cc: Shuah Khan Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../inter-event/trigger-synthetic-eprobe.tc | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc new file mode 100644 index 000000000000..914fe2e5d030 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-eprobe.tc @@ -0,0 +1,53 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test inter-event histogram trigger eprobe on synthetic event +# requires: dynamic_events synthetic_events events/syscalls/sys_enter_openat/hist "e[:[/]] . []":README + +echo 0 > events/enable + +clear_dynamic_events + +SYSTEM="syscalls" +START="sys_enter_openat" +END="sys_exit_openat" +FIELD="filename" +SYNTH="synth_open" +EPROBE="eprobe_open" + +echo "$SYNTH u64 filename; s64 ret;" > synthetic_events +echo "hist:keys=common_pid:__arg__1=$FIELD" > events/$SYSTEM/$START/trigger +echo "hist:keys=common_pid:filename=\$__arg__1,ret=ret:onmatch($SYSTEM.$START).trace($SYNTH,\$filename,\$ret)" > events/$SYSTEM/$END/trigger + +echo "e:$EPROBE synthetic/$SYNTH file=+0(\$filename):ustring ret=\$ret:s64" >> dynamic_events + +grep -q "$SYNTH" dynamic_events +grep -q "$EPROBE" dynamic_events +test -d events/synthetic/$SYNTH +test -d events/eprobes/$EPROBE + +echo 1 > events/eprobes/$EPROBE/enable +ls +echo 0 > events/eprobes/$EPROBE/enable + +content=`grep '^ *ls-' trace | grep 'file='` +nocontent=`grep '^ *ls-' trace | grep 'file=' | grep -v -e '"/' -e '"."'` || true + +if [ -z "$content" ]; then + exit_fail +fi + +if [ ! -z "$nocontent" ]; then + exit_fail +fi + +echo "-:$EPROBE" >> dynamic_events +echo '!'"hist:keys=common_pid:filename=\$__arg__1,ret=ret:onmatch($SYSTEM.$START).trace($SYNTH,\$filename,\$ret)" > events/$SYSTEM/$END/trigger +echo '!'"hist:keys=common_pid:__arg__1=$FIELD" > events/$SYSTEM/$START/trigger +echo '!'"$SYNTH u64 filename; s64 ret;" >> synthetic_events + +! grep -q "$SYNTH" dynamic_events +! grep -q "$EPROBE" dynamic_events +! test -d events/synthetic/$SYNTH +! test -d events/eprobes/$EPROBE + +clear_trace From 297e1dcdca3d7f268ccfb175d0a3534bb9481303 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 20 Aug 2021 16:46:50 -0400 Subject: [PATCH 34/35] selftests/ftrace: Add selftest for testing duplicate eprobes and kprobes Add a selftest that makes sure that eprobes and kprobes can not be created with the same group and name as existing events. Link: https://lore.kernel.org/linux-kselftest/20210819152825.715290342@goodmis.org/ Link: https://lkml.kernel.org/r/20210820204742.653288346@goodmis.org Cc: "Tzvetomir Stoyanov" Cc: Tom Zanussi Cc: Shuah Khan Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- .../ftrace/test.d/dynevent/test_duplicates.tc | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc b/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc new file mode 100644 index 000000000000..db522577ff78 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/test_duplicates.tc @@ -0,0 +1,38 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Generic dynamic event - check if duplicate events are caught +# requires: dynamic_events "e[:[/]] . []":README + +echo 0 > events/enable + +HAVE_KPROBES=0 + +if [ -f kprobe_events ]; then + HAVE_KPROBES=1 +fi + +clear_dynamic_events + +# first create dynamic events for eprobes and kprobes. + +echo 'e:egroup/eevent syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events + +# Test eprobe for same eprobe, existing kprobe and existing event +! echo 'e:egroup/eevent syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events +! echo 'e:syscalls/sys_enter_open syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events + +if [ $HAVE_KPROBES -eq 1 ]; then + echo 'p:kgroup/kevent vfs_open file=+0($arg2)' >> dynamic_events + ! echo 'e:kgroup/kevent syscalls/sys_enter_openat file=+0($filename):ustring' >> dynamic_events + +# Test kprobe for same kprobe, existing eprobe and existing event + ! echo 'p:kgroup/kevent vfs_open file=+0($arg2)' >> dynamic_events + ! echo 'p:egroup/eevent vfs_open file=+0($arg2)' >> dynamic_events + ! echo 'p:syscalls/sys_enter_open vfs_open file=+0($arg2)' >> dynamic_events + + echo '-:kgroup/kevent' >> dynamic_events +fi + +echo '-:egroup/eevent' >> dynamic_events + +clear_trace From 4420f5b1be7b117330526f3eabd13d840f510b15 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 23 Aug 2021 10:00:07 -0400 Subject: [PATCH 35/35] tracing/doc: Fix table format in histogram code The addition of the buckets conversion for the histogram code, updated the documentation table of available conversions, but did not update the format to accommodate the extra size needed to cover the description. Link: https://lkml.kernel.org/r/20210823100007.71ce2ba9@oasis.local.home Reported-by: Stephen Rothwell Tested-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 4e650671f245..533415644c54 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -70,16 +70,16 @@ Documentation written by Tom Zanussi modified by appending any of the following modifiers to the field name: - =========== ========================================== - .hex display a number as a hex value - .sym display an address as a symbol - .sym-offset display an address as a symbol and offset - .syscall display a syscall id as a system call name - .execname display a common_pid as a program name - .log2 display log2 value rather than raw number + ============= ================================================= + .hex display a number as a hex value + .sym display an address as a symbol + .sym-offset display an address as a symbol and offset + .syscall display a syscall id as a system call name + .execname display a common_pid as a program name + .log2 display log2 value rather than raw number .buckets=size display grouping of values rather than raw number - .usecs display a common_timestamp in microseconds - =========== ========================================== + .usecs display a common_timestamp in microseconds + ============= ================================================= Note that in general the semantics of a given field aren't interpreted when applying a modifier to it, but there are some