From 8f0901cda14d3be38cd2196d8cf61cdf3b368e34 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 9 Jun 2021 18:04:59 -0400 Subject: [PATCH] tracing: Add better comments for the filtering temp buffer use case When filtering is enabled, the event is copied into a temp buffer instead of being written into the ring buffer directly, because the discarding of events from the ring buffer is very expensive, and doing the extra copy is much faster than having to discard most of the time. As that logic is subtle, add comments to explain in more detail to what is going on and how it works. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0a84ff46ecd..a0d66a056e59 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2734,10 +2734,44 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { - /* Try to use the per cpu buffer first */ + /* + * Filtering is on, so try to use the per cpu buffer first. + * This buffer will simulate a ring_buffer_event, + * where the type_len is zero and the array[0] will + * hold the full length. + * (see include/linux/ring-buffer.h for details on + * how the ring_buffer_event is structured). + * + * Using a temp buffer during filtering and copying it + * on a matched filter is quicker than writing directly + * into the ring buffer and then discarding it when + * it doesn't match. That is because the discard + * requires several atomic operations to get right. + * Copying on match and doing nothing on a failed match + * is still quicker than no copy on match, but having + * to discard out of the ring buffer on a failed match. + */ int max_len = PAGE_SIZE - struct_size(entry, array, 1); val = this_cpu_inc_return(trace_buffered_event_cnt); + + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ if (val == 1 && likely(len <= max_len)) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len;