perf stat: Add "--per-cache" aggregation option and document it
This patch adds support for "--per-cache" option for aggregation at a particular cache level and documents the same. Following is the output of 'perf stat' with aggregation at L3 for the event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running hackbench pinned to 4 LLCs: $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L3-ID0 16 9,500,803 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 6,338,099 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 355,005 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 22,067 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 16,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 11,619 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 4,238 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 31,158 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 28,242,452 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 22,906,973 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 72,898 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 56,907 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 20,456 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 40,913 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 78,113 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 37,897 ls_dmnd_fills_from_sys.ext_cache_remote Also support 'perf stat record' and 'perf stat report' with the ability to specify a different cache level to aggregate data at when running 'perf stat report'. $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L2-ID0 2 1,442,061 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID1 2 1,548,994 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID2 2 1,553,557 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID3 2 1,420,122 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID4 2 1,465,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID5 2 1,455,153 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID6 2 1,595,237 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID7 2 1,499,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID8 2 1,919,025 ls_dmnd_fills_from_sys.ext_cache_remote ... S1-D1-L2-ID127 2 21,295 ls_dmnd_fills_from_sys.ext_cache_remote $ sudo perf stat report --per-cache=L3 Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8': S0-D0-L3-ID0 16 11,979,906 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 14,257,202 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 377,484 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 27,224 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 26,816 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 14,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 10,499 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 53,817 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 27,361,987 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 37,299,024 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 84,125 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 64,561 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 13,403 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 20,138 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 93,220 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 35,465 ls_dmnd_fills_from_sys.ext_cache_remote On the above system, the domain covered by S0-D0-L3-ID0 contains S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is equal to the sum of counts for L2-ID0 to L2-ID7. Add documentation for the newly introduced "--per-cache" option. Suggested-by: Gautham Shenoy <gautham.shenoy@amd.com> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Wen Pu <puwen@hygon.cn> Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Родитель
4b87406a3b
Коммит
aab667ca88
|
@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide). The output includes the
|
|||
die number and the number of online processors on that die. This is
|
||||
useful to gauge the amount of aggregation.
|
||||
|
||||
--per-cache::
|
||||
Aggregate counts per cache instance for system-wide mode measurements. By
|
||||
default, the aggregation happens for the cache level at the highest index
|
||||
in the system. To specify a particular level, mention the cache level
|
||||
alongside the option in the format [Ll][1-9][0-9]*. For example:
|
||||
Using option "--per-cache=l3" or "--per-cache=L3" will aggregate the
|
||||
information at the boundary of the level 3 cache in the system.
|
||||
|
||||
--per-core::
|
||||
Aggregate counts per physical processor for system-wide mode measurements. This
|
||||
is a useful mode to detect imbalance between physical cores. To enable this mode,
|
||||
|
@ -379,6 +387,14 @@ Aggregate counts per processor socket for system-wide mode measurements.
|
|||
--per-die::
|
||||
Aggregate counts per processor die for system-wide mode measurements.
|
||||
|
||||
--per-cache::
|
||||
Aggregate counts per cache instance for system-wide mode measurements. By
|
||||
default, the aggregation happens for the cache level at the highest index
|
||||
in the system. To specify a particular level, mention the cache level
|
||||
alongside the option in the format [Ll][1-9][0-9]*. For example: Using
|
||||
option "--per-cache=l3" or "--per-cache=L3" will aggregate the
|
||||
information at the boundary of the level 3 cache in the system.
|
||||
|
||||
--per-core::
|
||||
Aggregate counts per physical processor for system-wide mode measurements.
|
||||
|
||||
|
|
|
@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int parse_cache_level(const struct option *opt,
|
||||
const char *str,
|
||||
int unset __maybe_unused)
|
||||
{
|
||||
int level;
|
||||
u32 *aggr_mode = (u32 *)opt->value;
|
||||
u32 *aggr_level = (u32 *)opt->data;
|
||||
|
||||
/*
|
||||
* If no string is specified, aggregate based on the topology of
|
||||
* Last Level Cache (LLC). Since the LLC level can change from
|
||||
* architecture to architecture, set level greater than
|
||||
* MAX_CACHE_LVL which will be interpreted as LLC.
|
||||
*/
|
||||
if (str == NULL) {
|
||||
level = MAX_CACHE_LVL + 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* The format to specify cache level is LX or lX where X is the
|
||||
* cache level.
|
||||
*/
|
||||
if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) {
|
||||
pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
|
||||
MAX_CACHE_LVL,
|
||||
MAX_CACHE_LVL);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
level = atoi(&str[1]);
|
||||
if (level < 1) {
|
||||
pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
|
||||
MAX_CACHE_LVL,
|
||||
MAX_CACHE_LVL);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (level > MAX_CACHE_LVL) {
|
||||
pr_err("perf only supports max cache level of %d.\n"
|
||||
"Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL);
|
||||
return -EINVAL;
|
||||
}
|
||||
out:
|
||||
*aggr_mode = AGGR_CACHE;
|
||||
*aggr_level = level;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct option stat_options[] = {
|
||||
OPT_BOOLEAN('T', "transaction", &transaction_run,
|
||||
"hardware transaction statistics"),
|
||||
|
@ -1190,6 +1239,9 @@ static struct option stat_options[] = {
|
|||
"aggregate counts per processor socket", AGGR_SOCKET),
|
||||
OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
|
||||
"aggregate counts per processor die", AGGR_DIE),
|
||||
OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level,
|
||||
"cache level", "aggregate count at this cache level (Default: LLC)",
|
||||
parse_cache_level),
|
||||
OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
|
||||
"aggregate counts per physical processor core", AGGR_CORE),
|
||||
OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
|
||||
|
@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv)
|
|||
"aggregate counts per processor socket", AGGR_SOCKET),
|
||||
OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
|
||||
"aggregate counts per processor die", AGGR_DIE),
|
||||
OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
|
||||
"cache level",
|
||||
"aggregate count at this cache level (Default: LLC)",
|
||||
parse_cache_level),
|
||||
OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
|
||||
"aggregate counts per physical processor core", AGGR_CORE),
|
||||
OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,
|
||||
|
|
Загрузка…
Ссылка в новой задаче