powerpc/pmu: Make the generic compat PMU use the architected events

This changes generic-compat-pmu.c so that it only uses architected
events defined in Power ISA v3.0B, rather than event encodings which,
while common to all the IBM Power Systems implementations, are
nevertheless implementation-specific rather than architected.  The
intention is that any CPU implementation designed to conform to Power
ISA v3.0B or later can use generic-compat-pmu.c.

In addition to the existing events for cycles and instructions, this
adds several other architected events, including alternative encodings
for some events.  In order to make it possible to measure cycles and
instructions at the same time as each other, we set the CC5-6RUN bit
in MMCR0, which makes PMC5 and PMC6 count instructions and cycles
regardless of the run bit, so their events are now PM_CYC and
PM_INST_CMPL rather than PM_RUN_CYC and PM_RUN_INST_CMPL (the latter
are still available via other event codes).

Note that POWER9 has an erratum where one architected event
(PM_FLOP_CMPL, floating-point operations completed, code 0x100f4) does
not work correctly.  Given that there is a specific PMU driver for P9
which will be used in preference to generic-compat-pmu.c, that is not
a real problem.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/YJD7L9yeoxvxqeYi@thinks.paulus.ozlabs.org
This commit is contained in:
Paul Mackerras 2021-05-04 17:43:43 +10:00 committed by Michael Ellerman
Parent bfb0c9fcf5
Commit d40a82be2f
1 changed file with 134 additions and 36 deletions

View file

@@ -14,45 +14,119 @@
* *
* 28 24 20 16 12 8 4 0 * 28 24 20 16 12 8 4 0
* | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
* [ pmc ] [unit ] [ ] m [ pmcxsel ] * [ pmc ] [ pmcxsel ]
* | |
* | *- mark
* |
* |
* *- combine
*
* Below uses IBM bit numbering.
*
* MMCR1[x:y] = unit (PMCxUNIT)
* MMCR1[24] = pmc1combine[0]
* MMCR1[25] = pmc1combine[1]
* MMCR1[26] = pmc2combine[0]
* MMCR1[27] = pmc2combine[1]
* MMCR1[28] = pmc3combine[0]
* MMCR1[29] = pmc3combine[1]
* MMCR1[30] = pmc4combine[0]
* MMCR1[31] = pmc4combine[1]
*
*/ */
/* /*
* Some power9 event codes. * Event codes defined in ISA v3.0B
*/ */
#define EVENT(_name, _code) _name = _code, #define EVENT(_name, _code) _name = _code,
enum { enum {
EVENT(PM_CYC, 0x0001e) /* Cycles, alternate code */
EVENT(PM_INST_CMPL, 0x00002) EVENT(PM_CYC_ALT, 0x100f0)
/* One or more instructions completed in a cycle */
EVENT(PM_CYC_INST_CMPL, 0x100f2)
/* Floating-point instruction completed */
EVENT(PM_FLOP_CMPL, 0x100f4)
/* Instruction ERAT/L1-TLB miss */
EVENT(PM_L1_ITLB_MISS, 0x100f6)
/* All instructions completed and none available */
EVENT(PM_NO_INST_AVAIL, 0x100f8)
/* A load-type instruction completed (ISA v3.0+) */
EVENT(PM_LD_CMPL, 0x100fc)
/* Instruction completed, alternate code (ISA v3.0+) */
EVENT(PM_INST_CMPL_ALT, 0x100fe)
/* A store-type instruction completed */
EVENT(PM_ST_CMPL, 0x200f0)
/* Instruction Dispatched */
EVENT(PM_INST_DISP, 0x200f2)
/* Run_cycles */
EVENT(PM_RUN_CYC, 0x200f4)
/* Data ERAT/L1-TLB miss/reload */
EVENT(PM_L1_DTLB_RELOAD, 0x200f6)
/* Taken branch completed */
EVENT(PM_BR_TAKEN_CMPL, 0x200fa)
/* Demand iCache Miss */
EVENT(PM_L1_ICACHE_MISS, 0x200fc)
/* L1 Dcache reload from memory */
EVENT(PM_L1_RELOAD_FROM_MEM, 0x200fe)
/* L1 Dcache store miss */
EVENT(PM_ST_MISS_L1, 0x300f0)
/* Alternate code for PM_INST_DISP */
EVENT(PM_INST_DISP_ALT, 0x300f2)
/* Branch direction or target mispredicted */
EVENT(PM_BR_MISPREDICT, 0x300f6)
/* Data TLB miss/reload */
EVENT(PM_DTLB_MISS, 0x300fc)
/* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
EVENT(PM_DATA_FROM_L3MISS, 0x300fe)
/* L1 Dcache load miss */
EVENT(PM_LD_MISS_L1, 0x400f0)
/* Cycle when instruction(s) dispatched */
EVENT(PM_CYC_INST_DISP, 0x400f2)
/* Branch or branch target mispredicted */
EVENT(PM_BR_MPRED_CMPL, 0x400f6)
/* Instructions completed with run latch set */
EVENT(PM_RUN_INST_CMPL, 0x400fa)
/* Instruction TLB miss/reload */
EVENT(PM_ITLB_MISS, 0x400fc)
/* Load data not cached */
EVENT(PM_LD_NOT_CACHED, 0x400fe)
/* Instructions */
EVENT(PM_INST_CMPL, 0x500fa)
/* Cycles */
EVENT(PM_CYC, 0x600f4)
}; };
#undef EVENT #undef EVENT
/* Table of alternatives, sorted in increasing order of column 0 */
/* Note that in each row, column 0 must be the smallest */
static const unsigned int generic_event_alternatives[][MAX_ALT] = {
{ PM_CYC_ALT, PM_CYC },
{ PM_INST_CMPL_ALT, PM_INST_CMPL },
{ PM_INST_DISP, PM_INST_DISP_ALT },
};
static int generic_get_alternatives(u64 event, unsigned int flags, u64 alt[])
{
int num_alt = 0;
num_alt = isa207_get_alternatives(event, alt,
ARRAY_SIZE(generic_event_alternatives), flags,
generic_event_alternatives);
return num_alt;
}
GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC);
GENERIC_EVENT_ATTR(instructions, PM_INST_CMPL); GENERIC_EVENT_ATTR(instructions, PM_INST_CMPL);
GENERIC_EVENT_ATTR(stalled-cycles-frontend, PM_NO_INST_AVAIL);
GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL);
GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1);
CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1);
CACHE_EVENT_ATTR(L1-dcache-store-misses, PM_ST_MISS_L1);
CACHE_EVENT_ATTR(L1-icache-load-misses, PM_L1_ICACHE_MISS);
CACHE_EVENT_ATTR(LLC-load-misses, PM_DATA_FROM_L3MISS);
CACHE_EVENT_ATTR(branch-load-misses, PM_BR_MPRED_CMPL);
CACHE_EVENT_ATTR(dTLB-load-misses, PM_DTLB_MISS);
CACHE_EVENT_ATTR(iTLB-load-misses, PM_ITLB_MISS);
static struct attribute *generic_compat_events_attr[] = { static struct attribute *generic_compat_events_attr[] = {
GENERIC_EVENT_PTR(PM_CYC), GENERIC_EVENT_PTR(PM_CYC),
GENERIC_EVENT_PTR(PM_INST_CMPL), GENERIC_EVENT_PTR(PM_INST_CMPL),
GENERIC_EVENT_PTR(PM_NO_INST_AVAIL),
GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
GENERIC_EVENT_PTR(PM_LD_MISS_L1),
CACHE_EVENT_PTR(PM_LD_MISS_L1),
CACHE_EVENT_PTR(PM_ST_MISS_L1),
CACHE_EVENT_PTR(PM_L1_ICACHE_MISS),
CACHE_EVENT_PTR(PM_DATA_FROM_L3MISS),
CACHE_EVENT_PTR(PM_BR_MPRED_CMPL),
CACHE_EVENT_PTR(PM_DTLB_MISS),
CACHE_EVENT_PTR(PM_ITLB_MISS),
NULL NULL
}; };
@@ -63,17 +137,11 @@ static struct attribute_group generic_compat_pmu_events_group = {
PMU_FORMAT_ATTR(event, "config:0-19"); PMU_FORMAT_ATTR(event, "config:0-19");
PMU_FORMAT_ATTR(pmcxsel, "config:0-7"); PMU_FORMAT_ATTR(pmcxsel, "config:0-7");
PMU_FORMAT_ATTR(mark, "config:8");
PMU_FORMAT_ATTR(combine, "config:10-11");
PMU_FORMAT_ATTR(unit, "config:12-15");
PMU_FORMAT_ATTR(pmc, "config:16-19"); PMU_FORMAT_ATTR(pmc, "config:16-19");
static struct attribute *generic_compat_pmu_format_attr[] = { static struct attribute *generic_compat_pmu_format_attr[] = {
&format_attr_event.attr, &format_attr_event.attr,
&format_attr_pmcxsel.attr, &format_attr_pmcxsel.attr,
&format_attr_mark.attr,
&format_attr_combine.attr,
&format_attr_unit.attr,
&format_attr_pmc.attr, &format_attr_pmc.attr,
NULL, NULL,
}; };
@@ -92,6 +160,9 @@ static const struct attribute_group *generic_compat_pmu_attr_groups[] = {
static int compat_generic_events[] = { static int compat_generic_events[] = {
[PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC,
[PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL, [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL,
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_NO_INST_AVAIL,
[PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL,
[PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1,
}; };
#define C(x) PERF_COUNT_HW_CACHE_##x #define C(x) PERF_COUNT_HW_CACHE_##x
@@ -105,11 +176,11 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
[ C(L1D) ] = { [ C(L1D) ] = {
[ C(OP_READ) ] = { [ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_LD_MISS_L1,
}, },
[ C(OP_WRITE) ] = { [ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_ST_MISS_L1,
}, },
[ C(OP_PREFETCH) ] = { [ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
@@ -119,7 +190,7 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
[ C(L1I) ] = { [ C(L1I) ] = {
[ C(OP_READ) ] = { [ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_L1_ICACHE_MISS,
}, },
[ C(OP_WRITE) ] = { [ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
@@ -133,7 +204,7 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
[ C(LL) ] = { [ C(LL) ] = {
[ C(OP_READ) ] = { [ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_DATA_FROM_L3MISS,
}, },
[ C(OP_WRITE) ] = { [ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
@@ -147,7 +218,7 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
[ C(DTLB) ] = { [ C(DTLB) ] = {
[ C(OP_READ) ] = { [ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_DTLB_MISS,
}, },
[ C(OP_WRITE) ] = { [ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1, [ C(RESULT_ACCESS) ] = -1,
@@ -161,7 +232,7 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
[ C(ITLB) ] = { [ C(ITLB) ] = {
[ C(OP_READ) ] = { [ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_ITLB_MISS,
}, },
[ C(OP_WRITE) ] = { [ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1, [ C(RESULT_ACCESS) ] = -1,
@@ -175,7 +246,7 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
[ C(BPU) ] = { [ C(BPU) ] = {
[ C(OP_READ) ] = { [ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0, [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0, [ C(RESULT_MISS) ] = PM_BR_MPRED_CMPL,
}, },
[ C(OP_WRITE) ] = { [ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1, [ C(RESULT_ACCESS) ] = -1,
@@ -204,13 +275,30 @@ static u64 generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
#undef C #undef C
/*
* We set MMCR0[CC5-6RUN] so we can use counters 5 and 6 for
* PM_INST_CMPL and PM_CYC.
*/
static int generic_compute_mmcr(u64 event[], int n_ev,
unsigned int hwc[], struct mmcr_regs *mmcr,
struct perf_event *pevents[], u32 flags)
{
int ret;
ret = isa207_compute_mmcr(event, n_ev, hwc, mmcr, pevents, flags);
if (!ret)
mmcr->mmcr0 |= MMCR0_C56RUN;
return ret;
}
static struct power_pmu generic_compat_pmu = { static struct power_pmu generic_compat_pmu = {
.name = "GENERIC_COMPAT", .name = "GENERIC_COMPAT",
.n_counter = MAX_PMU_COUNTERS, .n_counter = MAX_PMU_COUNTERS,
.add_fields = ISA207_ADD_FIELDS, .add_fields = ISA207_ADD_FIELDS,
.test_adder = ISA207_TEST_ADDER, .test_adder = ISA207_TEST_ADDER,
.compute_mmcr = isa207_compute_mmcr, .compute_mmcr = generic_compute_mmcr,
.get_constraint = isa207_get_constraint, .get_constraint = isa207_get_constraint,
.get_alternatives = generic_get_alternatives,
.disable_pmc = isa207_disable_pmc, .disable_pmc = isa207_disable_pmc,
.flags = PPMU_HAS_SIER | PPMU_ARCH_207S, .flags = PPMU_HAS_SIER | PPMU_ARCH_207S,
.n_generic = ARRAY_SIZE(compat_generic_events), .n_generic = ARRAY_SIZE(compat_generic_events),
@@ -223,6 +311,16 @@ int init_generic_compat_pmu(void)
{ {
int rc = 0; int rc = 0;
/*
* From ISA v2.07 on, PMU features are architected;
* we require >= v3.0 because (a) that has PM_LD_CMPL and
* PM_INST_CMPL_ALT, which v2.07 doesn't have, and
* (b) we don't expect any non-IBM Power ISA
* implementations that conform to v2.07 but not v3.0.
*/
if (!cpu_has_feature(CPU_FTR_ARCH_300))
return -ENODEV;
rc = register_power_pmu(&generic_compat_pmu); rc = register_power_pmu(&generic_compat_pmu);
if (rc) if (rc)
return rc; return rc;