perf vendor events: Update Intel sandybridge

Update to v17, the metrics are based on TMA 4.4 full.

Use script at:
https://github.com/intel/event-converter-for-linux-perf/blob/master/download_and_gen.py

to download and generate the latest events and metrics. Manually copy
the sandybridge files into perf and update mapfile.csv.

Tested on a non-sandybridge with 'perf test':
 10: PMU events :
 10.1: PMU event table sanity : Ok
 10.2: PMU event map aliases : Ok
 10.3: Parsing of PMU event table metrics : Ok
 10.4: Parsing of PMU event table metrics with fake PMUs : Ok

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Torgue <alexandre.torgue@foss.st.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Kshipra Bopardikar <kshipra.bopardikar@intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com>
Link: http://lore.kernel.org/lkml/20220727220832.2865794-22-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2022-07-27 15:08:23 -07:00 · 2022-07-27 15:08:23 -07:00 · 777e131244
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -19,12 +19,12 @@ GenuineIntel-6-(57|85),v9,knightslanding,core
 GenuineIntel-6-AA,v1.00,meteorlake,core
 GenuineIntel-6-1[AEF],v3,nehalemep,core
 GenuineIntel-6-2E,v3,nehalemex,core
+GenuineIntel-6-2A,v17,sandybridge,core
 GenuineIntel-6-[4589]E,v24,skylake,core
 GenuineIntel-6-A[56],v24,skylake,core
 GenuineIntel-6-37,v13,silvermont,core
 GenuineIntel-6-4D,v13,silvermont,core
 GenuineIntel-6-4C,v13,silvermont,core
-GenuineIntel-6-2A,v15,sandybridge,core
 GenuineIntel-6-2C,v2,westmereep-dp,core
 GenuineIntel-6-25,v2,westmereep-sp,core
 GenuineIntel-6-2F,v2,westmereex,core
--- a/tools/perf/pmu-events/arch/x86/sandybridge/cache.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/cache.json
@@ -1876,4 +1876,4 @@
        "SampleAfterValue": "100003",
        "UMask": "0x10"
    }
-]
+]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/floating-point.json
@@ -135,4 +135,4 @@
        "SampleAfterValue": "2000003",
        "UMask": "0x1"
    }
-]
+]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/frontend.json
@@ -176,7 +176,7 @@
        "CounterMask": "1",
        "EventCode": "0x79",
        "EventName": "IDQ.MS_CYCLES",
-        "PublicDescription": "This event counts cycles during which the microcode sequencer assisted the front-end in delivering uops.  Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder.  Using other instructions, if possible, will usually improve performance.  See the Intel 64 and IA-32 Architectures Optimization Reference Manual for more information.",
+        "PublicDescription": "This event counts cycles during which the microcode sequencer assisted the front-end in delivering uops.  Microcode assists are used for complex instructions or scenarios that can't be handled by the standard decoder.  Using other instructions, if possible, will usually improve performance.  See the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual for more information.",
        "SampleAfterValue": "2000003",
        "UMask": "0x30"
    },
@@ -311,4 +311,4 @@
        "SampleAfterValue": "2000003",
        "UMask": "0x1"
    }
-]
+]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/memory.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/memory.json
@@ -442,4 +442,4 @@
        "SampleAfterValue": "100003",
        "UMask": "0x1"
    }
-]
+]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/other.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/other.json
@@ -55,4 +55,4 @@
        "SampleAfterValue": "2000003",
        "UMask": "0x1"
    }
-]
+]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/pipeline.json
@@ -609,7 +609,7 @@
        "UMask": "0x3"
    },
    {
-        "BriefDescription": "Number of occurences waiting for the checkpoints in Resource Allocation Table (RAT) to be recovered after Nuke due to all other cases except JEClear (e.g. whenever a ucode assist is needed like SSE exception, memory disambiguation, etc...).",
+        "BriefDescription": "Number of occurrences waiting for the checkpoints in Resource Allocation Table (RAT) to be recovered after Nuke due to all other cases except JEClear (e.g. whenever a ucode assist is needed like SSE exception, memory disambiguation, etc...).",
        "Counter": "0,1,2,3",
        "CounterHTOff": "0,1,2,3,4,5,6,7",
        "CounterMask": "1",
@@ -652,7 +652,7 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7",
        "EventCode": "0x03",
        "EventName": "LD_BLOCKS.STORE_FORWARD",
-        "PublicDescription": "This event counts loads that followed a store to the same address, where the data could not be forwarded inside the pipeline from the store to the load.  The most common reason why store forwarding would be blocked is when a load's address range overlaps with a preceeding smaller uncompleted store.  See the table of not supported store forwards in the Intel 64 and IA-32 Architectures Optimization Reference Manual.  The penalty for blocked store forwarding is that the load must wait for the store to complete before it can be issued.",
+        "PublicDescription": "This event counts loads that followed a store to the same address, where the data could not be forwarded inside the pipeline from the store to the load.  The most common reason why store forwarding would be blocked is when a load's address range overlaps with a preceding smaller uncompleted store.  See the table of not supported store forwards in the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual.  The penalty for blocked store forwarding is that the load must wait for the store to complete before it can be issued.",
        "SampleAfterValue": "100003",
        "UMask": "0x2"
    },
@@ -778,7 +778,7 @@
        "CounterMask": "1",
        "EventCode": "0x59",
        "EventName": "PARTIAL_RAT_STALLS.FLAGS_MERGE_UOP_CYCLES",
-        "PublicDescription": "This event counts the number of cycles spent executing performance-sensitive flags-merging uops. For example, shift CL (merge_arith_flags). For more details, See the Intel 64 and IA-32 Architectures Optimization Reference Manual.",
+        "PublicDescription": "This event counts the number of cycles spent executing performance-sensitive flags-merging uops. For example, shift CL (merge_arith_flags). For more details, see the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual.",
        "SampleAfterValue": "2000003",
        "UMask": "0x20"
    },
@@ -797,7 +797,7 @@
        "CounterHTOff": "0,1,2,3,4,5,6,7",
        "EventCode": "0x59",
        "EventName": "PARTIAL_RAT_STALLS.SLOW_LEA_WINDOW",
-        "PublicDescription": "This event counts the number of cycles with at least one slow LEA uop being allocated. A uop is generally considered as slow LEA if it has three sources (for example, two sources and immediate) regardless of whether it is a result of LEA instruction or not. Examples of the slow LEA uop are or uops with base, index, and offset source operands using base and index reqisters, where base is EBR/RBP/R13, using RIP relative or 16-bit addressing modes. See the Intel 64 and IA-32 Architectures Optimization Reference Manual for more details about slow LEA instructions.",
+        "PublicDescription": "This event counts the number of cycles with at least one slow LEA uop being allocated. A uop is generally considered as slow LEA if it has three sources (for example, two sources and immediate) regardless of whether it is a result of LEA instruction or not. Examples of the slow LEA uop are or uops with base, index, and offset source operands using base and index registers, where base is EBP/RBP/R13, using RIP relative or 16-bit addressing modes. See the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual for more details about slow LEA instructions.",
        "SampleAfterValue": "2000003",
        "UMask": "0x40"
    },
@@ -1209,4 +1209,4 @@
        "SampleAfterValue": "2000003",
        "UMask": "0x1"
    }
-]
+]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
@@ -124,7 +124,7 @@
        "MetricName": "FLOPc_SMT"
    },
    {
-        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
+        "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
        "MetricExpr": "UOPS_DISPATCHED.THREAD / (( cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else cpu@UOPS_DISPATCHED.CORE\\,cmask\\=1@)",
        "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
        "MetricName": "ILP"
@@ -141,6 +141,12 @@
        "MetricGroup": "Summary;TmaL1",
        "MetricName": "Instructions"
    },
+    {
+        "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
+        "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
+        "MetricGroup": "Pipeline;Ret",
+        "MetricName": "Retire"
+    },
    {
        "BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
        "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
@@ -163,7 +169,8 @@
        "BriefDescription": "Giga Floating Point Operations Per Second",
        "MetricExpr": "( ( 1 * ( FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE ) + 2 * FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE + 4 * ( FP_COMP_OPS_EXE.SSE_PACKED_SINGLE + SIMD_FP_256.PACKED_DOUBLE ) + 8 * SIMD_FP_256.PACKED_SINGLE ) / 1000000000 ) / duration_time",
        "MetricGroup": "Cor;Flops;HPC",
-        "MetricName": "GFLOPs"
+        "MetricName": "GFLOPs",
+        "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
    },
    {
        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
--- a/tools/perf/pmu-events/arch/x86/sandybridge/uncore-other.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/uncore-other.json
@@ -82,10 +82,10 @@
    {
        "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
        "Counter": "Fixed",
+        "EventCode": "0xff",
        "EventName": "UNC_CLOCK.SOCKET",
        "PerPkg": "1",
        "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
-        "UMask": "0x01",
        "Unit": "ARB"
    }
 ]
--- a/tools/perf/pmu-events/arch/x86/sandybridge/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/virtual-memory.json
@@ -146,4 +146,4 @@
        "SampleAfterValue": "100007",
        "UMask": "0x20"
    }
-]
+]