aboutsummaryrefslogtreecommitdiff
path: root/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json')
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json988
1 files changed, 472 insertions, 516 deletions
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
index b72c0e2cb946..8fdf4a4225de 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
@@ -113,42 +113,30 @@
"ScaleUnit": "100%"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions.",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
- "MetricName": "tma_alloc_restriction",
- "MetricThreshold": "tma_alloc_restriction > 0.1",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions",
+ "MetricExpr": "tma_core_bound",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group",
+ "MetricName": "tma_allocation_restriction",
+ "MetricThreshold": "tma_allocation_restriction > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.1)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
+ "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
"DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALL@ / tma_info_core_slots",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALL@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "Default;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.1",
"MetricgroupNoGroup": "TopdownL1;Default",
- "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
- "ScaleUnit": "100%",
- "Unit": "cpu_atom"
- },
- {
- "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
- "DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "tma_backend_bound",
- "MetricGroup": "Default;TopdownL1;tma_L1_group",
- "MetricName": "tma_backend_bound_aux",
- "MetricThreshold": "tma_backend_bound_aux > 0.2",
- "MetricgroupNoGroup": "TopdownL1;Default",
- "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.",
+ "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear",
"DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "(tma_info_core_slots - (cpu_atom@TOPDOWN_FE_BOUND.ALL@ + cpu_atom@TOPDOWN_BE_BOUND.ALL@ + cpu_atom@TOPDOWN_RETIRING.ALL@)) / tma_info_core_slots",
+ "MetricExpr": "(5 * cpu_atom@CPU_CLK_UNHALTED.CORE@ - (cpu_atom@TOPDOWN_FE_BOUND.ALL@ + cpu_atom@TOPDOWN_BE_BOUND.ALL@ + cpu_atom@TOPDOWN_RETIRING.ALL@)) / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "Default;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15",
@@ -158,644 +146,564 @@
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of uops that are not from the microsequencer.",
- "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@) / tma_info_core_slots",
- "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
- "MetricName": "tma_base",
- "MetricThreshold": "tma_base > 0.6",
- "MetricgroupNoGroup": "TopdownL2",
- "ScaleUnit": "100%",
- "Unit": "cpu_atom"
- },
- {
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_DETECT@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_DETECT@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group",
"MetricName": "tma_branch_detect",
- "MetricThreshold": "tma_branch_detect > 0.05",
- "PublicDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+ "MetricThreshold": "tma_branch_detect > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)",
+ "PublicDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts.",
- "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MISPREDICT@ / tma_info_core_slots",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts",
+ "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MISPREDICT@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
"MetricName": "tma_branch_mispredicts",
- "MetricThreshold": "tma_branch_mispredicts > 0.05",
+ "MetricThreshold": "tma_branch_mispredicts > 0.05 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_RESTEER@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.BRANCH_RESTEER@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group",
"MetricName": "tma_branch_resteer",
- "MetricThreshold": "tma_branch_resteer > 0.05",
+ "MetricThreshold": "tma_branch_resteer > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.CISC@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.CISC@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group",
"MetricName": "tma_cisc",
- "MetricThreshold": "tma_cisc > 0.05",
+ "MetricThreshold": "tma_cisc > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles due to backend bound stalls that are core execution bound and not attributed to outstanding demand load or store stalls.",
- "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)",
+ "BriefDescription": "Counts the number of cycles due to backend bound stalls that are bounded by core restrictions and not attributed to an outstanding load or stores, or resource limitation",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound",
- "MetricThreshold": "tma_core_bound > 0.1",
+ "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.DECODE@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.DECODE@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group",
"MetricName": "tma_decode",
- "MetricThreshold": "tma_decode > 0.05",
+ "MetricThreshold": "tma_decode > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory disambiguation.",
- "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.DISAMBIGUATION@ / cpu_atom@MACHINE_CLEARS.SLOW@)",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group",
- "MetricName": "tma_disambiguation",
- "MetricThreshold": "tma_disambiguation > 0.02",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear that does not require the use of microcode, classified as a fast nuke, due to memory ordering, memory disambiguation and memory renaming",
+ "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.FASTNUKE@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group",
+ "MetricName": "tma_fast_nuke",
+ "MetricThreshold": "tma_fast_nuke > 0.05 & (tma_machine_clears > 0.05 & tma_bad_speculation > 0.15)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
- "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
- "MetricName": "tma_dram_bound",
- "MetricThreshold": "tma_dram_bound > 0.1",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.",
+ "DefaultMetricgroupName": "TopdownL1",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ALL@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "Default;TopdownL1;tma_L1_group",
+ "MetricName": "tma_frontend_bound",
+ "MetricThreshold": "tma_frontend_bound > 0.2",
+ "MetricgroupNoGroup": "TopdownL1;Default",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear classified as a fast nuke due to memory ordering, memory disambiguation and memory renaming.",
- "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.FASTNUKE@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group",
- "MetricName": "tma_fast_nuke",
- "MetricThreshold": "tma_fast_nuke > 0.05",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ICACHE@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group",
+ "MetricName": "tma_icache_misses",
+ "MetricThreshold": "tma_icache_misses > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH@ / tma_info_core_slots",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
- "MetricName": "tma_fetch_bandwidth",
- "MetricThreshold": "tma_fetch_bandwidth > 0.1",
+ "MetricName": "tma_ifetch_bandwidth",
+ "MetricThreshold": "tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_LATENCY@ / tma_info_core_slots",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend latency restrictions due to icache misses, itlb misses, branch detection, and resteer limitations.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.FRONTEND_LATENCY@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
- "MetricName": "tma_fetch_latency",
- "MetricThreshold": "tma_fetch_latency > 0.15",
+ "MetricName": "tma_ifetch_latency",
+ "MetricThreshold": "tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to FP assists.",
- "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.FP_ASSIST@ / cpu_atom@MACHINE_CLEARS.SLOW@)",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group",
- "MetricName": "tma_fp_assist",
- "MetricThreshold": "tma_fp_assist > 0.02",
- "ScaleUnit": "100%",
- "Unit": "cpu_atom"
- },
- {
- "BriefDescription": "Counts the number of floating point divide operations per uop.",
- "MetricExpr": "cpu_atom@UOPS_RETIRED.FPDIV@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group",
- "MetricName": "tma_fpdiv_uops",
- "MetricThreshold": "tma_fpdiv_uops > 0.2",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of time that retirement is stalled due to a first level data TLB miss",
+ "MetricExpr": "100 * (cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ + cpu_atom@LD_HEAD.PGWALK_AT_RET@) / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricName": "tma_info_bottleneck_%_dtlb_miss_bound_cycles",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.",
- "DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ALL@ / tma_info_core_slots",
- "MetricGroup": "Default;TopdownL1;tma_L1_group",
- "MetricName": "tma_frontend_bound",
- "MetricThreshold": "tma_frontend_bound > 0.2",
- "MetricgroupNoGroup": "TopdownL1;Default",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of time that allocation and retirement is stalled by the Frontend Cluster due to an Ifetch Miss, either Icache or ITLB Miss",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricGroup": "Ifetch",
+ "MetricName": "tma_info_bottleneck_%_ifetch_miss_bound_cycles",
+ "PublicDescription": "Percentage of time that allocation and retirement is stalled by the Frontend Cluster due to an Ifetch Miss, either Icache or ITLB Miss. See Info.Ifetch_Bound",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ICACHE@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group",
- "MetricName": "tma_icache_misses",
- "MetricThreshold": "tma_icache_misses > 0.05",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of time that retirement is stalled due to an L1 miss",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.LOAD@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricGroup": "Load_Store_Miss",
+ "MetricName": "tma_info_bottleneck_%_load_miss_bound_cycles",
+ "PublicDescription": "Percentage of time that retirement is stalled due to an L1 miss. See Info.Load_Miss_Bound",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "",
- "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@",
- "MetricName": "tma_info_core_clks",
+ "BriefDescription": "Percentage of time that retirement is stalled by the Memory Cluster due to a pipeline stall",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.ANY_AT_RET@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricGroup": "Mem_Exec",
+ "MetricName": "tma_info_bottleneck_%_mem_exec_bound_cycles",
+ "PublicDescription": "Percentage of time that retirement is stalled by the Memory Cluster due to a pipeline stall. See Info.Mem_Exec_Bound",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "",
- "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE_P@",
- "MetricName": "tma_info_core_clks_p",
+ "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.ALL_BRANCHES@",
+ "MetricName": "tma_info_br_inst_mix_ipbranch",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Cycles Per Instruction",
- "MetricExpr": "tma_info_core_clks / INST_RETIRED.ANY",
- "MetricName": "tma_info_core_cpi",
+ "BriefDescription": "Instruction per (near) call (lower number means higher occurrence rate)",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.CALL@",
+ "MetricName": "tma_info_br_inst_mix_ipcall",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions Per Cycle",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / tma_info_core_clks",
- "MetricName": "tma_info_core_ipc",
+ "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_INST_RETIRED.FAR_BRANCH@u",
+ "MetricName": "tma_info_br_inst_mix_ipfarbranch",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "",
- "MetricExpr": "5 * tma_info_core_clks",
- "MetricName": "tma_info_core_slots",
+ "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@BR_MISP_RETIRED.COND@ - cpu_atom@BR_MISP_RETIRED.COND_TAKEN@)",
+ "MetricName": "tma_info_br_inst_mix_ipmisp_cond_ntaken",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Uops Per Instruction",
- "MetricExpr": "cpu_atom@UOPS_RETIRED.ALL@ / INST_RETIRED.ANY",
- "MetricName": "tma_info_core_upi",
+ "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was taken",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.COND_TAKEN@",
+ "MetricName": "tma_info_br_inst_mix_ipmisp_cond_taken",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percent of instruction miss cost that hit in DRAM",
- "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@",
- "MetricName": "tma_info_frontend_inst_miss_cost_dramhit_percent",
+ "BriefDescription": "Instructions per retired indirect call or jump Branch Misprediction",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.INDIRECT@",
+ "MetricName": "tma_info_br_inst_mix_ipmisp_indirect",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percent of instruction miss cost that hit in the L2",
- "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@",
- "MetricName": "tma_info_frontend_inst_miss_cost_l2hit_percent",
+ "BriefDescription": "Instructions per retired return Branch Misprediction",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.RETURN@",
+ "MetricName": "tma_info_br_inst_mix_ipmisp_ret",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percent of instruction miss cost that hit in the L3",
- "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@",
- "MetricName": "tma_info_frontend_inst_miss_cost_l3hit_percent",
+ "BriefDescription": "Instructions per retired Branch Misprediction",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@",
+ "MetricName": "tma_info_br_inst_mix_ipmispredict",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Ratio of all branches which mispredict",
- "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / BR_INST_RETIRED.ALL_BRANCHES",
- "MetricName": "tma_info_inst_mix_branch_mispredict_ratio",
+ "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / cpu_atom@BR_INST_RETIRED.ALL_BRANCHES@",
+ "MetricName": "tma_info_br_mispredict_bound_branch_mispredict_ratio",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Ratio between Mispredicted branches and unknown branches",
- "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / BACLEARS.ANY",
- "MetricName": "tma_info_inst_mix_branch_mispredict_to_unknown_branch_ratio",
+ "MetricExpr": "cpu_atom@BR_MISP_RETIRED.ALL_BRANCHES@ / cpu_atom@BACLEARS.ANY@",
+ "MetricName": "tma_info_br_mispredict_bound_branch_mispredict_to_unknown_branch_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of all uops which are FPDiv uops",
- "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / UOPS_RETIRED.ALL",
- "MetricName": "tma_info_inst_mix_fpdiv_uop_ratio",
+ "BriefDescription": "Percentage of time that allocation is stalled due to load buffer full",
+ "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.LD_BUF@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricName": "tma_info_buffer_stalls_%_load_buffer_stall_cycles",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of all uops which are IDiv uops",
- "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / UOPS_RETIRED.ALL",
- "MetricName": "tma_info_inst_mix_idiv_uop_ratio",
+ "BriefDescription": "Percentage of time that allocation is stalled due to memory reservation stations full",
+ "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.RSV@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricName": "tma_info_buffer_stalls_%_mem_rsv_stall_cycles",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / BR_INST_RETIRED.ALL_BRANCHES",
- "MetricName": "tma_info_inst_mix_ipbranch",
+ "BriefDescription": "Percentage of time that allocation is stalled due to store buffer full",
+ "MetricExpr": "100 * cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricName": "tma_info_buffer_stalls_%_store_buffer_stall_cycles",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instruction per (near) call (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / BR_INST_RETIRED.CALL",
- "MetricName": "tma_info_inst_mix_ipcall",
+ "BriefDescription": "Cycles Per Instruction",
+ "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_core_cpi",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per Far Branch",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@BR_INST_RETIRED.FAR_BRANCH@ / 2)",
- "MetricName": "tma_info_inst_mix_ipfarbranch",
+ "BriefDescription": "Instructions Per Cycle",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricName": "tma_info_core_ipc",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per Load",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / MEM_UOPS_RETIRED.ALL_LOADS",
- "MetricName": "tma_info_inst_mix_ipload",
+ "BriefDescription": "Uops Per Instruction",
+ "MetricExpr": "cpu_atom@UOPS_RETIRED.ALL@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_core_upi",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was not taken",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / (cpu_atom@BR_MISP_RETIRED.COND@ - cpu_atom@BR_MISP_RETIRED.COND_TAKEN@)",
- "MetricName": "tma_info_inst_mix_ipmisp_cond_ntaken",
+ "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss hits in the L2",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@",
+ "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l2hit",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per retired conditional Branch Misprediction where the branch was taken",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / BR_MISP_RETIRED.COND_TAKEN",
- "MetricName": "tma_info_inst_mix_ipmisp_cond_taken",
+ "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss hits in the L3",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@",
+ "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l3hit",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per retired indirect call or jump Branch Misprediction",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / BR_MISP_RETIRED.INDIRECT",
- "MetricName": "tma_info_inst_mix_ipmisp_indirect",
+ "BriefDescription": "Percentage of ifetch miss bound stalls, where the ifetch miss subsequently misses in the L3",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.IFETCH_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.IFETCH@",
+ "MetricName": "tma_info_ifetch_miss_bound_%_ifetchmissbound_with_l3miss",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per retired return Branch Misprediction",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / BR_MISP_RETIRED.RETURN",
- "MetricName": "tma_info_inst_mix_ipmisp_ret",
+ "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that hit the L2",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
+ "MetricGroup": "load_store_bound",
+ "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l2hit",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per retired Branch Misprediction",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricName": "tma_info_inst_mix_ipmispredict",
+ "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that hit the L3",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
+ "MetricGroup": "load_store_bound",
+ "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l3hit",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Instructions per Store",
- "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / MEM_UOPS_RETIRED.ALL_STORES",
- "MetricName": "tma_info_inst_mix_ipstore",
+ "BriefDescription": "Percentage of memory bound stalls where retirement is stalled due to an L1 miss that subsequently misses the L3",
+ "MetricExpr": "100 * cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
+ "MetricGroup": "load_store_bound",
+ "MetricName": "tma_info_load_miss_bound_%_loadmissbound_with_l3miss",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of all uops which are ucode ops",
- "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / UOPS_RETIRED.ALL",
- "MetricName": "tma_info_inst_mix_microcode_uop_ratio",
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a pipeline block",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricGroup": "load_store_bound",
+ "MetricName": "tma_info_load_store_bound_l1_bound",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of all uops which are x87 uops",
- "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / UOPS_RETIRED.ALL",
- "MetricName": "tma_info_inst_mix_x87_uop_ratio",
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement",
+ "MetricExpr": "100 * (cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ + cpu_atom@MEM_BOUND_STALLS.LOAD@) / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricGroup": "load_store_bound",
+ "MetricName": "tma_info_load_store_bound_load_bound",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block",
- "MetricExpr": "100 * cpu_atom@LD_BLOCKS.4K_ALIAS@ / MEM_UOPS_RETIRED.ALL_LOADS",
- "MetricName": "tma_info_l1_bound_address_alias_blocks",
+ "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full",
+ "MetricExpr": "100 * (cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@MEM_SCHEDULER_BLOCK.ALL@) * tma_mem_scheduler",
+ "MetricGroup": "load_store_bound",
+ "MetricName": "tma_info_load_store_bound_store_bound",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of total non-speculative loads that are splits",
- "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.SPLIT_LOADS@ / MEM_UOPS_RETIRED.ALL_LOADS",
- "MetricName": "tma_info_l1_bound_load_splits",
+ "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to memory disambiguation",
+ "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.DISAMBIGUATION@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_machine_clear_bound_machine_clears_disamb_pki",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block",
- "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / MEM_UOPS_RETIRED.ALL_LOADS",
- "MetricName": "tma_info_l1_bound_store_fwd_blocks",
+ "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to floating point assists",
+ "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.FP_ASSIST@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_machine_clear_bound_machine_clears_fp_assist_pki",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Cycle cost per DRAM hit",
- "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_DRAM_HIT@ / MEM_LOAD_UOPS_RETIRED.DRAM_HIT",
- "MetricName": "tma_info_memory_cycles_per_demand_load_dram_hit",
+ "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to memory ordering",
+ "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_machine_clear_bound_machine_clears_monuke_pki",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Cycle cost per L2 hit",
- "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / MEM_LOAD_UOPS_RETIRED.L2_HIT",
- "MetricName": "tma_info_memory_cycles_per_demand_load_l2_hit",
+ "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to memory renaming",
+ "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.MRN_NUKE@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_machine_clear_bound_machine_clears_mrn_pki",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Cycle cost per LLC hit",
- "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / MEM_LOAD_UOPS_RETIRED.L3_HIT",
- "MetricName": "tma_info_memory_cycles_per_demand_load_l3_hit",
+ "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to page faults",
+ "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.PAGE_FAULT@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_machine_clear_bound_machine_clears_page_fault_pki",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "load ops retired per 1000 instruction",
- "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / INST_RETIRED.ANY",
- "MetricName": "tma_info_memory_memloadpki",
+ "BriefDescription": "Counts the number of machine clears relative to thousands of instructions retired, due to self-modifying code",
+ "MetricExpr": "1e3 * cpu_atom@MACHINE_CLEARS.SMC@ / cpu_atom@INST_RETIRED.ANY@",
+ "MetricName": "tma_info_machine_clear_bound_machine_clears_smc_pki",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Average CPU Utilization",
- "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC",
- "MetricName": "tma_info_system_cpu_utilization",
+ "BriefDescription": "Percentage of total non-speculative loads with an address aliasing block",
+ "MetricExpr": "100 * cpu_atom@LD_BLOCKS.4K_ALIAS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@",
+ "MetricName": "tma_info_mem_exec_blocks_%_loads_with_adressaliasing",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Fraction of cycles spent in Kernel mode",
- "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@k / CPU_CLK_UNHALTED.CORE",
- "MetricGroup": "Summary",
- "MetricName": "tma_info_system_kernel_utilization",
+ "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block",
+ "MetricExpr": "100 * cpu_atom@LD_BLOCKS.DATA_UNKNOWN@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@",
+ "MetricName": "tma_info_mem_exec_blocks_%_loads_with_storefwdblk",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "tma_info_core_clks / CPU_CLK_UNHALTED.REF_TSC",
- "MetricGroup": "Power",
- "MetricName": "tma_info_system_turbo_utilization",
+ "BriefDescription": "Percentage of Memory Execution Bound due to a first level data cache miss",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.L1_MISS_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@",
+ "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_l1miss",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ITLB@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_latency_group",
- "MetricName": "tma_itlb_misses",
- "MetricThreshold": "tma_itlb_misses > 0.05",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of Memory Execution Bound due to other block cases, such as pipeline conflicts, fences, etc",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.OTHER_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@",
+ "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_otherpipelineblks",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a load block.",
- "MetricExpr": "cpu_atom@LD_HEAD.L1_BOUND_AT_RET@ / tma_info_core_clks",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
- "MetricName": "tma_l1_bound",
- "MetricThreshold": "tma_l1_bound > 0.1",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of Memory Execution Bound due to a pagewalk",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.PGWALK_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@",
+ "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_pagewalk",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.",
- "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_L2_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
- "MetricName": "tma_l2_bound",
- "MetricThreshold": "tma_l2_bound > 0.1",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of Memory Execution Bound due to a second level TLB miss",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@",
+ "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_stlbhit",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
- "MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
- "MetricName": "tma_l3_bound",
- "MetricThreshold": "tma_l3_bound > 0.1",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of Memory Execution Bound due to a store forward address match",
+ "MetricExpr": "100 * cpu_atom@LD_HEAD.ST_ADDR_AT_RET@ / cpu_atom@LD_HEAD.ANY_AT_RET@",
+ "MetricName": "tma_info_mem_exec_bound_%_loadhead_with_storefwding",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to load buffer full",
- "MetricExpr": "tma_mem_scheduler * cpu_atom@MEM_SCHEDULER_BLOCK.LD_BUF@ / MEM_SCHEDULER_BLOCK.ALL",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_mem_scheduler_group",
- "MetricName": "tma_ld_buffer",
- "MetricThreshold": "tma_ld_buffer > 0.05",
- "ScaleUnit": "100%",
+ "BriefDescription": "Instructions per Load",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@",
+ "MetricName": "tma_info_mem_mix_ipload",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
- "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS@ / tma_info_core_slots",
- "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
- "MetricName": "tma_machine_clears",
- "MetricThreshold": "tma_machine_clears > 0.05",
- "MetricgroupNoGroup": "TopdownL2",
- "ScaleUnit": "100%",
+ "BriefDescription": "Instructions per Store",
+ "MetricExpr": "cpu_atom@INST_RETIRED.ANY@ / cpu_atom@MEM_UOPS_RETIRED.ALL_STORES@",
+ "MetricName": "tma_info_mem_mix_ipstore",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.MEM_SCHEDULER@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
- "MetricName": "tma_mem_scheduler",
- "MetricThreshold": "tma_mem_scheduler > 0.1",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of total non-speculative loads that perform one or more locks",
+ "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.LOCK_LOADS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@",
+ "MetricName": "tma_info_mem_mix_load_locks_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads.",
- "MetricExpr": "min(tma_backend_bound, cpu_atom@LD_HEAD.ANY_AT_RET@ / tma_info_core_clks + tma_store_bound)",
- "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
- "MetricName": "tma_memory_bound",
- "MetricThreshold": "tma_memory_bound > 0.2",
- "MetricgroupNoGroup": "TopdownL2",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of total non-speculative loads that are splits",
+ "MetricExpr": "100 * cpu_atom@MEM_UOPS_RETIRED.SPLIT_LOADS@ / cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@",
+ "MetricName": "tma_info_mem_mix_load_splits_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory ordering.",
- "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_atom@MACHINE_CLEARS.SLOW@)",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group",
- "MetricName": "tma_memory_ordering",
- "MetricThreshold": "tma_memory_ordering > 0.02",
- "ScaleUnit": "100%",
+ "BriefDescription": "Ratio of mem load uops to all uops",
+ "MetricExpr": "1e3 * cpu_atom@MEM_UOPS_RETIRED.ALL_LOADS@ / cpu_atom@UOPS_RETIRED.ALL@",
+ "MetricName": "tma_info_mem_mix_memload_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)",
- "MetricExpr": "cpu_atom@UOPS_RETIRED.MS@ / tma_info_core_slots",
- "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
- "MetricName": "tma_ms_uops",
- "MetricThreshold": "tma_ms_uops > 0.05",
- "MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of time that the core is stalled due to a TPAUSE or UMWAIT instruction",
+ "MetricExpr": "100 * cpu_atom@SERIALIZATION.C01_MS_SCB@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricName": "tma_info_serialization _%_tpause_cycles",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
- "MetricName": "tma_non_mem_scheduler",
- "MetricThreshold": "tma_non_mem_scheduler > 0.1",
- "ScaleUnit": "100%",
+ "BriefDescription": "Average CPU Utilization",
+ "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC",
+ "MetricName": "tma_info_system_cpu_utilization",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear (slow nuke).",
- "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.NUKE@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group",
- "MetricName": "tma_nuke",
- "MetricThreshold": "tma_nuke > 0.05",
- "ScaleUnit": "100%",
+ "BriefDescription": "Fraction of cycles spent in Kernel mode",
+ "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE_P@k / cpu_atom@CPU_CLK_UNHALTED.CORE@",
+ "MetricGroup": "Summary",
+ "MetricName": "tma_info_system_kernel_utilization",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.OTHER@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
- "MetricName": "tma_other_fb",
- "MetricThreshold": "tma_other_fb > 0.05",
- "ScaleUnit": "100%",
+ "BriefDescription": "Average Frequency Utilization relative nominal frequency",
+ "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.CORE@ / cpu_atom@CPU_CLK_UNHALTED.REF_TSC@",
+ "MetricGroup": "Power",
+ "MetricName": "tma_info_system_turbo_utilization",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a number of other load blocks.",
- "MetricExpr": "cpu_atom@LD_HEAD.OTHER_AT_RET@ / tma_info_core_clks",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
- "MetricName": "tma_other_l1",
- "MetricThreshold": "tma_other_l1 > 0.05",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of all uops which are FPDiv uops",
+ "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.FPDIV@ / cpu_atom@UOPS_RETIRED.ALL@",
+ "MetricName": "tma_info_uop_mix_fpdiv_uop_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hits in the L2, LLC, DRAM or MMIO (Non-DRAM) but could not be correctly attributed or cycles in which the load miss is waiting on a request buffer.",
- "MetricExpr": "max(0, tma_memory_bound - (tma_store_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_dram_bound))",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
- "MetricName": "tma_other_load_store",
- "MetricThreshold": "tma_other_load_store > 0.1",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of all uops which are IDiv uops",
+ "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.IDIV@ / cpu_atom@UOPS_RETIRED.ALL@",
+ "MetricName": "tma_info_uop_mix_idiv_uop_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.",
- "MetricExpr": "(cpu_atom@TOPDOWN_RETIRING.ALL@ - cpu_atom@UOPS_RETIRED.MS@ - cpu_atom@UOPS_RETIRED.FPDIV@) / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_base_group",
- "MetricName": "tma_other_ret",
- "MetricThreshold": "tma_other_ret > 0.3",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of all uops which are microcode ops",
+ "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.MS@ / cpu_atom@UOPS_RETIRED.ALL@",
+ "MetricName": "tma_info_uop_mix_microcode_uop_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to page faults.",
- "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.PAGE_FAULT@ / cpu_atom@MACHINE_CLEARS.SLOW@)",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group",
- "MetricName": "tma_page_fault",
- "MetricThreshold": "tma_page_fault > 0.02",
- "ScaleUnit": "100%",
+ "BriefDescription": "Percentage of all uops which are x87 uops",
+ "MetricExpr": "100 * cpu_atom@UOPS_RETIRED.X87@ / cpu_atom@UOPS_RETIRED.ALL@",
+ "MetricName": "tma_info_uop_mix_x87_uop_ratio",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.",
- "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.PREDECODE@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
- "MetricName": "tma_predecode",
- "MetricThreshold": "tma_predecode > 0.05",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.ITLB@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_latency_group",
+ "MetricName": "tma_itlb_misses",
+ "MetricThreshold": "tma_itlb_misses > 0.05 & (tma_ifetch_latency > 0.15 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REGISTER@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
- "MetricName": "tma_register",
- "MetricThreshold": "tma_register > 0.1",
+ "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation",
+ "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "MetricThreshold": "tma_machine_clears > 0.05 & tma_bad_speculation > 0.15",
+ "MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls).",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REORDER_BUFFER@ / tma_info_core_slots",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.MEM_SCHEDULER@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
"MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
- "MetricName": "tma_reorder_buffer",
- "MetricThreshold": "tma_reorder_buffer > 0.1",
- "ScaleUnit": "100%",
- "Unit": "cpu_atom"
- },
- {
- "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
- "MetricExpr": "tma_backend_bound",
- "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group",
- "MetricName": "tma_resource_bound",
- "MetricThreshold": "tma_resource_bound > 0.2",
- "MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count.",
+ "MetricName": "tma_mem_scheduler",
+ "MetricThreshold": "tma_mem_scheduler > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that result in retirement slots.",
- "DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_atom@TOPDOWN_RETIRING.ALL@ / tma_info_core_slots",
- "MetricGroup": "Default;TopdownL1;tma_L1_group",
- "MetricName": "tma_retiring",
- "MetricThreshold": "tma_retiring > 0.75",
- "MetricgroupNoGroup": "TopdownL1;Default",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
+ "MetricName": "tma_non_mem_scheduler",
+ "MetricThreshold": "tma_non_mem_scheduler > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to RSV full relative",
- "MetricExpr": "tma_mem_scheduler * cpu_atom@MEM_SCHEDULER_BLOCK.RSV@ / MEM_SCHEDULER_BLOCK.ALL",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_mem_scheduler_group",
- "MetricName": "tma_rsv",
- "MetricThreshold": "tma_rsv > 0.05",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear that requires the use of microcode (slow nuke)",
+ "MetricExpr": "cpu_atom@TOPDOWN_BAD_SPECULATION.NUKE@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_machine_clears_group",
+ "MetricName": "tma_nuke",
+ "MetricThreshold": "tma_nuke > 0.05 & (tma_machine_clears > 0.05 & tma_bad_speculation > 0.15)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).",
- "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.SERIALIZATION@ / tma_info_core_slots",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
- "MetricName": "tma_serialization",
- "MetricThreshold": "tma_serialization > 0.1",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.OTHER@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group",
+ "MetricName": "tma_other_fb",
+ "MetricThreshold": "tma_other_fb > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to SMC.",
- "MetricExpr": "tma_nuke * (cpu_atom@MACHINE_CLEARS.SMC@ / cpu_atom@MACHINE_CLEARS.SLOW@)",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_nuke_group",
- "MetricName": "tma_smc",
- "MetricThreshold": "tma_smc > 0.02",
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.",
+ "MetricExpr": "cpu_atom@TOPDOWN_FE_BOUND.PREDECODE@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_ifetch_bandwidth_group",
+ "MetricName": "tma_predecode",
+ "MetricThreshold": "tma_predecode > 0.05 & (tma_ifetch_bandwidth > 0.1 & tma_frontend_bound > 0.2)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to store buffer full",
- "MetricExpr": "tma_store_bound",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_mem_scheduler_group",
- "MetricName": "tma_st_buffer",
- "MetricThreshold": "tma_st_buffer > 0.05",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls)",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REGISTER@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
+ "MetricName": "tma_register",
+ "MetricThreshold": "tma_register > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a first level TLB miss.",
- "MetricExpr": "cpu_atom@LD_HEAD.DTLB_MISS_AT_RET@ / tma_info_core_clks",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
- "MetricName": "tma_stlb_hit",
- "MetricThreshold": "tma_stlb_hit > 0.05",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls)",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.REORDER_BUFFER@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
+ "MetricName": "tma_reorder_buffer",
+ "MetricThreshold": "tma_reorder_buffer > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a second level TLB miss requiring a page walk.",
- "MetricExpr": "cpu_atom@LD_HEAD.PGWALK_AT_RET@ / tma_info_core_clks",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
- "MetricName": "tma_stlb_miss",
- "MetricThreshold": "tma_stlb_miss > 0.05",
+ "BriefDescription": "Counts the number of cycles the core is stalled due to a resource limitation",
+ "MetricExpr": "tma_backend_bound - tma_core_bound",
+ "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_resource_bound",
+ "MetricThreshold": "tma_resource_bound > 0.2 & tma_backend_bound > 0.1",
+ "MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full.",
- "MetricExpr": "tma_mem_scheduler * (cpu_atom@MEM_SCHEDULER_BLOCK.ST_BUF@ / cpu_atom@MEM_SCHEDULER_BLOCK.ALL@)",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
- "MetricName": "tma_store_bound",
- "MetricThreshold": "tma_store_bound > 0.1",
+ "BriefDescription": "Counts the number of issue slots that result in retirement slots",
+ "DefaultMetricgroupName": "TopdownL1",
+ "MetricExpr": "cpu_atom@TOPDOWN_RETIRING.ALL@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "Default;TopdownL1;tma_L1_group",
+ "MetricName": "tma_retiring",
+ "MetricThreshold": "tma_retiring > 0.75",
+ "MetricgroupNoGroup": "TopdownL1;Default",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.",
- "MetricExpr": "cpu_atom@LD_HEAD.ST_ADDR_AT_RET@ / tma_info_core_clks",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
- "MetricName": "tma_store_fwd_blk",
- "MetricThreshold": "tma_store_fwd_blk > 0.05",
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS)",
+ "MetricExpr": "cpu_atom@TOPDOWN_BE_BOUND.SERIALIZATION@ / (5 * cpu_atom@CPU_CLK_UNHALTED.CORE@)",
+ "MetricGroup": "TopdownL3;tma_L3_group;tma_resource_bound_group",
+ "MetricName": "tma_serialization",
+ "MetricThreshold": "tma_serialization > 0.1 & (tma_resource_bound > 0.2 & tma_backend_bound > 0.1)",
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
@@ -818,7 +726,7 @@
{
"BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
"MetricExpr": "78 * cpu_core@ASSISTS.ANY@ / tma_info_thread_slots",
- "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
+ "MetricGroup": "BvIO;TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
"MetricName": "tma_assists",
"MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
"PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: ASSISTS.ANY",
@@ -838,7 +746,7 @@
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
"DefaultMetricgroupName": "TopdownL1",
"MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots",
- "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
+ "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1;Default",
@@ -861,7 +769,7 @@
{
"BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
"MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots",
- "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
+ "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
@@ -920,7 +828,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
"MetricExpr": "(25 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_core_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
- "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
+ "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_contested_accesses",
"MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS. Related metrics: tma_data_sharing, tma_false_sharing, tma_machine_clears, tma_remote_cache",
@@ -941,7 +849,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
"MetricExpr": "24 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD@ + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (1 - cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
- "MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
+ "MetricGroup": "BvMS;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_data_sharing",
"MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD. Related metrics: tma_contested_accesses, tma_false_sharing, tma_machine_clears, tma_remote_cache",
@@ -961,7 +869,7 @@
{
"BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
"MetricExpr": "cpu_core@ARITH.DIV_ACTIVE@ / tma_info_thread_clks",
- "MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group",
+ "MetricGroup": "BvCB;TopdownL3;tma_L3_group;tma_core_bound_group",
"MetricName": "tma_divider",
"MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
"PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
@@ -994,14 +902,14 @@
"MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
"MetricName": "tma_dsb_switches",
"MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
- "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS. Related metrics: tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
{
"BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
"MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_LOAD_MISSES.WALK_ACTIVE@, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks",
- "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
+ "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
"MetricName": "tma_dtlb_load",
"MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS. Related metrics: tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
@@ -1011,7 +919,7 @@
{
"BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
"MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + cpu_core@DTLB_STORE_MISSES.WALK_ACTIVE@) / tma_info_core_core_clks",
- "MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
+ "MetricGroup": "BvMT;MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
"MetricName": "tma_dtlb_store",
"MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS. Related metrics: tma_dtlb_load, tma_info_bottleneck_memory_data_tlbs, tma_info_bottleneck_memory_synchronization",
@@ -1021,7 +929,7 @@
{
"BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
"MetricExpr": "28 * tma_info_system_core_frequency * cpu_core@OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM@ / tma_info_thread_clks",
- "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
+ "MetricGroup": "BvMS;DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
"MetricName": "tma_false_sharing",
"MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM. Related metrics: tma_contested_accesses, tma_data_sharing, tma_machine_clears, tma_remote_cache",
@@ -1031,7 +939,7 @@
{
"BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
"MetricExpr": "cpu_core@L1D_PEND_MISS.FB_FULL@ / tma_info_thread_clks",
- "MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
+ "MetricGroup": "BvMS;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
"MetricName": "tma_fb_full",
"MetricThreshold": "tma_fb_full > 0.3",
"PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
@@ -1045,7 +953,7 @@
"MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.2",
"MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -1092,7 +1000,7 @@
},
{
"BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
- "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)",
"MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
"MetricName": "tma_fp_scalar",
"MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
@@ -1102,7 +1010,7 @@
},
{
"BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
- "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)",
"MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
"MetricName": "tma_fp_vector",
"MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
@@ -1134,7 +1042,7 @@
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
"DefaultMetricgroupName": "TopdownL1",
"MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) - cpu_core@INT_MISC.UOP_DROPPING@ / tma_info_thread_slots",
- "MetricGroup": "Default;PGO;TmaL1;TopdownL1;tma_L1_group",
+ "MetricGroup": "BvFB;BvIO;Default;PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1;Default",
@@ -1145,7 +1053,7 @@
{
"BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
"MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.MACRO_FUSED@ / (tma_retiring * tma_info_thread_slots)",
- "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+ "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
"MetricName": "tma_fused_instructions",
"MetricThreshold": "tma_fused_instructions > 0.1 & tma_light_operations > 0.6",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. CMP+JCC or DEC+JCC are common examples of legacy fusions. {([MTL] Note new MOV+OP and Load+OP fusions appear under Other_Light_Ops in MTL!)}",
@@ -1166,7 +1074,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
"MetricExpr": "cpu_core@ICACHE_DATA.STALLS@ / tma_info_thread_clks",
- "MetricGroup": "BigFootprint;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+ "MetricGroup": "BigFootprint;BvBC;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_icache_misses",
"MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
@@ -1183,7 +1091,7 @@
},
{
"BriefDescription": "Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.COND_NTAKEN",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.COND_NTAKEN@",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "tma_info_bad_spec_ipmisp_cond_ntaken",
"MetricThreshold": "tma_info_bad_spec_ipmisp_cond_ntaken < 200",
@@ -1191,7 +1099,7 @@
},
{
"BriefDescription": "Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.COND_TAKEN",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.COND_TAKEN@",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "tma_info_bad_spec_ipmisp_cond_taken",
"MetricThreshold": "tma_info_bad_spec_ipmisp_cond_taken < 200",
@@ -1199,7 +1107,7 @@
},
{
"BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.INDIRECT",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.INDIRECT@",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "tma_info_bad_spec_ipmisp_indirect",
"MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3",
@@ -1207,7 +1115,7 @@
},
{
"BriefDescription": "Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.RET",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.RET@",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "tma_info_bad_spec_ipmisp_ret",
"MetricThreshold": "tma_info_bad_spec_ipmisp_ret < 500",
@@ -1215,7 +1123,7 @@
},
{
"BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@",
"MetricGroup": "Bad;BadSpec;BrMispredicts",
"MetricName": "tma_info_bad_spec_ipmispredict",
"MetricThreshold": "tma_info_bad_spec_ipmispredict < 200",
@@ -1237,12 +1145,21 @@
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck",
+ "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_lsd + tma_mite)))",
+ "MetricGroup": "DSB;FetchBW;tma_issueFB",
+ "MetricName": "tma_info_botlnk_l2_dsb_bandwidth",
+ "MetricThreshold": "tma_info_botlnk_l2_dsb_bandwidth > 10",
+ "PublicDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck",
"MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))",
"MetricGroup": "DSBmiss;Fed;tma_issueFB",
"MetricName": "tma_info_botlnk_l2_dsb_misses",
"MetricThreshold": "tma_info_botlnk_l2_dsb_misses > 10",
- "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
+ "PublicDescription": "Total pipeline cost of DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"Unit": "cpu_core"
},
{
@@ -1255,33 +1172,26 @@
"Unit": "cpu_core"
},
{
- "BriefDescription": "Total pipeline cost of \"useful operations\" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.",
- "MetricExpr": "100 * (tma_retiring - (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + cpu_core@BR_INST_RETIRED.NEAR_CALL@) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
- "MetricGroup": "Ret",
- "MetricName": "tma_info_bottleneck_base_non_br",
- "MetricThreshold": "tma_info_bottleneck_base_non_br > 20",
- "Unit": "cpu_core"
- },
- {
"BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
"MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
- "MetricGroup": "BigFootprint;Fed;Frontend;IcMiss;MemoryTLB",
+ "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB",
"MetricName": "tma_info_bottleneck_big_code",
"MetricThreshold": "tma_info_bottleneck_big_code > 20",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
- "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + cpu_core@BR_INST_RETIRED.NEAR_CALL@) / tma_info_thread_slots)",
- "MetricGroup": "Ret",
+ "BriefDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA",
+ "MetricExpr": "100 * ((cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@INST_RETIRED.NOP@) / tma_info_thread_slots)",
+ "MetricGroup": "BvBO;Ret",
"MetricName": "tma_info_bottleneck_branching_overhead",
"MetricThreshold": "tma_info_bottleneck_branching_overhead > 5",
+ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)",
"Unit": "cpu_core"
},
{
"BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
- "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
- "MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
+ "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+ "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW",
"MetricName": "tma_info_bottleneck_cache_memory_bandwidth",
"MetricThreshold": "tma_info_bottleneck_cache_memory_bandwidth > 20",
"PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full",
@@ -1289,8 +1199,8 @@
},
{
"BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
- "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
- "MetricGroup": "Mem;MemoryLat;Offcore;tma_issueLat",
+ "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_hit_latency / (tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+ "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat",
"MetricName": "tma_info_bottleneck_cache_memory_latency",
"MetricThreshold": "tma_info_bottleneck_cache_memory_latency > 20",
"PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency",
@@ -1299,16 +1209,16 @@
{
"BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
"MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
- "MetricGroup": "Cor;tma_issueComp",
+ "MetricGroup": "BvCB;Cor;tma_issueComp",
"MetricName": "tma_info_bottleneck_compute_bound_est",
"MetricThreshold": "tma_info_bottleneck_compute_bound_est > 20",
"PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
+ "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)",
"MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))) - tma_info_bottleneck_big_code",
- "MetricGroup": "Fed;FetchBW;Frontend",
+ "MetricGroup": "BvFB;Fed;FetchBW;Frontend",
"MetricName": "tma_info_bottleneck_instruction_fetch_bw",
"MetricThreshold": "tma_info_bottleneck_instruction_fetch_bw > 20",
"Unit": "cpu_core"
@@ -1316,7 +1226,7 @@
{
"BriefDescription": "Total pipeline cost of irregular execution (e.g",
"MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY\\,umask\\=1@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
- "MetricGroup": "Bad;Cor;Ret;tma_issueMS",
+ "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS",
"MetricName": "tma_info_bottleneck_irregular_overhead",
"MetricThreshold": "tma_info_bottleneck_irregular_overhead > 10",
"PublicDescription": "Total pipeline cost of irregular execution (e.g. FP-assists in HPC, Wait time with work imbalance multithreaded workloads, overhead in system services or virtualized environments). Related metrics: tma_microcode_sequencer, tma_ms_switches",
@@ -1324,8 +1234,8 @@
},
{
"BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
- "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
- "MetricGroup": "Mem;MemoryTLB;Offcore;tma_issueTLB",
+ "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_l1_hit_latency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+ "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB",
"MetricName": "tma_info_bottleneck_memory_data_tlbs",
"MetricThreshold": "tma_info_bottleneck_memory_data_tlbs > 20",
"PublicDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_synchronization",
@@ -1334,7 +1244,7 @@
{
"BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)",
"MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))",
- "MetricGroup": "Mem;Offcore;tma_issueTLB",
+ "MetricGroup": "BvMS;Mem;Offcore;tma_issueTLB",
"MetricName": "tma_info_bottleneck_memory_synchronization",
"MetricThreshold": "tma_info_bottleneck_memory_synchronization > 10",
"PublicDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors). Related metrics: tma_dtlb_load, tma_dtlb_store, tma_info_bottleneck_memory_data_tlbs",
@@ -1343,45 +1253,53 @@
{
"BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
"MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
- "MetricGroup": "Bad;BadSpec;BrMispredicts;tma_issueBM",
+ "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM",
"MetricName": "tma_info_bottleneck_mispredictions",
"MetricThreshold": "tma_info_bottleneck_mispredictions > 20",
"PublicDescription": "Total pipeline cost of Branch Misprediction related bottlenecks. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class)",
- "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_base_non_br)",
- "MetricGroup": "Cor;Offcore",
+ "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end",
+ "MetricExpr": "100 - (tma_info_bottleneck_big_code + tma_info_bottleneck_instruction_fetch_bw + tma_info_bottleneck_mispredictions + tma_info_bottleneck_cache_memory_bandwidth + tma_info_bottleneck_cache_memory_latency + tma_info_bottleneck_memory_data_tlbs + tma_info_bottleneck_memory_synchronization + tma_info_bottleneck_compute_bound_est + tma_info_bottleneck_irregular_overhead + tma_info_bottleneck_branching_overhead + tma_info_bottleneck_useful_work)",
+ "MetricGroup": "BvOB;Cor;Offcore",
"MetricName": "tma_info_bottleneck_other_bottlenecks",
"MetricThreshold": "tma_info_bottleneck_other_bottlenecks > 20",
- "PublicDescription": "Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.",
+ "PublicDescription": "Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.",
+ "MetricExpr": "100 * (tma_retiring - (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ + 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@INST_RETIRED.NOP@) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+ "MetricGroup": "BvUW;Ret",
+ "MetricName": "tma_info_bottleneck_useful_work",
+ "MetricThreshold": "tma_info_bottleneck_useful_work > 20",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fraction of branches that are CALL or RET",
- "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / BR_INST_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_CALL@ + cpu_core@BR_INST_RETIRED.NEAR_RETURN@) / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@",
"MetricGroup": "Bad;Branches",
"MetricName": "tma_info_branches_callret",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fraction of branches that are non-taken conditionals",
- "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_NTAKEN@ / BR_INST_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_NTAKEN@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@",
"MetricGroup": "Bad;Branches;CodeGen;PGO",
"MetricName": "tma_info_branches_cond_nt",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fraction of branches that are taken conditionals",
- "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_TAKEN@ / BR_INST_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "cpu_core@BR_INST_RETIRED.COND_TAKEN@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@",
"MetricGroup": "Bad;Branches;CodeGen;PGO",
"MetricName": "tma_info_branches_cond_tk",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fraction of branches that are unconditional (direct or indirect) jumps",
- "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / BR_INST_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "(cpu_core@BR_INST_RETIRED.NEAR_TAKEN@ - cpu_core@BR_INST_RETIRED.COND_TAKEN@ - 2 * cpu_core@BR_INST_RETIRED.NEAR_CALL@) / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@",
"MetricGroup": "Bad;Branches",
"MetricName": "tma_info_branches_jump",
"Unit": "cpu_core"
@@ -1442,7 +1360,7 @@
"MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
"MetricName": "tma_info_frontend_dsb_coverage",
"MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 6 > 0.35",
- "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp",
+ "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_inst_mix_iptb, tma_lcp",
"Unit": "cpu_core"
},
{
@@ -1468,7 +1386,7 @@
},
{
"BriefDescription": "Instructions per non-speculative DSB miss (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / FRONTEND_RETIRED.ANY_DSB_MISS",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FRONTEND_RETIRED.ANY_DSB_MISS@",
"MetricGroup": "DSBmiss;Fed",
"MetricName": "tma_info_frontend_ipdsb_miss_ret",
"MetricThreshold": "tma_info_frontend_ipdsb_miss_ret < 50",
@@ -1476,21 +1394,21 @@
},
{
"BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
- "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY",
+ "MetricExpr": "tma_info_inst_mix_instructions / cpu_core@BACLEARS.ANY@",
"MetricGroup": "Fed",
"MetricName": "tma_info_frontend_ipunknown_branch",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache true code cacheline misses per kilo instruction",
- "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@FRONTEND_RETIRED.L2_MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "IcMiss",
"MetricName": "tma_info_frontend_l2mpki_code",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache speculative code cacheline misses per kilo instruction",
- "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@L2_RQSTS.CODE_RD_MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "IcMiss",
"MetricName": "tma_info_frontend_l2mpki_code_all",
"Unit": "cpu_core"
@@ -1512,7 +1430,7 @@
},
{
"BriefDescription": "Branch instructions per taken branch.",
- "MetricExpr": "cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ / BR_INST_RETIRED.NEAR_TAKEN",
+ "MetricExpr": "cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@",
"MetricGroup": "Branches;Fed;PGO",
"MetricName": "tma_info_inst_mix_bptkbranch",
"Unit": "cpu_core"
@@ -1527,7 +1445,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0x3c@)",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@)",
"MetricGroup": "Flops;InsType",
"MetricName": "tma_info_inst_mix_iparith",
"MetricThreshold": "tma_info_inst_mix_iparith < 10",
@@ -1554,7 +1472,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@",
"MetricGroup": "Flops;FpScalar;InsType",
"MetricName": "tma_info_inst_mix_iparith_scalar_dp",
"MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
@@ -1563,7 +1481,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@",
"MetricGroup": "Flops;FpScalar;InsType",
"MetricName": "tma_info_inst_mix_iparith_scalar_sp",
"MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
@@ -1572,7 +1490,7 @@
},
{
"BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_INST_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.ALL_BRANCHES@",
"MetricGroup": "Branches;Fed;InsType",
"MetricName": "tma_info_inst_mix_ipbranch",
"MetricThreshold": "tma_info_inst_mix_ipbranch < 8",
@@ -1580,7 +1498,7 @@
},
{
"BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_INST_RETIRED.NEAR_CALL",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.NEAR_CALL@",
"MetricGroup": "Branches;Fed;PGO",
"MetricName": "tma_info_inst_mix_ipcall",
"MetricThreshold": "tma_info_inst_mix_ipcall < 200",
@@ -1596,7 +1514,7 @@
},
{
"BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / MEM_INST_RETIRED.ALL_LOADS",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@MEM_INST_RETIRED.ALL_LOADS@",
"MetricGroup": "InsType",
"MetricName": "tma_info_inst_mix_ipload",
"MetricThreshold": "tma_info_inst_mix_ipload < 3",
@@ -1604,14 +1522,14 @@
},
{
"BriefDescription": "Instructions per PAUSE (lower number means higher occurrence rate)",
- "MetricExpr": "tma_info_inst_mix_instructions / CPU_CLK_UNHALTED.PAUSE_INST",
+ "MetricExpr": "tma_info_inst_mix_instructions / cpu_core@CPU_CLK_UNHALTED.PAUSE_INST@",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "tma_info_inst_mix_ippause",
"Unit": "cpu_core"
},
{
"BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / MEM_INST_RETIRED.ALL_STORES",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@",
"MetricGroup": "InsType",
"MetricName": "tma_info_inst_mix_ipstore",
"MetricThreshold": "tma_info_inst_mix_ipstore < 8",
@@ -1626,12 +1544,12 @@
"Unit": "cpu_core"
},
{
- "BriefDescription": "Instruction per taken branch",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / BR_INST_RETIRED.NEAR_TAKEN",
+ "BriefDescription": "Instructions per taken branch",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@",
"MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB",
"MetricName": "tma_info_inst_mix_iptb",
"MetricThreshold": "tma_info_inst_mix_iptb < 13",
- "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp",
+ "PublicDescription": "Instructions per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_lcp",
"Unit": "cpu_core"
},
{
@@ -1664,13 +1582,13 @@
},
{
"BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
- "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem",
"MetricName": "tma_info_memory_fb_hpki",
"Unit": "cpu_core"
},
{
- "BriefDescription": "",
+ "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
"MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "tma_info_memory_l1d_cache_fill_bw",
@@ -1678,20 +1596,20 @@
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem",
"MetricName": "tma_info_memory_l1mpki",
"Unit": "cpu_core"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
- "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@L2_RQSTS.ALL_DEMAND_DATA_RD@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem",
"MetricName": "tma_info_memory_l1mpki_load",
"Unit": "cpu_core"
},
{
- "BriefDescription": "",
+ "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
"MetricExpr": "64 * cpu_core@L2_LINES_IN.ALL@ / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "tma_info_memory_l2_cache_fill_bw",
@@ -1699,48 +1617,55 @@
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
- "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * (cpu_core@L2_RQSTS.REFERENCES@ - cpu_core@L2_RQSTS.MISS@) / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem",
"MetricName": "tma_info_memory_l2hpki_all",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
- "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_HIT@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem",
"MetricName": "tma_info_memory_l2hpki_load",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L2_MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Backend;CacheHits;Mem",
"MetricName": "tma_info_memory_l2mpki",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
- "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@L2_RQSTS.MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem;Offcore",
"MetricName": "tma_info_memory_l2mpki_all",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
- "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@L2_RQSTS.DEMAND_DATA_RD_MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "CacheHits;Mem",
"MetricName": "tma_info_memory_l2mpki_load",
"Unit": "cpu_core"
},
{
- "BriefDescription": "",
+ "BriefDescription": "Offcore requests (L2 cache miss) per kilo instruction for demand RFOs",
+ "MetricExpr": "1e3 * cpu_core@L2_RQSTS.RFO_MISS@ / cpu_core@INST_RETIRED.ANY@",
+ "MetricGroup": "CacheMisses;Offcore",
+ "MetricName": "tma_info_memory_l2mpki_rfo",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
"MetricExpr": "64 * cpu_core@OFFCORE_REQUESTS.ALL_REQUESTS@ / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW;Offcore",
"MetricName": "tma_info_memory_l3_cache_access_bw",
"Unit": "cpu_core"
},
{
- "BriefDescription": "",
+ "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
"MetricExpr": "64 * cpu_core@LONGEST_LAT_CACHE.MISS@ / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "tma_info_memory_l3_cache_fill_bw",
@@ -1748,21 +1673,21 @@
},
{
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L3_MISS@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Mem",
"MetricName": "tma_info_memory_l3mpki",
"Unit": "cpu_core"
},
{
"BriefDescription": "Average Parallel L2 cache miss data reads",
- "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD@ / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+ "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DATA_RD@ / cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@",
"MetricGroup": "Memory_BW;Offcore",
"MetricName": "tma_info_memory_latency_data_l2_mlp",
"Unit": "cpu_core"
},
{
"BriefDescription": "Average Latency for L2 cache miss demand Loads",
- "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / OFFCORE_REQUESTS.DEMAND_DATA_RD",
+ "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS.DEMAND_DATA_RD@",
"MetricGroup": "Memory_Lat;Offcore",
"MetricName": "tma_info_memory_latency_load_l2_miss_latency",
"Unit": "cpu_core"
@@ -1776,35 +1701,35 @@
},
{
"BriefDescription": "Average Latency for L3 cache miss demand Loads",
- "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD@ / OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
+ "MetricExpr": "cpu_core@OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD@ / cpu_core@OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD@",
"MetricGroup": "Memory_Lat;Offcore",
"MetricName": "tma_info_memory_latency_load_l3_miss_latency",
"Unit": "cpu_core"
},
{
"BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
- "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / MEM_LOAD_COMPLETED.L1_MISS_ANY",
+ "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / cpu_core@MEM_LOAD_COMPLETED.L1_MISS_ANY@",
"MetricGroup": "Mem;MemoryBound;MemoryLat",
"MetricName": "tma_info_memory_load_miss_real_latency",
"Unit": "cpu_core"
},
{
"BriefDescription": "\"Bus lock\" per kilo instruction",
- "MetricExpr": "1e3 * cpu_core@SQ_MISC.BUS_LOCK@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@SQ_MISC.BUS_LOCK@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Mem",
"MetricName": "tma_info_memory_mix_bus_lock_pki",
"Unit": "cpu_core"
},
{
"BriefDescription": "Un-cacheable retired load per kilo instruction",
- "MetricExpr": "1e3 * cpu_core@MEM_LOAD_MISC_RETIRED.UC@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@MEM_LOAD_MISC_RETIRED.UC@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Mem",
"MetricName": "tma_info_memory_mix_uc_load_pki",
"Unit": "cpu_core"
},
{
"BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
- "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / L1D_PEND_MISS.PENDING_CYCLES",
+ "MetricExpr": "cpu_core@L1D_PEND_MISS.PENDING@ / cpu_core@L1D_PEND_MISS.PENDING_CYCLES@",
"MetricGroup": "Mem;MemoryBW;MemoryBound",
"MetricName": "tma_info_memory_mlp",
"PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
@@ -1812,14 +1737,14 @@
},
{
"BriefDescription": "STLB (2nd level TLB) code speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
- "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@ITLB_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Fed;MemoryTLB",
"MetricName": "tma_info_memory_tlb_code_stlb_mpki",
"Unit": "cpu_core"
},
{
"BriefDescription": "STLB (2nd level TLB) data load speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
- "MetricExpr": "1e3 * cpu_core@DTLB_LOAD_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@DTLB_LOAD_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Mem;MemoryTLB",
"MetricName": "tma_info_memory_tlb_load_stlb_mpki",
"Unit": "cpu_core"
@@ -1834,21 +1759,42 @@
},
{
"BriefDescription": "STLB (2nd level TLB) data store speculative misses per kilo instruction (misses of any page-size that complete the page walk)",
- "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / INST_RETIRED.ANY",
+ "MetricExpr": "1e3 * cpu_core@DTLB_STORE_MISSES.WALK_COMPLETED@ / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Mem;MemoryTLB",
"MetricName": "tma_info_memory_tlb_store_stlb_mpki",
"Unit": "cpu_core"
},
{
- "BriefDescription": "",
+ "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per core",
"MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)",
"MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
"MetricName": "tma_info_pipeline_execute",
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Average number of uops fetched from DSB per cycle",
+ "MetricExpr": "cpu_core@IDQ.DSB_UOPS@ / cpu_core@IDQ.DSB_CYCLES_ANY@",
+ "MetricGroup": "Fed;FetchBW",
+ "MetricName": "tma_info_pipeline_fetch_dsb",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Average number of uops fetched from LSD per cycle",
+ "MetricExpr": "cpu_core@LSD.UOPS@ / cpu_core@LSD.CYCLES_ACTIVE@",
+ "MetricGroup": "Fed;FetchBW",
+ "MetricName": "tma_info_pipeline_fetch_lsd",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Average number of uops fetched from MITE per cycle",
+ "MetricExpr": "cpu_core@IDQ.MITE_UOPS@ / cpu_core@IDQ.MITE_CYCLES_ANY@",
+ "MetricGroup": "Fed;FetchBW",
+ "MetricName": "tma_info_pipeline_fetch_mite",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Instructions per a microcode Assist invocation",
- "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / ASSISTS.ANY",
+ "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY@",
"MetricGroup": "MicroSeq;Pipeline;Ret;Retire",
"MetricName": "tma_info_pipeline_ipassist",
"MetricThreshold": "tma_info_pipeline_ipassist < 100e3",
@@ -1887,14 +1833,14 @@
},
{
"BriefDescription": "Average CPU Utilization (percentage)",
- "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC",
+ "MetricExpr": "tma_info_system_cpus_utilized / #num_cpus_online",
"MetricGroup": "HPC;Summary",
"MetricName": "tma_info_system_cpu_utilization",
"Unit": "cpu_core"
},
{
"BriefDescription": "Average number of utilized CPUs",
- "MetricExpr": "#num_cpus_online * tma_info_system_cpu_utilization",
+ "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC",
"MetricGroup": "Summary",
"MetricName": "tma_info_system_cpus_utilized",
"Unit": "cpu_core"
@@ -1925,14 +1871,14 @@
},
{
"BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / cpu_core@INST_RETIRED.ANY_P@k",
+ "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD_P@k / cpu_core@INST_RETIRED.ANY_P@k",
"MetricGroup": "OS",
"MetricName": "tma_info_system_kernel_cpi",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD_P@k / cpu_core@CPU_CLK_UNHALTED.THREAD@",
"MetricGroup": "OS",
"MetricName": "tma_info_system_kernel_utilization",
"MetricThreshold": "tma_info_system_kernel_utilization > 0.05",
@@ -1971,7 +1917,7 @@
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "tma_info_thread_clks / cpu_core@CPU_CLK_UNHALTED.REF_TSC@",
"MetricGroup": "Power",
"MetricName": "tma_info_system_turbo_utilization",
"Unit": "cpu_core"
@@ -1992,7 +1938,7 @@
},
{
"BriefDescription": "The ratio of Executed- by Issued-Uops",
- "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / UOPS_ISSUED.ANY",
+ "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / cpu_core@UOPS_ISSUED.ANY@",
"MetricGroup": "Cor;Pipeline",
"MetricName": "tma_info_thread_execute_per_issue",
"PublicDescription": "The ratio of Executed- by Issued-Uops. Ratio > 1 suggests high rate of uop micro-fusions. Ratio < 1 suggest high rate of \"execute\" at rename stage.",
@@ -2021,15 +1967,15 @@
},
{
"BriefDescription": "Uops Per Instruction",
- "MetricExpr": "tma_retiring * tma_info_thread_slots / INST_RETIRED.ANY",
+ "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@INST_RETIRED.ANY@",
"MetricGroup": "Pipeline;Ret;Retire",
"MetricName": "tma_info_thread_uoppi",
"MetricThreshold": "tma_info_thread_uoppi > 1.05",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Instruction per taken branch",
- "MetricExpr": "tma_retiring * tma_info_thread_slots / BR_INST_RETIRED.NEAR_TAKEN",
+ "BriefDescription": "Uops per taken branch",
+ "MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@BR_INST_RETIRED.NEAR_TAKEN@",
"MetricGroup": "Branches;Fed;FetchBW",
"MetricName": "tma_info_thread_uptb",
"MetricThreshold": "tma_info_thread_uptb < 9",
@@ -2068,7 +2014,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
"MetricExpr": "cpu_core@ICACHE_TAG.STALLS@ / tma_info_thread_clks",
- "MetricGroup": "BigFootprint;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
+ "MetricGroup": "BigFootprint;BvBC;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_itlb_misses",
"MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
@@ -2086,9 +2032,19 @@
"Unit": "cpu_core"
},
{
+ "BriefDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache",
+ "MetricExpr": "min(2 * (cpu_core@MEM_INST_RETIRED.ALL_LOADS@ - cpu_core@MEM_LOAD_RETIRED.FB_HIT@ - cpu_core@MEM_LOAD_RETIRED.L1_MISS@) * 20 / 100, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks",
+ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group",
+ "MetricName": "tma_l1_hit_latency",
+ "MetricThreshold": "tma_l1_hit_latency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
+ "PublicDescription": "This metric roughly estimates fraction of cycles with demand load accesses that hit the L1 cache. The short latency of the L1 data cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
"MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L1D_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@) / tma_info_thread_clks",
- "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
+ "MetricGroup": "BvML;CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l2_bound",
"MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
"PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
@@ -2108,7 +2064,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
"MetricExpr": "9 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2)) / tma_info_thread_clks",
- "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
+ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
"MetricName": "tma_l3_hit_latency",
"MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_info_bottleneck_cache_memory_latency, tma_mem_latency",
@@ -2121,7 +2077,7 @@
"MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
"MetricName": "tma_lcp",
"MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
- "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_botlnk_l2_dsb_bandwidth, tma_info_botlnk_l2_dsb_misses, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -2170,7 +2126,7 @@
"MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
"MetricName": "tma_lock_latency",
"MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS. Related metrics: tma_store_latency",
+ "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS. Related metrics: tma_store_latency",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -2187,7 +2143,7 @@
{
"BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
"MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)",
- "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
+ "MetricGroup": "BadSpec;BvMS;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
@@ -2198,7 +2154,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM)",
"MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / tma_info_thread_clks",
- "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
+ "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
"MetricName": "tma_mem_bandwidth",
"MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_sq_full",
@@ -2208,7 +2164,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM)",
"MetricExpr": "min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD@) / tma_info_thread_clks - tma_mem_bandwidth",
- "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
+ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
"MetricName": "tma_mem_latency",
"MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_info_bottleneck_cache_memory_latency, tma_l3_hit_latency",
@@ -2258,7 +2214,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
"MetricExpr": "tma_branch_mispredicts / tma_bad_speculation * cpu_core@INT_MISC.CLEAR_RESTEER_CYCLES@ / tma_info_thread_clks",
- "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM",
+ "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM",
"MetricName": "tma_mispredicts_resteers",
"MetricThreshold": "tma_mispredicts_resteers > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES. Related metrics: tma_branch_mispredicts, tma_info_bad_spec_branch_misprediction_cost, tma_info_bottleneck_mispredictions",
@@ -2298,7 +2254,7 @@
{
"BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
"MetricExpr": "tma_light_operations * (cpu_core@BR_INST_RETIRED.ALL_BRANCHES@ - cpu_core@INST_RETIRED.MACRO_FUSED@) / (tma_retiring * tma_info_thread_slots)",
- "MetricGroup": "Branches;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
+ "MetricGroup": "Branches;BvBO;Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
"MetricName": "tma_non_fused_branches",
"MetricThreshold": "tma_non_fused_branches > 0.1 & tma_light_operations > 0.6",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
@@ -2308,7 +2264,7 @@
{
"BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
"MetricExpr": "tma_light_operations * cpu_core@INST_RETIRED.NOP@ / (tma_retiring * tma_info_thread_slots)",
- "MetricGroup": "Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
+ "MetricGroup": "BvBO;Pipeline;TopdownL4;tma_L4_group;tma_other_light_ops_group",
"MetricName": "tma_nop_instructions",
"MetricThreshold": "tma_nop_instructions > 0.1 & (tma_other_light_ops > 0.3 & tma_light_operations > 0.6)",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
@@ -2328,7 +2284,7 @@
{
"BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).",
"MetricExpr": "max(tma_branch_mispredicts * (1 - cpu_core@BR_MISP_RETIRED.ALL_BRANCHES@ / (cpu_core@INT_MISC.CLEARS_COUNT@ - cpu_core@MACHINE_CLEARS.COUNT@)), 0.0001)",
- "MetricGroup": "BrMispredicts;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
+ "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group",
"MetricName": "tma_other_mispredicts",
"MetricThreshold": "tma_other_mispredicts > 0.05 & (tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15)",
"ScaleUnit": "100%",
@@ -2337,7 +2293,7 @@
{
"BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.",
"MetricExpr": "max(tma_machine_clears * (1 - cpu_core@MACHINE_CLEARS.MEMORY_ORDERING@ / cpu_core@MACHINE_CLEARS.COUNT@), 0.0001)",
- "MetricGroup": "Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
+ "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group",
"MetricName": "tma_other_nukes",
"MetricThreshold": "tma_other_nukes > 0.05 & (tma_machine_clears > 0.1 & tma_bad_speculation > 0.15)",
"ScaleUnit": "100%",
@@ -2395,7 +2351,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "(cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + cpu_core@RS.EMPTY\\,umask\\=1@) / tma_info_thread_clks * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks",
+ "MetricExpr": "(cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ + max(cpu_core@RS.EMPTY\\,umask\\=1@ - cpu_core@RESOURCE_STALLS.SCOREBOARD@, 0)) / tma_info_thread_clks * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_0",
"MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -2428,7 +2384,7 @@
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks",
- "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
+ "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_3m",
"MetricThreshold": "tma_ports_utilized_3m > 0.4 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
@@ -2439,7 +2395,7 @@
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
"DefaultMetricgroupName": "TopdownL1",
"MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots",
- "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
+ "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1;Default",
@@ -2450,7 +2406,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
"MetricExpr": "cpu_core@RESOURCE_STALLS.SCOREBOARD@ / tma_info_thread_clks + tma_c02_wait",
- "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
+ "MetricGroup": "BvIO;PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO",
"MetricName": "tma_serializing_operation",
"MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
"PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches",
@@ -2501,7 +2457,7 @@
{
"BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
"MetricExpr": "(cpu_core@XQ.FULL_CYCLES@ + cpu_core@L1D_PEND_MISS.L2_STALLS@) / tma_info_thread_clks",
- "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
+ "MetricGroup": "BvMS;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
"MetricName": "tma_sq_full",
"MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth",
@@ -2531,7 +2487,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
"MetricExpr": "(cpu_core@MEM_STORE_RETIRED.L2_HIT@ * 10 * (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) + (1 - cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@) * min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@)) / tma_info_thread_clks",
- "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
+ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
"MetricName": "tma_store_latency",
"MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
"PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full). Related metrics: tma_fb_full, tma_lock_latency",
@@ -2579,7 +2535,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
"MetricExpr": "cpu_core@INT_MISC.UNKNOWN_BRANCH_CYCLES@ / tma_info_thread_clks",
- "MetricGroup": "BigFootprint;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
+ "MetricGroup": "BigFootprint;BvBC;FetchLat;TopdownL4;tma_L4_group;tma_branch_resteers_group",
"MetricName": "tma_unknown_branches",
"MetricThreshold": "tma_unknown_branches > 0.05 & (tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15))",
"PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (e.g. first time the branch is fetched or hitting BTB capacity limit) hence called Unknown Branches. Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
@@ -2588,7 +2544,7 @@
},
{
"BriefDescription": "This metric serves as an approximation of legacy x87 usage",
- "MetricExpr": "tma_retiring * cpu_core@UOPS_EXECUTED.X87@ / UOPS_EXECUTED.THREAD",
+ "MetricExpr": "tma_retiring * cpu_core@UOPS_EXECUTED.X87@ / cpu_core@UOPS_EXECUTED.THREAD@",
"MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
"MetricName": "tma_x87_use",
"MetricThreshold": "tma_x87_use > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",