Fixing missing counters for gfx900 (#1028)

Commit fa91169 · ROCm · Aug 21, 2024
1 parent: 439025d
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 35 deletions.
diff --git a/source/lib/rocprofiler-sdk/counters/xml/derived_counters.xml b/source/lib/rocprofiler-sdk/counters/xml/derived_counters.xml
@@ -132,6 +132,9 @@
   <metric name="LDSBankConflict" expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."></metric>
 </gfx9>
 
+<gfx900 base="gfx9">
+</gfx900>
+
 <gfx906 base="gfx9">
   # EA1
   <metric name="TCC_EA1_RDREQ_32B_sum" expr=reduce(TCC_EA1_RDREQ_32B,sum) descr="Number of 32-byte TCC/EA read requests. Sum over TCC EA1s."></metric>

diff --git a/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml b/source/lib/rocprofiler-sdk/counters/yaml/counter_defs.yaml
@@ -220,27 +220,27 @@ FETCH_SIZE:
       expression: (TCC_MC_RDREQ_sum*32)/1024
     gfx906:
       expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024
-    gfx908/gfx90a/gfx9:
+    gfx908/gfx90a/gfx9/gfx900:
       expression: (TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024
     gfx942/gfx941/gfx940:
       expression: (TCC_EA0_RDREQ_32B_sum*32+(TCC_EA0_RDREQ_sum-TCC_EA0_RDREQ_32B_sum)*64)/1024
   description: The total kilobytes fetched from the video memory. This is measured with all extra fetches
     and any cache or memory effects taken into account.
 FetchSize:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: FETCH_SIZE
   description: The total kilobytes fetched from the video memory. This is measured with all extra fetches
     and any cache or memory effects taken into account.
 FlatLDSInsts:
   architectures:
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
   description: The average number of FLAT instructions that read or write to LDS executed per work item
     (affected by flow control).
 FlatVMemInsts:
   architectures:
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: (SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
   description: The average number of FLAT instructions that read from or write to the video memory executed
     per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch.
@@ -373,12 +373,12 @@ GL2C_WRREQ_STALL_max:
   description: Number of cycles a write request was stalled. Max over GL2C instances.
 GPUBusy:
   architectures:
-    gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx8/gfx90a/gfx9:
+    gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*GRBM_GUI_ACTIVE/GRBM_COUNT
   description: The percentage of time GPU was busy.
 GPU_UTIL:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9:
+    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: 100*GRBM_GUI_ACTIVE/GRBM_COUNT
   description: Percentage of the time that GUI is active
 # Block GRBM (Graphics Register Bus Manager Block)
@@ -473,7 +473,7 @@ L2CacheHit:
   architectures:
     gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101:
       expression: 100*reduce(GL2C_HIT,sum)/(reduce(GL2C_HIT,sum)+reduce(GL2C_MISS,sum))
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*reduce(TCC_HIT,sum)/(reduce(TCC_HIT,sum)+reduce(TCC_MISS,sum))
   description: 'The percentage of fetch, write, atomic, and other instructions that hit the data in L2
     cache. Value range: 0% (no hit) to 100% (optimal).'
@@ -491,13 +491,13 @@ LDSBankConflict:
   architectures:
     gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101:
       expression: 100*SQC_LDS_BANK_CONFLICT/SQC_LDS_IDX_ACTIVE
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
   description: 'The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal)
     to 100% (bad).'
 LDSInsts:
   architectures:
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: (SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
   description: The average number of LDS read or LDS write instructions executed per work item (affected
     by flow control).  Excludes FLAT instructions that read from or write to LDS.
@@ -553,13 +553,13 @@ MemUnitStalled:
   architectures:
     gfx8:
       expression: 100*reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max)/GRBM_GUI_ACTIVE/SE_NUM
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: 100*TCP_TCP_TA_DATA_STALL_CYCLES_max/GRBM_GUI_ACTIVE/SE_NUM
   description: 'The percentage of GPUTime the memory unit is stalled. Try reducing the number or size
     of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).'
 MemWrites32B:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: WRITE_REQ_32B
   description: The total number of effective 32B write transactions to the memory
 MfmaFlops:
@@ -599,7 +599,7 @@ RDATA1_SIZE:
   description: The total kilobytes fetched from the video memory. This is measured on EA1s.
 SALUBusy:
   architectures:
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
     gfx942/gfx941/gfx940:
       expression: 100*reduce(SQ_INST_CYCLES_SALU,sum)*4/SIMD_NUM/reduce(GRBM_GUI_ACTIVE,sum)
@@ -2031,7 +2031,7 @@ SQ_WAVES_SAVED:
     space). Returns one value per-SE (aggregates of SIMD values).
 SQ_WAVES_sum:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(SQ_WAVES,sum)
   description: Gives the total number of waves currently enqueued by the application during the collection
     timeframe (for dispatch profiling this is the timeframe of kernel execution, for agent profiling it
@@ -2223,17 +2223,17 @@ TA_BUFFER_WRITE_WAVEFRONTS_sum:
   description: Number of buffer write wavefronts processed by TA. Sum over TA instances.
 TA_BUSY_avr:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TA_TA_BUSY,avr)
   description: TA block is busy. Average over TA instances.
 TA_BUSY_max:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TA_TA_BUSY,max)
   description: TA block is busy. Max over TA instances.
 TA_BUSY_min:
   architectures:
-    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx906/gfx1100/gfx1101/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TA_TA_BUSY,min)
   description: TA block is busy. Min over TA instances.
 TA_DATA_STALLED_BY_TC_CYCLES:
@@ -2288,7 +2288,7 @@ TA_FLAT_READ_WAVEFRONTS:
   description: Number of flat opcode reads processed by the TA.
 TA_FLAT_READ_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TA_FLAT_READ_WAVEFRONTS,sum)
   description: Number of flat opcode reads processed by the TA. Sum over TA instances.
 TA_FLAT_STORE_WAVEFRONTS:
@@ -2328,7 +2328,7 @@ TA_FLAT_WRITE_WAVEFRONTS:
   description: Number of flat opcode writes processed by the TA.
 TA_FLAT_WRITE_WAVEFRONTS_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TA_FLAT_WRITE_WAVEFRONTS,sum)
   description: Number of flat opcode writes processed by the TA. Sum over TA instances.
 TA_TA_BUSY:
@@ -2808,7 +2808,7 @@ TCC_EA_RDREQ_32B:
   description: Number of 32-byte TCC/EA read requests
 TCC_EA_RDREQ_32B_sum:
   architectures:
-    gfx906/gfx908/gfx90a/gfx9:
+    gfx906/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_EA_RDREQ_32B,sum)
   description: Number of 32-byte TCC/EA read requests. Sum over TCC instances.
 TCC_EA_RDREQ_DRAM:
@@ -2878,7 +2878,7 @@ TCC_EA_RDREQ_LEVEL_sum:
     Sum over TCC instances.
 TCC_EA_RDREQ_sum:
   architectures:
-    gfx906/gfx908/gfx90a/gfx9:
+    gfx906/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_EA_RDREQ,sum)
   description: Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances.
 TCC_EA_RD_UNCACHED_32B:
@@ -2916,7 +2916,7 @@ TCC_EA_WRREQ_64B:
   description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
 TCC_EA_WRREQ_64B_sum:
   architectures:
-    gfx906/gfx908/gfx90a/gfx9:
+    gfx906/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_EA_WRREQ_64B,sum)
   description: Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface.
     Sum over TCC instances.
@@ -2998,7 +2998,7 @@ TCC_EA_WRREQ_STALL_sum:
   description: Number of cycles a write request was stalled. Sum over TCC instances.
 TCC_EA_WRREQ_sum:
   architectures:
-    gfx906/gfx908/gfx90a/gfx9:
+    gfx906/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_EA_WRREQ,sum)
   description: Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface.
     Sum over TCC instances.
@@ -3031,7 +3031,7 @@ TCC_HIT:
   description: Number of cache hits.
 TCC_HIT_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_HIT,sum)
   description: Number of cache hits. Sum over TCC instances.
 TCC_INTERNAL_PROBE:
@@ -3081,7 +3081,7 @@ TCC_MISS:
   description: Number of cache misses. UC reads count as misses.
 TCC_MISS_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_MISS,sum)
   description: Number of cache misses. UC reads count as misses. Sum over TCC instances.
 TCC_NC_REQ:
@@ -3269,7 +3269,7 @@ TCC_WRREQ_STALL_max:
   architectures:
     gfx8:
       expression: reduce(TCC_MC_WRREQ_STALL,max)
-    gfx906/gfx908/gfx90a/gfx9:
+    gfx906/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCC_EA_WRREQ_STALL,max)
     gfx942/gfx941/gfx940:
       expression: reduce(TCC_EA0_WRREQ_STALL,max)
@@ -3625,12 +3625,12 @@ TCP_TCP_TA_DATA_STALL_CYCLES:
   description: TCP stalls TA data interface. Now Windowed.
 TCP_TCP_TA_DATA_STALL_CYCLES_max:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,max)
   description: Maximum number of TCP stalls TA data interface.
 TCP_TCP_TA_DATA_STALL_CYCLES_sum:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx90a/gfx9/gfx900:
       expression: reduce(TCP_TCP_TA_DATA_STALL_CYCLES,sum)
   description: Total number of TCP stalls TA data interface.
 TCP_TCR_TCP_STALL_CYCLES:
@@ -3965,7 +3965,7 @@ TcUtil:
   description: 'Unit: percent'
 VALUBusy:
   architectures:
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
     gfx942/gfx941/gfx940:
       expression: 100*reduce(SQ_ACTIVE_INST_VALU,sum)*4/SIMD_NUM/reduce(GRBM_GUI_ACTIVE,sum)
@@ -3979,20 +3979,20 @@ VALUInsts:
     control).
 VALUUtilization:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE)
   description: 'The percentage of active vector ALU threads in a wave. A lower number can mean either
     more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range:
     0% (bad), 100% (ideal - no thread divergence).'
 VFetchInsts:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: (SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
   description: The average number of vector fetch instructions from the video memory executed per work-item
     (affected by flow control). Excludes FLAT instructions that fetch from video memory.
 VWriteInsts:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: (SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
   description: The average number of vector write instructions to the video memory executed per work-item
     (affected by flow control). Excludes FLAT instructions that write to video memory.
@@ -4037,7 +4037,7 @@ WRITE_REQ_32B:
       expression: TCC_MC_WRREQ_sum
     gfx906:
       expression: (TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2
-    gfx908/gfx90a/gfx9:
+    gfx908/gfx90a/gfx9/gfx900:
       expression: TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)
     gfx942/gfx941/gfx940:
       expression: TCC_EA0_WRREQ_64B_sum*2+(TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)
@@ -4048,7 +4048,7 @@ WRITE_SIZE:
       expression: (TCC_MC_WRREQ_sum*32)/1024
     gfx906:
       expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024
-    gfx908/gfx90a/gfx9:
+    gfx908/gfx90a/gfx9/gfx900:
       expression: ((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024
     gfx942/gfx941/gfx940:
       expression: ((TCC_EA0_WRREQ_sum-TCC_EA0_WRREQ_64B_sum)*32+TCC_EA0_WRREQ_64B_sum*64)/1024
@@ -4086,15 +4086,15 @@ Wavefronts:
   description: Total wavefronts.
 WriteSize:
   architectures:
-    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9:
+    gfx942/gfx941/gfx906/gfx940/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: WRITE_SIZE
   description: The total kilobytes written to the video memory. This is measured with all extra fetches
     and any cache or memory effects taken into account.
 WriteUnitStalled:
   architectures:
     gfx10/gfx1010/gfx1030/gfx1031/gfx11/gfx1032/gfx1102/gfx1100/gfx1101:
       expression: 100*GL2C_WRREQ_STALL_max/GRBM_GUI_ACTIVE
-    gfx906/gfx908/gfx8/gfx90a/gfx9:
+    gfx906/gfx908/gfx8/gfx90a/gfx9/gfx900:
       expression: 100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE
   description: 'The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).'
 sL1dCacheHitRate: