Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions VX_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO
[l2cache]
VX_CFG_L2_CACHE_SIZE = 1048576
VX_CFG_L2_NUM_WAYS = 8
VX_CFG_L2_WRITEBACK = 0
VX_CFG_L2_WRITEBACK = 1
VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK"
VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo"
VX_CFG_L2_MSHR_SIZE = 16
Expand All @@ -214,7 +214,7 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N
[l3cache]
VX_CFG_L3_CACHE_SIZE = 2097152
VX_CFG_L3_NUM_WAYS = 8
VX_CFG_L3_WRITEBACK = 0
VX_CFG_L3_WRITEBACK = 1
VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK"
VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo"
VX_CFG_L3_MSHR_SIZE = 16
Expand Down
184 changes: 101 additions & 83 deletions VX_types.toml
Original file line number Diff line number Diff line change
Expand Up @@ -390,15 +390,23 @@ VX_CSR_CTA_CLUSTER_SIZE = 0xCE0
VX_CSR_CTA_ENTRY = 0xCE1 # kernel entry PC, supplied per-CTA by the KMU

[dcr_mpm_class]
VX_DCR_MPM_CLASS_BASE = 0
VX_DCR_MPM_CLASS_CORE = 1
VX_DCR_MPM_CLASS_MEM = 2
VX_DCR_MPM_CLASS_TEX = 3
VX_DCR_MPM_CLASS_RASTER = 4
VX_DCR_MPM_CLASS_OM = 5
VX_DCR_MPM_CLASS_DXA = 6
VX_DCR_MPM_CLASS_TCU = 7
VX_DCR_MPM_CLASS_VM = 8
VX_DCR_MPM_CLASS_BASE = 0
VX_DCR_MPM_CLASS_CORE = 1
VX_DCR_MPM_CLASS_RESERVED1= 2
VX_DCR_MPM_CLASS_ICACHE = 3
VX_DCR_MPM_CLASS_DCACHE = 4
VX_DCR_MPM_CLASS_L2CACHE = 5
VX_DCR_MPM_CLASS_L3CACHE = 6
VX_DCR_MPM_CLASS_MEM = 7
VX_DCR_MPM_CLASS_RESERVED2= 8
VX_DCR_MPM_CLASS_RESERVED3= 9
VX_DCR_MPM_CLASS_RESERVED4= 10
VX_DCR_MPM_CLASS_TCU = 11
VX_DCR_MPM_CLASS_RASTER = 12
VX_DCR_MPM_CLASS_TEX = 13
VX_DCR_MPM_CLASS_OM = 14
VX_DCR_MPM_CLASS_RTU = 15
VX_DCR_MPM_CLASS_DXA = 16

[csr_mpm_base]
VX_CSR_MCYCLE = 0xB00
Expand Down Expand Up @@ -476,87 +484,97 @@ VX_CSR_MPM_LOAD_LT_H = 0xB9D
VX_CSR_MPM_STORES = 0xB1E # total LSU store requests
VX_CSR_MPM_STORES_H = 0xB9E

[csr_mpm_vm]
# PERF: VM (TLB/PTW). Hardware sums icache + dcache MMU counters.
VX_CSR_MPM_TLB_READS = 0xB03 # total TLB lookups
VX_CSR_MPM_TLB_READS_H = 0xB83
VX_CSR_MPM_TLB_HITS = 0xB04 # TLB hits
VX_CSR_MPM_TLB_HITS_H = 0xB84
VX_CSR_MPM_TLB_MISSES = 0xB05 # TLB misses (triggered PTW)
VX_CSR_MPM_TLB_MISSES_H = 0xB85
VX_CSR_MPM_TLB_EVICTS = 0xB06 # TLB evictions on fill
VX_CSR_MPM_TLB_EVICTS_H = 0xB86
VX_CSR_MPM_PTW_WALKS = 0xB07 # PTW walks completed
VX_CSR_MPM_PTW_WALKS_H = 0xB87
VX_CSR_MPM_PTW_LATENCY = 0xB08 # PTW total latency cycles
VX_CSR_MPM_PTW_LATENCY_H = 0xB88
# VM/MMU counters are part of the MEM class (see [csr_mpm_mem] below).

[csr_mpm_mem]
# PERF: icache
# Each cache level is its own MPM class (re-based at 0xB03), so every level
# gets the full standard hpmcounter window with room for evictions.
[csr_mpm_icache]
VX_CSR_MPM_ICACHE_READS = 0xB03 # total reads
VX_CSR_MPM_ICACHE_READS_H = 0xB83
VX_CSR_MPM_ICACHE_MISS_R = 0xB04 # read misses
VX_CSR_MPM_ICACHE_MISS_R_H = 0xB84
VX_CSR_MPM_ICACHE_MSHR_ST = 0xB05 # MSHR stalls
VX_CSR_MPM_ICACHE_MSHR_ST_H = 0xB85
# PERF: dcache
VX_CSR_MPM_DCACHE_READS = 0xB06 # total reads
VX_CSR_MPM_DCACHE_READS_H = 0xB86
VX_CSR_MPM_DCACHE_WRITES = 0xB07 # total writes
VX_CSR_MPM_DCACHE_WRITES_H = 0xB87
VX_CSR_MPM_DCACHE_MISS_R = 0xB08 # read misses
VX_CSR_MPM_DCACHE_MISS_R_H = 0xB88
VX_CSR_MPM_DCACHE_MISS_W = 0xB09 # write misses
VX_CSR_MPM_DCACHE_MISS_W_H = 0xB89
VX_CSR_MPM_DCACHE_BANK_ST = 0xB0A # bank conflicts
VX_CSR_MPM_DCACHE_BANK_ST_H = 0xB8A
VX_CSR_MPM_DCACHE_MSHR_ST = 0xB0B # MSHR stalls
VX_CSR_MPM_DCACHE_MSHR_ST_H = 0xB8B
# PERF: l2cache
VX_CSR_MPM_L2CACHE_READS = 0xB0C # total reads
VX_CSR_MPM_L2CACHE_READS_H = 0xB8C
VX_CSR_MPM_L2CACHE_WRITES = 0xB0D # total writes
VX_CSR_MPM_L2CACHE_WRITES_H = 0xB8D
VX_CSR_MPM_L2CACHE_MISS_R = 0xB0E # read misses
VX_CSR_MPM_L2CACHE_MISS_R_H = 0xB8E
VX_CSR_MPM_L2CACHE_MISS_W = 0xB0F # write misses
VX_CSR_MPM_L2CACHE_MISS_W_H = 0xB8F
VX_CSR_MPM_L2CACHE_BANK_ST = 0xB10 # bank conflicts
VX_CSR_MPM_L2CACHE_BANK_ST_H = 0xB90
VX_CSR_MPM_L2CACHE_MSHR_ST = 0xB11 # MSHR stalls
VX_CSR_MPM_L2CACHE_MSHR_ST_H = 0xB91
# PERF: l3cache
VX_CSR_MPM_L3CACHE_READS = 0xB12 # total reads
VX_CSR_MPM_L3CACHE_READS_H = 0xB92
VX_CSR_MPM_L3CACHE_WRITES = 0xB13 # total writes
VX_CSR_MPM_L3CACHE_WRITES_H = 0xB93
VX_CSR_MPM_L3CACHE_MISS_R = 0xB14 # read misses
VX_CSR_MPM_L3CACHE_MISS_R_H = 0xB94
VX_CSR_MPM_L3CACHE_MISS_W = 0xB15 # write misses
VX_CSR_MPM_L3CACHE_MISS_W_H = 0xB95
VX_CSR_MPM_L3CACHE_BANK_ST = 0xB16 # bank conflicts
VX_CSR_MPM_L3CACHE_BANK_ST_H = 0xB96
VX_CSR_MPM_L3CACHE_MSHR_ST = 0xB17 # MSHR stalls
VX_CSR_MPM_L3CACHE_MSHR_ST_H = 0xB97
# PERF: memory
VX_CSR_MPM_MEM_READS = 0xB18 # total reads
VX_CSR_MPM_MEM_READS_H = 0xB98
VX_CSR_MPM_MEM_WRITES = 0xB19 # total writes
VX_CSR_MPM_MEM_WRITES_H = 0xB99
VX_CSR_MPM_MEM_LT = 0xB1A # memory latency
VX_CSR_MPM_MEM_LT_H = 0xB9A
VX_CSR_MPM_MEM_BANK_ST = 0xB1E # bank conflicts
VX_CSR_MPM_MEM_BANK_ST_H = 0xB9E
# PERF: lmem
VX_CSR_MPM_LMEM_READS = 0xB1B # memory reads
VX_CSR_MPM_LMEM_READS_H = 0xB9B
VX_CSR_MPM_LMEM_WRITES = 0xB1C # memory writes
VX_CSR_MPM_LMEM_WRITES_H = 0xB9C
VX_CSR_MPM_LMEM_BANK_ST = 0xB1D # bank conflicts
VX_CSR_MPM_LMEM_BANK_ST_H = 0xB9D
# PERF: coalescer
VX_CSR_MPM_COALESCER_MISS = 0xB1F # coalescer misses
VX_CSR_MPM_COALESCER_MISS_H = 0xB9F

[csr_mpm_dcache]
VX_CSR_MPM_DCACHE_READS = 0xB03 # total reads
VX_CSR_MPM_DCACHE_READS_H = 0xB83
VX_CSR_MPM_DCACHE_WRITES = 0xB04 # total writes
VX_CSR_MPM_DCACHE_WRITES_H = 0xB84
VX_CSR_MPM_DCACHE_MISS_R = 0xB05 # read misses
VX_CSR_MPM_DCACHE_MISS_R_H = 0xB85
VX_CSR_MPM_DCACHE_MISS_W = 0xB06 # write misses
VX_CSR_MPM_DCACHE_MISS_W_H = 0xB86
VX_CSR_MPM_DCACHE_EVICTS = 0xB07 # dirty-line evictions
VX_CSR_MPM_DCACHE_EVICTS_H = 0xB87
VX_CSR_MPM_DCACHE_BANK_ST = 0xB08 # bank conflicts
VX_CSR_MPM_DCACHE_BANK_ST_H = 0xB88
VX_CSR_MPM_DCACHE_MSHR_ST = 0xB09 # MSHR stalls
VX_CSR_MPM_DCACHE_MSHR_ST_H = 0xB89

[csr_mpm_l2cache]
VX_CSR_MPM_L2CACHE_READS = 0xB03 # total reads
VX_CSR_MPM_L2CACHE_READS_H = 0xB83
VX_CSR_MPM_L2CACHE_WRITES = 0xB04 # total writes
VX_CSR_MPM_L2CACHE_WRITES_H = 0xB84
VX_CSR_MPM_L2CACHE_MISS_R = 0xB05 # read misses
VX_CSR_MPM_L2CACHE_MISS_R_H = 0xB85
VX_CSR_MPM_L2CACHE_MISS_W = 0xB06 # write misses
VX_CSR_MPM_L2CACHE_MISS_W_H = 0xB86
VX_CSR_MPM_L2CACHE_EVICTS = 0xB07 # dirty-line evictions
VX_CSR_MPM_L2CACHE_EVICTS_H = 0xB87
VX_CSR_MPM_L2CACHE_BANK_ST = 0xB08 # bank conflicts
VX_CSR_MPM_L2CACHE_BANK_ST_H = 0xB88
VX_CSR_MPM_L2CACHE_MSHR_ST = 0xB09 # MSHR stalls
VX_CSR_MPM_L2CACHE_MSHR_ST_H = 0xB89

[csr_mpm_l3cache]
VX_CSR_MPM_L3CACHE_READS = 0xB03 # total reads
VX_CSR_MPM_L3CACHE_READS_H = 0xB83
VX_CSR_MPM_L3CACHE_WRITES = 0xB04 # total writes
VX_CSR_MPM_L3CACHE_WRITES_H = 0xB84
VX_CSR_MPM_L3CACHE_MISS_R = 0xB05 # read misses
VX_CSR_MPM_L3CACHE_MISS_R_H = 0xB85
VX_CSR_MPM_L3CACHE_MISS_W = 0xB06 # write misses
VX_CSR_MPM_L3CACHE_MISS_W_H = 0xB86
VX_CSR_MPM_L3CACHE_EVICTS = 0xB07 # dirty-line evictions
VX_CSR_MPM_L3CACHE_EVICTS_H = 0xB87
VX_CSR_MPM_L3CACHE_BANK_ST = 0xB08 # bank conflicts
VX_CSR_MPM_L3CACHE_BANK_ST_H = 0xB88
VX_CSR_MPM_L3CACHE_MSHR_ST = 0xB09 # MSHR stalls
VX_CSR_MPM_L3CACHE_MSHR_ST_H = 0xB89

# Off-chip memory + local memory + coalescer + VM/MMU (one memory-subsystem class).
[csr_mpm_mem]
VX_CSR_MPM_MEM_READS = 0xB03 # total reads
VX_CSR_MPM_MEM_READS_H = 0xB83
VX_CSR_MPM_MEM_WRITES = 0xB04 # total writes
VX_CSR_MPM_MEM_WRITES_H = 0xB84
VX_CSR_MPM_MEM_LT = 0xB05 # memory latency
VX_CSR_MPM_MEM_LT_H = 0xB85
VX_CSR_MPM_MEM_BANK_ST = 0xB06 # bank conflicts
VX_CSR_MPM_MEM_BANK_ST_H = 0xB86
VX_CSR_MPM_LMEM_READS = 0xB07 # local memory reads
VX_CSR_MPM_LMEM_READS_H = 0xB87
VX_CSR_MPM_LMEM_WRITES = 0xB08 # local memory writes
VX_CSR_MPM_LMEM_WRITES_H = 0xB88
VX_CSR_MPM_LMEM_BANK_ST = 0xB09 # bank conflicts
VX_CSR_MPM_LMEM_BANK_ST_H = 0xB89
VX_CSR_MPM_COALESCER_MISS = 0xB0A # coalescer misses
VX_CSR_MPM_COALESCER_MISS_H = 0xB8A
# VM/MMU (per-core TLB/PTW). Hardware sums icache + dcache MMU counters.
VX_CSR_MPM_TLB_READS = 0xB0B # total TLB lookups
VX_CSR_MPM_TLB_READS_H = 0xB8B
VX_CSR_MPM_TLB_HITS = 0xB0C # TLB hits
VX_CSR_MPM_TLB_HITS_H = 0xB8C
VX_CSR_MPM_TLB_MISSES = 0xB0D # TLB misses (triggered PTW)
VX_CSR_MPM_TLB_MISSES_H = 0xB8D
VX_CSR_MPM_TLB_EVICTS = 0xB0E # TLB evictions on fill
VX_CSR_MPM_TLB_EVICTS_H = 0xB8E
VX_CSR_MPM_PTW_WALKS = 0xB0F # PTW walks completed
VX_CSR_MPM_PTW_WALKS_H = 0xB8F
VX_CSR_MPM_PTW_LATENCY = 0xB10 # PTW total latency cycles
VX_CSR_MPM_PTW_LATENCY_H = 0xB90

[csr_mpm_dxa]
# PERF: DXA copy engine (cluster-level, same value on all cores in cluster)
Expand Down
3 changes: 2 additions & 1 deletion ci/blackbox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ show_help()
echo " where"
echo "--driver: gpu, simx, rtlsim, oape, xrt"
echo "--app: any subfolder test under regression, graphics, mpi, opencl, or hip"
echo "--class: 0=disable, 1=pipeline, 2=memsys"
echo "--perf: 0=disable, 1=core, 3=icache, 4=dcache, 5=l2cache, 6=l3cache, 7=mem,"
echo " 11=tcu, 12=raster, 13=tex, 14=om, 15=rtu, 16=dxa"
echo "--nohup: build and run in temp directory"
}

Expand Down
5 changes: 3 additions & 2 deletions ci/regression.sh.in
Original file line number Diff line number Diff line change
Expand Up @@ -339,8 +339,9 @@ amo()
# fill/probe ordering that single-core runs never hit.
# 4x L1 -> 1x L2 (multi-core base configuration).
CONFIGS="-DVX_CFG_EXT_A_ENABLE" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --args=-n8 --app=amo
# 4x L2 -> 1x L3 (two-level shared hierarchy).
CONFIGS="-DVX_CFG_EXT_A_ENABLE" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --l3cache --args=-n8 --app=amo
# 4x L2 -> 1x L3 (two-level shared hierarchy). L3 is the LLC where atomics
# resolve, so L2 (above it) must be write-through for AMO correctness.
CONFIGS="-DVX_CFG_EXT_A_ENABLE -DVX_CFG_L2_WRITEBACK=0" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --l3cache --args=-n8 --app=amo

echo "amo tests done!"
}
Expand Down
6 changes: 3 additions & 3 deletions docs/designs/graphics_fixed_function_pipeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ arbiters, cores, caches, and DCR fan-out.
blend-mode/func/const, logic-op, [`:255-276`](../../VX_types.toml#L255)).
DCRs are broadcast to all cluster instances; each raster instance
self-selects its tile stripe.
- **Perf** MPM classes TEX=3, RASTER=4, OM=5
([`VX_types.toml:392-394`](../../VX_types.toml#L392)); reported via
[`legacy_perf.cpp:229-231`](../../sw/runtime/common/legacy_perf.cpp#L229).
- **Perf** MPM classes RASTER=12, TEX=13, OM=14
([`VX_types.toml:393-409`](../../VX_types.toml#L393)); reported via
[`legacy_perf.cpp`](../../sw/runtime/common/legacy_perf.cpp).
- **Counts** ([`VX_config.toml`](../../VX_config.toml)): `NUM_TEX_CORES`,
`NUM_RASTER_CORES`, `NUM_OM_CORES`, and `NUM_{TCACHES,RCACHES,OCACHES}`.

Expand Down
7 changes: 4 additions & 3 deletions docs/designs/virtual_memory_subsystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,10 @@ CP does.
- **TLB sizing**: a single flat `VX_CFG_TLB_SIZE`-entry (32) fully-
associative CAM, one per dcache MMU + one per icache MMU per core
([`VX_config.toml:160`](../../VX_config.toml#L160)). No L2/L3.
- **Perf**: 6 VM perf CSRs `[csr_mpm_vm]` 0xB03–0xB08 (+_H mirrors),
class `VX_DCR_MPM_CLASS_VM = 8`
([`VX_types.toml:475-488`](../../VX_types.toml#L475)).
- **Perf**: six VM perf CSRs at 0xB0B–0xB10 (plus their `_H` upper-half
mirrors) in the memory-subsystem class `VX_DCR_MPM_CLASS_MEM = 7`,
alongside the off-chip memory, lmem, and coalescer counters
(`[csr_mpm_mem]` in [`VX_types.toml`](../../VX_types.toml)).
- **Runtime caps**: `VX_CAPS_VM_SUPPORT`, `VX_MEM_PHYS = 0x8`
([`vortex2.h:74,121`](../../sw/runtime/include/vortex2.h#L74)).

Expand Down
2 changes: 1 addition & 1 deletion docs/proposals/mmu_optimization_proposal.md
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ Regression integration:

Performance signals:

- Perf counter dump (`VX_DCR_MPM_CLASS_VM`) reports per-level
- Perf counter dump (VM counters in `VX_DCR_MPM_CLASS_MEM`) reports per-level
reads/hits/misses, MSHR occupancy, walker occupancy, walk-cache hit
rate. Compare against baseline (current per-core MMU) and document
the gain.
Expand Down
14 changes: 8 additions & 6 deletions docs/vm.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,10 @@ falls back to sequential allocation so progress is guaranteed.

## Perf counters

Six MMU-related counters live in their own MPM class
(`VX_DCR_MPM_CLASS_VM`). The hardware sums the icache and dcache MMU
counters into one bank exposed via `pipeline_perf.mmu` in
Six MMU-related counters live in the memory-subsystem MPM class
(`VX_DCR_MPM_CLASS_MEM`, alongside off-chip memory, lmem, and the
coalescer). The hardware sums the icache and dcache MMU counters into one
bank exposed via `pipeline_perf.mmu` in
[VX_gpu_pkg.sv](../hw/rtl/VX_gpu_pkg.sv).

| CSR | Meaning |
Expand All @@ -94,9 +95,10 @@ counters into one bank exposed via `pipeline_perf.mmu` in
| `VX_CSR_MPM_PTW_WALKS` | Completed PTW walks |
| `VX_CSR_MPM_PTW_LATENCY` | Total PTW latency in cycles (avg = LATENCY / WALKS) |

[stub/perf.cpp](../sw/runtime/stub/perf.cpp) reads these and prints a
`vm:` line in the per-core report when `--perf=1` (CORE class) is passed
to `blackbox.sh`. Example:
[common/legacy_perf.cpp](../sw/runtime/common/legacy_perf.cpp) reads these
counters and prints a per-core `vm:` line in the memory report when
`--perf=7` (the `VX_DCR_MPM_CLASS_MEM` class) is passed to `blackbox.sh`.
Example:

```
PERF: vm: tlb_reads=96, hit=96%, evicts=0, ptw_walks=4, ptw_avg_lat=84.75
Expand Down
1 change: 1 addition & 0 deletions hw/rtl/VX_gpu_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,7 @@ package VX_gpu_pkg;
logic [PERF_CTR_BITS-1:0] writes;
logic [PERF_CTR_BITS-1:0] read_misses;
logic [PERF_CTR_BITS-1:0] write_misses;
logic [PERF_CTR_BITS-1:0] evictions;
logic [PERF_CTR_BITS-1:0] bank_stalls;
logic [PERF_CTR_BITS-1:0] mshr_stalls;
logic [PERF_CTR_BITS-1:0] mem_stalls;
Expand Down
Loading
Loading