diff --git a/VX_config.toml b/VX_config.toml index d7cb2f0b2a..1072c1250d 100644 --- a/VX_config.toml +++ b/VX_config.toml @@ -200,7 +200,7 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO [l2cache] VX_CFG_L2_CACHE_SIZE = 1048576 VX_CFG_L2_NUM_WAYS = 8 -VX_CFG_L2_WRITEBACK = 0 +VX_CFG_L2_WRITEBACK = 1 VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK" VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo" VX_CFG_L2_MSHR_SIZE = 16 @@ -214,7 +214,7 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N [l3cache] VX_CFG_L3_CACHE_SIZE = 2097152 VX_CFG_L3_NUM_WAYS = 8 -VX_CFG_L3_WRITEBACK = 0 +VX_CFG_L3_WRITEBACK = 1 VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK" VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo" VX_CFG_L3_MSHR_SIZE = 16 diff --git a/VX_types.toml b/VX_types.toml index f833a940bb..58e6c63b7b 100644 --- a/VX_types.toml +++ b/VX_types.toml @@ -390,15 +390,23 @@ VX_CSR_CTA_CLUSTER_SIZE = 0xCE0 VX_CSR_CTA_ENTRY = 0xCE1 # kernel entry PC, supplied per-CTA by the KMU [dcr_mpm_class] -VX_DCR_MPM_CLASS_BASE = 0 -VX_DCR_MPM_CLASS_CORE = 1 -VX_DCR_MPM_CLASS_MEM = 2 -VX_DCR_MPM_CLASS_TEX = 3 -VX_DCR_MPM_CLASS_RASTER = 4 -VX_DCR_MPM_CLASS_OM = 5 -VX_DCR_MPM_CLASS_DXA = 6 -VX_DCR_MPM_CLASS_TCU = 7 -VX_DCR_MPM_CLASS_VM = 8 +VX_DCR_MPM_CLASS_BASE = 0 +VX_DCR_MPM_CLASS_CORE = 1 +VX_DCR_MPM_CLASS_RESERVED1= 2 +VX_DCR_MPM_CLASS_ICACHE = 3 +VX_DCR_MPM_CLASS_DCACHE = 4 +VX_DCR_MPM_CLASS_L2CACHE = 5 +VX_DCR_MPM_CLASS_L3CACHE = 6 +VX_DCR_MPM_CLASS_MEM = 7 +VX_DCR_MPM_CLASS_RESERVED2= 8 +VX_DCR_MPM_CLASS_RESERVED3= 9 +VX_DCR_MPM_CLASS_RESERVED4= 10 +VX_DCR_MPM_CLASS_TCU = 11 +VX_DCR_MPM_CLASS_RASTER = 12 +VX_DCR_MPM_CLASS_TEX = 13 +VX_DCR_MPM_CLASS_OM = 14 +VX_DCR_MPM_CLASS_RTU = 15 +VX_DCR_MPM_CLASS_DXA = 16 [csr_mpm_base] VX_CSR_MCYCLE = 0xB00 @@ -476,87 +484,97 @@ VX_CSR_MPM_LOAD_LT_H = 0xB9D VX_CSR_MPM_STORES = 0xB1E # total LSU store requests VX_CSR_MPM_STORES_H = 0xB9E -[csr_mpm_vm] -# PERF: VM (TLB/PTW). Hardware sums icache + dcache MMU counters. -VX_CSR_MPM_TLB_READS = 0xB03 # total TLB lookups -VX_CSR_MPM_TLB_READS_H = 0xB83 -VX_CSR_MPM_TLB_HITS = 0xB04 # TLB hits -VX_CSR_MPM_TLB_HITS_H = 0xB84 -VX_CSR_MPM_TLB_MISSES = 0xB05 # TLB misses (triggered PTW) -VX_CSR_MPM_TLB_MISSES_H = 0xB85 -VX_CSR_MPM_TLB_EVICTS = 0xB06 # TLB evictions on fill -VX_CSR_MPM_TLB_EVICTS_H = 0xB86 -VX_CSR_MPM_PTW_WALKS = 0xB07 # PTW walks completed -VX_CSR_MPM_PTW_WALKS_H = 0xB87 -VX_CSR_MPM_PTW_LATENCY = 0xB08 # PTW total latency cycles -VX_CSR_MPM_PTW_LATENCY_H = 0xB88 +# VM/MMU counters are part of the MEM class (see [csr_mpm_mem] below). -[csr_mpm_mem] -# PERF: icache +# Each cache level is its own MPM class (re-based at 0xB03), so every level +# gets the full standard hpmcounter window with room for evictions. +[csr_mpm_icache] VX_CSR_MPM_ICACHE_READS = 0xB03 # total reads VX_CSR_MPM_ICACHE_READS_H = 0xB83 VX_CSR_MPM_ICACHE_MISS_R = 0xB04 # read misses VX_CSR_MPM_ICACHE_MISS_R_H = 0xB84 VX_CSR_MPM_ICACHE_MSHR_ST = 0xB05 # MSHR stalls VX_CSR_MPM_ICACHE_MSHR_ST_H = 0xB85 -# PERF: dcache -VX_CSR_MPM_DCACHE_READS = 0xB06 # total reads -VX_CSR_MPM_DCACHE_READS_H = 0xB86 -VX_CSR_MPM_DCACHE_WRITES = 0xB07 # total writes -VX_CSR_MPM_DCACHE_WRITES_H = 0xB87 -VX_CSR_MPM_DCACHE_MISS_R = 0xB08 # read misses -VX_CSR_MPM_DCACHE_MISS_R_H = 0xB88 -VX_CSR_MPM_DCACHE_MISS_W = 0xB09 # write misses -VX_CSR_MPM_DCACHE_MISS_W_H = 0xB89 -VX_CSR_MPM_DCACHE_BANK_ST = 0xB0A # bank conflicts -VX_CSR_MPM_DCACHE_BANK_ST_H = 0xB8A -VX_CSR_MPM_DCACHE_MSHR_ST = 0xB0B # MSHR stalls -VX_CSR_MPM_DCACHE_MSHR_ST_H = 0xB8B -# PERF: l2cache -VX_CSR_MPM_L2CACHE_READS = 0xB0C # total reads -VX_CSR_MPM_L2CACHE_READS_H = 0xB8C -VX_CSR_MPM_L2CACHE_WRITES = 0xB0D # total writes -VX_CSR_MPM_L2CACHE_WRITES_H = 0xB8D -VX_CSR_MPM_L2CACHE_MISS_R = 0xB0E # read misses -VX_CSR_MPM_L2CACHE_MISS_R_H = 0xB8E -VX_CSR_MPM_L2CACHE_MISS_W = 0xB0F # write misses -VX_CSR_MPM_L2CACHE_MISS_W_H = 0xB8F -VX_CSR_MPM_L2CACHE_BANK_ST = 0xB10 # bank conflicts -VX_CSR_MPM_L2CACHE_BANK_ST_H = 0xB90 -VX_CSR_MPM_L2CACHE_MSHR_ST = 0xB11 # MSHR stalls -VX_CSR_MPM_L2CACHE_MSHR_ST_H = 0xB91 -# PERF: l3cache -VX_CSR_MPM_L3CACHE_READS = 0xB12 # total reads -VX_CSR_MPM_L3CACHE_READS_H = 0xB92 -VX_CSR_MPM_L3CACHE_WRITES = 0xB13 # total writes -VX_CSR_MPM_L3CACHE_WRITES_H = 0xB93 -VX_CSR_MPM_L3CACHE_MISS_R = 0xB14 # read misses -VX_CSR_MPM_L3CACHE_MISS_R_H = 0xB94 -VX_CSR_MPM_L3CACHE_MISS_W = 0xB15 # write misses -VX_CSR_MPM_L3CACHE_MISS_W_H = 0xB95 -VX_CSR_MPM_L3CACHE_BANK_ST = 0xB16 # bank conflicts -VX_CSR_MPM_L3CACHE_BANK_ST_H = 0xB96 -VX_CSR_MPM_L3CACHE_MSHR_ST = 0xB17 # MSHR stalls -VX_CSR_MPM_L3CACHE_MSHR_ST_H = 0xB97 -# PERF: memory -VX_CSR_MPM_MEM_READS = 0xB18 # total reads -VX_CSR_MPM_MEM_READS_H = 0xB98 -VX_CSR_MPM_MEM_WRITES = 0xB19 # total writes -VX_CSR_MPM_MEM_WRITES_H = 0xB99 -VX_CSR_MPM_MEM_LT = 0xB1A # memory latency -VX_CSR_MPM_MEM_LT_H = 0xB9A -VX_CSR_MPM_MEM_BANK_ST = 0xB1E # bank conflicts -VX_CSR_MPM_MEM_BANK_ST_H = 0xB9E -# PERF: lmem -VX_CSR_MPM_LMEM_READS = 0xB1B # memory reads -VX_CSR_MPM_LMEM_READS_H = 0xB9B -VX_CSR_MPM_LMEM_WRITES = 0xB1C # memory writes -VX_CSR_MPM_LMEM_WRITES_H = 0xB9C -VX_CSR_MPM_LMEM_BANK_ST = 0xB1D # bank conflicts -VX_CSR_MPM_LMEM_BANK_ST_H = 0xB9D -# PERF: coalescer -VX_CSR_MPM_COALESCER_MISS = 0xB1F # coalescer misses -VX_CSR_MPM_COALESCER_MISS_H = 0xB9F + +[csr_mpm_dcache] +VX_CSR_MPM_DCACHE_READS = 0xB03 # total reads +VX_CSR_MPM_DCACHE_READS_H = 0xB83 +VX_CSR_MPM_DCACHE_WRITES = 0xB04 # total writes +VX_CSR_MPM_DCACHE_WRITES_H = 0xB84 +VX_CSR_MPM_DCACHE_MISS_R = 0xB05 # read misses +VX_CSR_MPM_DCACHE_MISS_R_H = 0xB85 +VX_CSR_MPM_DCACHE_MISS_W = 0xB06 # write misses +VX_CSR_MPM_DCACHE_MISS_W_H = 0xB86 +VX_CSR_MPM_DCACHE_EVICTS = 0xB07 # dirty-line evictions +VX_CSR_MPM_DCACHE_EVICTS_H = 0xB87 +VX_CSR_MPM_DCACHE_BANK_ST = 0xB08 # bank conflicts +VX_CSR_MPM_DCACHE_BANK_ST_H = 0xB88 +VX_CSR_MPM_DCACHE_MSHR_ST = 0xB09 # MSHR stalls +VX_CSR_MPM_DCACHE_MSHR_ST_H = 0xB89 + +[csr_mpm_l2cache] +VX_CSR_MPM_L2CACHE_READS = 0xB03 # total reads +VX_CSR_MPM_L2CACHE_READS_H = 0xB83 +VX_CSR_MPM_L2CACHE_WRITES = 0xB04 # total writes +VX_CSR_MPM_L2CACHE_WRITES_H = 0xB84 +VX_CSR_MPM_L2CACHE_MISS_R = 0xB05 # read misses +VX_CSR_MPM_L2CACHE_MISS_R_H = 0xB85 +VX_CSR_MPM_L2CACHE_MISS_W = 0xB06 # write misses +VX_CSR_MPM_L2CACHE_MISS_W_H = 0xB86 +VX_CSR_MPM_L2CACHE_EVICTS = 0xB07 # dirty-line evictions +VX_CSR_MPM_L2CACHE_EVICTS_H = 0xB87 +VX_CSR_MPM_L2CACHE_BANK_ST = 0xB08 # bank conflicts +VX_CSR_MPM_L2CACHE_BANK_ST_H = 0xB88 +VX_CSR_MPM_L2CACHE_MSHR_ST = 0xB09 # MSHR stalls +VX_CSR_MPM_L2CACHE_MSHR_ST_H = 0xB89 + +[csr_mpm_l3cache] +VX_CSR_MPM_L3CACHE_READS = 0xB03 # total reads +VX_CSR_MPM_L3CACHE_READS_H = 0xB83 +VX_CSR_MPM_L3CACHE_WRITES = 0xB04 # total writes +VX_CSR_MPM_L3CACHE_WRITES_H = 0xB84 +VX_CSR_MPM_L3CACHE_MISS_R = 0xB05 # read misses +VX_CSR_MPM_L3CACHE_MISS_R_H = 0xB85 +VX_CSR_MPM_L3CACHE_MISS_W = 0xB06 # write misses +VX_CSR_MPM_L3CACHE_MISS_W_H = 0xB86 +VX_CSR_MPM_L3CACHE_EVICTS = 0xB07 # dirty-line evictions +VX_CSR_MPM_L3CACHE_EVICTS_H = 0xB87 +VX_CSR_MPM_L3CACHE_BANK_ST = 0xB08 # bank conflicts +VX_CSR_MPM_L3CACHE_BANK_ST_H = 0xB88 +VX_CSR_MPM_L3CACHE_MSHR_ST = 0xB09 # MSHR stalls +VX_CSR_MPM_L3CACHE_MSHR_ST_H = 0xB89 + +# Off-chip memory + local memory + coalescer + VM/MMU (one memory-subsystem class). +[csr_mpm_mem] +VX_CSR_MPM_MEM_READS = 0xB03 # total reads +VX_CSR_MPM_MEM_READS_H = 0xB83 +VX_CSR_MPM_MEM_WRITES = 0xB04 # total writes +VX_CSR_MPM_MEM_WRITES_H = 0xB84 +VX_CSR_MPM_MEM_LT = 0xB05 # memory latency +VX_CSR_MPM_MEM_LT_H = 0xB85 +VX_CSR_MPM_MEM_BANK_ST = 0xB06 # bank conflicts +VX_CSR_MPM_MEM_BANK_ST_H = 0xB86 +VX_CSR_MPM_LMEM_READS = 0xB07 # local memory reads +VX_CSR_MPM_LMEM_READS_H = 0xB87 +VX_CSR_MPM_LMEM_WRITES = 0xB08 # local memory writes +VX_CSR_MPM_LMEM_WRITES_H = 0xB88 +VX_CSR_MPM_LMEM_BANK_ST = 0xB09 # bank conflicts +VX_CSR_MPM_LMEM_BANK_ST_H = 0xB89 +VX_CSR_MPM_COALESCER_MISS = 0xB0A # coalescer misses +VX_CSR_MPM_COALESCER_MISS_H = 0xB8A +# VM/MMU (per-core TLB/PTW). Hardware sums icache + dcache MMU counters. +VX_CSR_MPM_TLB_READS = 0xB0B # total TLB lookups +VX_CSR_MPM_TLB_READS_H = 0xB8B +VX_CSR_MPM_TLB_HITS = 0xB0C # TLB hits +VX_CSR_MPM_TLB_HITS_H = 0xB8C +VX_CSR_MPM_TLB_MISSES = 0xB0D # TLB misses (triggered PTW) +VX_CSR_MPM_TLB_MISSES_H = 0xB8D +VX_CSR_MPM_TLB_EVICTS = 0xB0E # TLB evictions on fill +VX_CSR_MPM_TLB_EVICTS_H = 0xB8E +VX_CSR_MPM_PTW_WALKS = 0xB0F # PTW walks completed +VX_CSR_MPM_PTW_WALKS_H = 0xB8F +VX_CSR_MPM_PTW_LATENCY = 0xB10 # PTW total latency cycles +VX_CSR_MPM_PTW_LATENCY_H = 0xB90 [csr_mpm_dxa] # PERF: DXA copy engine (cluster-level, same value on all cores in cluster) diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 886a8cdcef..271ddd1550 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -28,7 +28,8 @@ show_help() echo " where" echo "--driver: gpu, simx, rtlsim, oape, xrt" echo "--app: any subfolder test under regression, graphics, mpi, opencl, or hip" - echo "--class: 0=disable, 1=pipeline, 2=memsys" + echo "--perf: 0=disable, 1=core, 3=icache, 4=dcache, 5=l2cache, 6=l3cache, 7=mem," + echo " 11=tcu, 12=raster, 13=tex, 14=om, 15=rtu, 16=dxa" echo "--nohup: build and run in temp directory" } diff --git a/ci/regression.sh.in b/ci/regression.sh.in index be49db1000..2d03f7fffa 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -339,8 +339,9 @@ amo() # fill/probe ordering that single-core runs never hit. # 4x L1 -> 1x L2 (multi-core base configuration). CONFIGS="-DVX_CFG_EXT_A_ENABLE" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --args=-n8 --app=amo - # 4x L2 -> 1x L3 (two-level shared hierarchy). - CONFIGS="-DVX_CFG_EXT_A_ENABLE" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --l3cache --args=-n8 --app=amo + # 4x L2 -> 1x L3 (two-level shared hierarchy). L3 is the LLC where atomics + # resolve, so L2 (above it) must be write-through for AMO correctness. + CONFIGS="-DVX_CFG_EXT_A_ENABLE -DVX_CFG_L2_WRITEBACK=0" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --l3cache --args=-n8 --app=amo echo "amo tests done!" } diff --git a/docs/designs/graphics_fixed_function_pipeline.md b/docs/designs/graphics_fixed_function_pipeline.md index 30331b212e..67e0346363 100644 --- a/docs/designs/graphics_fixed_function_pipeline.md +++ b/docs/designs/graphics_fixed_function_pipeline.md @@ -73,9 +73,9 @@ arbiters, cores, caches, and DCR fan-out. blend-mode/func/const, logic-op, [`:255-276`](../../VX_types.toml#L255)). DCRs are broadcast to all cluster instances; each raster instance self-selects its tile stripe. -- **Perf** MPM classes TEX=3, RASTER=4, OM=5 - ([`VX_types.toml:392-394`](../../VX_types.toml#L392)); reported via - [`legacy_perf.cpp:229-231`](../../sw/runtime/common/legacy_perf.cpp#L229). +- **Perf** MPM classes RASTER=12, TEX=13, OM=14 + ([`VX_types.toml:393-409`](../../VX_types.toml#L393)); reported via + [`legacy_perf.cpp`](../../sw/runtime/common/legacy_perf.cpp). - **Counts** ([`VX_config.toml`](../../VX_config.toml)): `NUM_TEX_CORES`, `NUM_RASTER_CORES`, `NUM_OM_CORES`, and `NUM_{TCACHES,RCACHES,OCACHES}`. diff --git a/docs/designs/virtual_memory_subsystem.md b/docs/designs/virtual_memory_subsystem.md index 72e95b3611..f365e5d404 100644 --- a/docs/designs/virtual_memory_subsystem.md +++ b/docs/designs/virtual_memory_subsystem.md @@ -67,9 +67,10 @@ CP does. - **TLB sizing**: a single flat `VX_CFG_TLB_SIZE`-entry (32) fully- associative CAM, one per dcache MMU + one per icache MMU per core ([`VX_config.toml:160`](../../VX_config.toml#L160)). No L2/L3. -- **Perf**: 6 VM perf CSRs `[csr_mpm_vm]` 0xB03–0xB08 (+_H mirrors), - class `VX_DCR_MPM_CLASS_VM = 8` - ([`VX_types.toml:475-488`](../../VX_types.toml#L475)). +- **Perf**: 6 VM perf CSRs in the memory-subsystem class + `VX_DCR_MPM_CLASS_MEM = 7` at 0xB0B–0xB10 (+_H mirrors), alongside + off-chip memory / lmem / coalescer (`[csr_mpm_mem]` in + [VX_types.toml](../../VX_types.toml)). - **Runtime caps**: `VX_CAPS_VM_SUPPORT`, `VX_MEM_PHYS = 0x8` ([`vortex2.h:74,121`](../../sw/runtime/include/vortex2.h#L74)). diff --git a/docs/proposals/mmu_optimization_proposal.md b/docs/proposals/mmu_optimization_proposal.md index eeb5cfe768..4d56bff18a 100644 --- a/docs/proposals/mmu_optimization_proposal.md +++ b/docs/proposals/mmu_optimization_proposal.md @@ -660,7 +660,7 @@ Regression integration: Performance signals: -- Perf counter dump (`VX_DCR_MPM_CLASS_VM`) reports per-level +- Perf counter dump (VM counters in `VX_DCR_MPM_CLASS_MEM`) reports per-level reads/hits/misses, MSHR occupancy, walker occupancy, walk-cache hit rate. Compare against baseline (current per-core MMU) and document the gain. diff --git a/docs/vm.md b/docs/vm.md index 5486fa64c8..7e6c7b014b 100644 --- a/docs/vm.md +++ b/docs/vm.md @@ -80,9 +80,10 @@ falls back to sequential allocation so progress is guaranteed. ## Perf counters -Six MMU-related counters live in their own MPM class -(`VX_DCR_MPM_CLASS_VM`). The hardware sums the icache and dcache MMU -counters into one bank exposed via `pipeline_perf.mmu` in +Six MMU-related counters live in the memory-subsystem MPM class +(`VX_DCR_MPM_CLASS_MEM`, alongside off-chip memory, lmem, and the +coalescer). The hardware sums the icache and dcache MMU counters into one +bank exposed via `pipeline_perf.mmu` in [VX_gpu_pkg.sv](../hw/rtl/VX_gpu_pkg.sv). | CSR | Meaning | @@ -94,9 +95,10 @@ counters into one bank exposed via `pipeline_perf.mmu` in | `VX_CSR_MPM_PTW_WALKS` | Completed PTW walks | | `VX_CSR_MPM_PTW_LATENCY` | Total PTW latency in cycles (avg = LATENCY / WALKS) | -[stub/perf.cpp](../sw/runtime/stub/perf.cpp) reads these and prints a -`vm:` line in the per-core report when `--perf=1` (CORE class) is passed -to `blackbox.sh`. Example: +[common/legacy_perf.cpp](../sw/runtime/common/legacy_perf.cpp) reads these +(from the `VX_DCR_MPM_CLASS_MEM` class) and prints a per-core `vm:` line in +the memory report when `--perf=7` (MEM class) is passed to `blackbox.sh`. +Example: ``` PERF: vm: tlb_reads=96, hit=96%, evicts=0, ptw_walks=4, ptw_avg_lat=84.75 diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 5b8da9cfcd..3131a99c72 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -1011,6 +1011,7 @@ package VX_gpu_pkg; logic [PERF_CTR_BITS-1:0] writes; logic [PERF_CTR_BITS-1:0] read_misses; logic [PERF_CTR_BITS-1:0] write_misses; + logic [PERF_CTR_BITS-1:0] evictions; logic [PERF_CTR_BITS-1:0] bank_stalls; logic [PERF_CTR_BITS-1:0] mshr_stalls; logic [PERF_CTR_BITS-1:0] mem_stalls; diff --git a/hw/rtl/cache/VX_amo_alu.sv b/hw/rtl/cache/VX_amo_alu.sv index 9425dc0753..981b1030a4 100644 --- a/hw/rtl/cache/VX_amo_alu.sv +++ b/hw/rtl/cache/VX_amo_alu.sv @@ -18,48 +18,65 @@ // - new_word: the value to write back for store-bearing AMOs (all except LR). // - ret_word: the original loaded value, sign-extended into rd. // For SC the bank overrides this with 0/1 outside this module. -// width selects W (32-bit) or D (64-bit). Sign-extension at the word -// boundary is needed for signed comparisons (MIN/MAX). -module VX_amo_alu import VX_gpu_pkg::*; ( - input amo_op_e op, - input wire amo_unsigned, // selects MIN/MAX variant - input wire [1:0] width, // 2 = .W, 3 = .D - input wire [63:0] old_word, - input wire [63:0] rhs, - output reg [63:0] new_word, - output wire [63:0] ret_word +// DATA_WIDTH is the synthesized operand width (= the cache word width, capped +// at 64): a 32-bit-word cache can only carry .W atomics, so the adder and +// comparators are built 32-bit rather than 64-bit. width selects .W vs .D and +// is only meaningful when DATA_WIDTH > 32. +module VX_amo_alu import VX_gpu_pkg::*; #( + parameter DATA_WIDTH = 64 +) ( + input amo_op_e op, + input wire is_unsigned, // selects MIN/MAX variant + input wire [1:0] width, // 2 = .W, 3 = .D + input wire [63:0] old_word, + input wire [63:0] rhs, + output wire [63:0] new_word, + output wire [63:0] ret_word ); + localparam AW = DATA_WIDTH; - wire is_w = (width == 2'd2); + // .W and .D only differ when the datapath is wider than 32 bits; + // a <= 32-bit operand width can only ever be a .W atomic. + wire is_w = (AW > 32) ? (width == 2'd2) : 1'b1; + if (AW <= 32) begin : g_w_only + `UNUSED_VAR (width) + end + if (AW < 64) begin : g_hi_unused + `UNUSED_VAR (old_word[63:AW]) + `UNUSED_VAR (rhs[63:AW]) + end + + wire [AW-1:0] a = old_word[AW-1:0]; + wire [AW-1:0] b = rhs[AW-1:0]; - // Mask both inputs to width-sized values; sign-extend for MIN/MAX. - wire [63:0] a_u = is_w ? {32'h0, old_word[31:0]} : old_word; - wire [63:0] b_u = is_w ? {32'h0, rhs[31:0]} : rhs; - wire signed [63:0] a_s = is_w ? {{32{old_word[31]}}, old_word[31:0]} : old_word; - wire signed [63:0] b_s = is_w ? {{32{rhs[31]}}, rhs[31:0]} : rhs; + // Mask to width-sized values; sign-extend at the 32-bit boundary for MIN/MAX. + wire [AW-1:0] a_u = is_w ? {{(AW-32){1'b0}}, a[31:0]} : a; + wire [AW-1:0] b_u = is_w ? {{(AW-32){1'b0}}, b[31:0]} : b; + wire signed [AW-1:0] a_s = is_w ? {{(AW-32){a[31]}}, a[31:0]} : a; + wire signed [AW-1:0] b_s = is_w ? {{(AW-32){b[31]}}, b[31:0]} : b; + reg [AW-1:0] res; always @(*) begin case (op) - AMO_OP_LR: new_word = a_u; - AMO_OP_SC: new_word = b_u; - AMO_OP_SWAP: new_word = b_u; - AMO_OP_ADD: new_word = a_u + b_u; - AMO_OP_AND: new_word = a_u & b_u; - AMO_OP_OR: new_word = a_u | b_u; - AMO_OP_XOR: new_word = a_u ^ b_u; - AMO_OP_MIN: new_word = amo_unsigned - ? ((a_u < b_u) ? a_u : b_u) - : ((a_s < b_s) ? a_s : b_s); - AMO_OP_MAX: new_word = amo_unsigned - ? ((a_u > b_u) ? a_u : b_u) - : ((a_s > b_s) ? a_s : b_s); - default: new_word = a_u; + AMO_OP_LR: res = a_u; + AMO_OP_SC: res = b_u; + AMO_OP_SWAP: res = b_u; + AMO_OP_ADD: res = a_u + b_u; + AMO_OP_AND: res = a_u & b_u; + AMO_OP_OR: res = a_u | b_u; + AMO_OP_XOR: res = a_u ^ b_u; + AMO_OP_MIN: res = is_unsigned ? ((a_u < b_u) ? a_u : b_u) + : ((a_s < b_s) ? a_s : b_s); + AMO_OP_MAX: res = is_unsigned ? ((a_u > b_u) ? a_u : b_u) + : ((a_s > b_s) ? a_s : b_s); + default: res = a_u; endcase - if (is_w) new_word = {32'h0, new_word[31:0]}; + if (is_w) res = {{(AW-32){1'b0}}, res[31:0]}; end - // Return value: original loaded word at width (LSU sext at width - // gives rd). For SC, bank overrides this with 0/1. - assign ret_word = is_w ? {32'h0, old_word[31:0]} : old_word; + // Zero-extend the AW-sized results back to the 64-bit port. + // For SC the bank overrides ret_word with 0/1. + assign new_word = 64'(res); + assign ret_word = 64'(is_w ? {{(AW-32){1'b0}}, a[31:0]} : a); endmodule diff --git a/hw/rtl/cache/VX_amo_unit.sv b/hw/rtl/cache/VX_amo_unit.sv index 92c42df8a5..73d3199ac7 100644 --- a/hw/rtl/cache/VX_amo_unit.sv +++ b/hw/rtl/cache/VX_amo_unit.sv @@ -32,14 +32,15 @@ // hart's LR), which guarantees LR/SC forward progress under contention. module VX_amo_unit import VX_gpu_pkg::*; #( parameter NUM_RES_ENTRIES = 4, - parameter LINE_ADDR_BITS = 32 + parameter LINE_ADDR_BITS = 32, + parameter DATA_WIDTH = 64 // ALU operand width (cache word, capped at 64) ) ( input wire clk, input wire reset, // Combinational compute kernel. input amo_op_e compute_op, - input wire compute_amo_unsigned, + input wire compute_unsigned, input wire [1:0] compute_width, input wire [63:0] compute_old, input wire [63:0] compute_rhs, @@ -56,9 +57,11 @@ module VX_amo_unit import VX_gpu_pkg::*; #( ); // Pure ALU (no state, no clock). - VX_amo_alu alu ( + VX_amo_alu #( + .DATA_WIDTH (DATA_WIDTH) + ) alu ( .op (compute_op), - .amo_unsigned (compute_amo_unsigned), + .is_unsigned (compute_unsigned), .width (compute_width), .old_word (compute_old), .rhs (compute_rhs), diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 69ad8d6619..759237bdcf 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -119,6 +119,7 @@ module VX_cache import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE wire [NUM_BANKS-1:0] perf_read_miss_per_bank; wire [NUM_BANKS-1:0] perf_write_miss_per_bank; + wire [NUM_BANKS-1:0] perf_evictions_per_bank; wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; `endif @@ -401,6 +402,7 @@ module VX_cache import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE .perf_read_miss (perf_read_miss_per_bank[bank_id]), .perf_write_miss (perf_write_miss_per_bank[bank_id]), + .perf_evictions (perf_evictions_per_bank[bank_id]), .perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]), `endif @@ -612,6 +614,7 @@ module VX_cache import VX_gpu_pkg::*; #( wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; + wire [`CLOG2(NUM_BANKS+1)-1:0] perf_evictions_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle; @@ -619,6 +622,7 @@ module VX_cache import VX_gpu_pkg::*; #( `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank); + `POP_COUNT(perf_evictions_per_cycle, perf_evictions_per_bank); `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); `POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port); @@ -627,6 +631,7 @@ module VX_cache import VX_gpu_pkg::*; #( reg [PERF_CTR_BITS-1:0] perf_core_writes; reg [PERF_CTR_BITS-1:0] perf_read_misses; reg [PERF_CTR_BITS-1:0] perf_write_misses; + reg [PERF_CTR_BITS-1:0] perf_evictions; reg [PERF_CTR_BITS-1:0] perf_mshr_stalls; reg [PERF_CTR_BITS-1:0] perf_mem_stalls; reg [PERF_CTR_BITS-1:0] perf_crsp_stalls; @@ -637,6 +642,7 @@ module VX_cache import VX_gpu_pkg::*; #( perf_core_writes <= '0; perf_read_misses <= '0; perf_write_misses <= '0; + perf_evictions <= '0; perf_mshr_stalls <= '0; perf_mem_stalls <= '0; perf_crsp_stalls <= '0; @@ -645,6 +651,7 @@ module VX_cache import VX_gpu_pkg::*; #( perf_core_writes <= perf_core_writes + PERF_CTR_BITS'(perf_core_writes_per_cycle); perf_read_misses <= perf_read_misses + PERF_CTR_BITS'(perf_read_miss_per_cycle); perf_write_misses <= perf_write_misses + PERF_CTR_BITS'(perf_write_miss_per_cycle); + perf_evictions <= perf_evictions + PERF_CTR_BITS'(perf_evictions_per_cycle); perf_mshr_stalls <= perf_mshr_stalls + PERF_CTR_BITS'(perf_mshr_stall_per_cycle); perf_mem_stalls <= perf_mem_stalls + PERF_CTR_BITS'(perf_mem_stall_per_cycle); perf_crsp_stalls <= perf_crsp_stalls + PERF_CTR_BITS'(perf_crsp_stall_per_cycle); @@ -655,6 +662,7 @@ module VX_cache import VX_gpu_pkg::*; #( assign cache_perf.writes = perf_core_writes; assign cache_perf.read_misses = perf_read_misses; assign cache_perf.write_misses = perf_write_misses; + assign cache_perf.evictions = perf_evictions; assign cache_perf.bank_stalls = perf_collisions; assign cache_perf.mshr_stalls = perf_mshr_stalls; assign cache_perf.mem_stalls = perf_mem_stalls; diff --git a/hw/rtl/cache/VX_cache_amo.sv b/hw/rtl/cache/VX_cache_amo.sv new file mode 100644 index 0000000000..d45d4de051 --- /dev/null +++ b/hw/rtl/cache/VX_cache_amo.sv @@ -0,0 +1,481 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_cache_define.vh" + +// Per-bank AMO engine, instantiated by the bank only when atomics are +// enabled. A bank plays exactly one of two roles: +// +// IS_LLC=1 (commit): perform the read-modify-write on the line word +// resident at S1, maintain the per-hart reservation table (via +// VX_amo_unit), and inject the result back through the bank pipeline +// as a single-outstanding synthetic writeback. +// +// IS_LLC=0 (passthrough): forward the AMO downstream (the LLC does the +// RMW), latch the returned result word, and replay it back to the +// requester. Also enforces same-hart program order at the bank input. +// +// The two roles are mutually exclusive, so each ties off the other's +// outputs and the synthesizer keeps only the selected datapath. +module VX_cache_amo import VX_gpu_pkg::*; #( + parameter IS_LLC = 0, + parameter NUM_RES_ENTRIES = 4, + parameter LINE_ADDR_BITS = 32, + parameter WORD_WIDTH = 32, + parameter WORD_SIZE = 4, + parameter WORD_SEL_WIDTH = 1, + parameter TAG_WIDTH = 1, + parameter REQ_SEL_WIDTH = 1, + parameter ATTR_WIDTH = 1, + parameter MSHR_SIZE = 1, + parameter MSHR_ADDR_WIDTH = 1, + parameter WORDS_PER_LINE = 1 +) ( + input wire clk, + input wire reset, + input wire pipe_stall, + + // pipeline view + input amo_req_t amo_st0, + input wire valid_st0, + input wire is_creq_st0, + input wire is_hit_st0, + input wire is_replay_st0, + input amo_req_t amo_st1, + input wire valid_st1, + input wire is_creq_st1, + input wire is_hit_st1, + input wire is_replay_st1, + input wire do_write_st1, + input wire [WORD_WIDTH-1:0] read_word_st1, + input wire [WORD_SIZE-1:0] byteen_st1, + input wire [WORD_WIDTH-1:0] write_word_st1, + input wire [WORD_SEL_WIDTH-1:0] word_idx_st0, + input wire [WORD_SEL_WIDTH-1:0] word_idx_st1, + input wire [LINE_ADDR_BITS-1:0] addr_st0, + input wire [LINE_ADDR_BITS-1:0] addr_st1, + input wire [TAG_WIDTH-1:0] tag_st1, + input wire [REQ_SEL_WIDTH-1:0] req_idx_st1, + input wire [ATTR_WIDTH-1:0] attr_st1, + + // commit handshake: the bank grants the synthetic writeback this cycle + input wire wb_fire, + + // mshr / memory fill (passthrough) + input wire mshr_allocate_st0, + input wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0, + input wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st1, + input wire mem_rsp_fire, + input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id, + input wire [WORDS_PER_LINE*WORD_WIDTH-1:0] mem_rsp_data, + input wire is_fill_sel, + + // input arbitration (passthrough age-ordering) + input wire core_req_valid, + input wire core_req_is_amo, + input wire core_req_rw, + input wire [LINE_ADDR_BITS-1:0] core_req_addr, + input wire rw_st0, + input wire mshr_probe_pending_ld, + input wire mshr_probe_pending_amo, + + // commit outputs (tied off when IS_LLC=0) + output wire amo_hit_st1, // AMO commits locally at S1 + output wire commit_busy, // commit in flight + output wire chain_stall, // pace same-line chained AMO + output wire wb_pending, // writeback request live + output wire [WORD_WIDTH-1:0] rsp_data, // response word on amo_hit_st1 + output wire [LINE_ADDR_BITS-1:0] wb_addr, + output wire [WORD_SEL_WIDTH-1:0] wb_word_idx, + output wire [WORD_SIZE-1:0] wb_byteen, + output wire [WORD_WIDTH-1:0] wb_data, + output wire [TAG_WIDTH-1:0] wb_tag, + output wire [REQ_SEL_WIDTH-1:0] wb_idx, + output wire [ATTR_WIDTH-1:0] wb_attr, + + // passthrough outputs (tied off when IS_LLC=1) + output wire is_amo_fwd_st0, // AMO first pass (S0) + output wire is_amo_fwd_st1, // AMO first pass (S1) + output wire is_amo_replay_st1, // result replay + output wire is_passthru_fill_sel, + output wire [WORD_WIDTH-1:0] amo_ptw_word_st1, + output wire req_input_defer +); + if (IS_LLC != 0) begin : g_commit + // ---------------------------------------------------------------- + // LLC commit: RMW on the resident line + synthetic writeback + // ---------------------------------------------------------------- + localparam BIT_OFF_BITS = `CLOG2(WORD_WIDTH); + localparam AMO_OLD_BITS = (WORD_WIDTH < 64) ? WORD_WIDTH : 64; + + // Writeback queue (depth 2): a completed AMO pushes its result here + // instead of overwriting a still-draining different-line writeback. The + // head (slot 0) drains through the bank's synthetic-write path; pushes + // never clobber a pending entry, so different-line AMOs pipeline without + // stalling any replay (coalescer-safe) or the pipe (deadlock-free). + localparam WBQ_SIZE = 2; + localparam WBQ_CNTW = `CLOG2(WBQ_SIZE+1); + localparam WBQ_IDXW = `CLOG2(WBQ_SIZE); + reg [WBQ_CNTW-1:0] wbq_count; + reg [LINE_ADDR_BITS-1:0] wbq_addr [WBQ_SIZE]; + reg [WORD_SEL_WIDTH-1:0] wbq_wsel [WBQ_SIZE]; + reg [WORD_SIZE-1:0] wbq_byteen [WBQ_SIZE]; + reg [WORD_WIDTH-1:0] wbq_data [WBQ_SIZE]; + reg [TAG_WIDTH-1:0] wbq_tag [WBQ_SIZE]; + reg [REQ_SEL_WIDTH-1:0] wbq_idx [WBQ_SIZE]; + reg [ATTR_WIDTH-1:0] wbq_attr [WBQ_SIZE]; + + // Head aliases (slot 0 = oldest = the entry currently draining). + wire wb_pending_r = (wbq_count != '0); + wire [LINE_ADDR_BITS-1:0] wb_addr_r = wbq_addr[0]; + wire [WORD_SEL_WIDTH-1:0] wb_word_idx_r = wbq_wsel[0]; + wire [WORD_SIZE-1:0] wb_byteen_r = wbq_byteen[0]; + wire [WORD_WIDTH-1:0] wb_data_r = wbq_data[0]; + wire [TAG_WIDTH-1:0] wb_tag_r = wbq_tag[0]; + wire [REQ_SEL_WIDTH-1:0] wb_idx_r = wbq_idx[0]; + wire [ATTR_WIDTH-1:0] wb_attr_r = wbq_attr[0]; + + // BRAM-settle window: a fired writeback takes a couple cycles to land + // in cache_data; commit_busy stays high across it so the next AMO reads + // the committed line. post_wb_{addr,data} hold the just-drained entry. + reg [1:0] post_wb_age; + reg [LINE_ADDR_BITS-1:0] post_wb_addr; + reg [WORD_WIDTH-1:0] post_wb_data; + wire post_wb_valid = (post_wb_age != 2'd0); + + // Compute stage: S1 latches the aligned operands, the RMW ALU + the + // re-align shift run the next cycle, off the S1 critical path. AMO + // commits are serialized by commit_busy (the bank holds off core + // requests and replays), so the stage holds at most one operation and + // each AMO reads the freshly written line (no operand forwarding). + reg cmp_valid; + reg [63:0] cmp_old, cmp_rhs; + amo_op_e cmp_op; + reg [1:0] cmp_width; + reg cmp_unsigned; + reg [BIT_OFF_BITS-1:0] cmp_bit_off; + reg [LINE_ADDR_BITS-1:0] cmp_addr; + reg [WORD_SIZE-1:0] cmp_byteen; + reg [WORD_SEL_WIDTH-1:0] cmp_wsel; + reg [TAG_WIDTH-1:0] cmp_tag; + reg [REQ_SEL_WIDTH-1:0] cmp_idx; + reg [ATTR_WIDTH-1:0] cmp_attr; + + // Byte-offset alignment: shift the target down to bit 0 for compute, + // and shift results back for response/writeback. + wire [`UP(`CLOG2(WORD_SIZE))-1:0] byte_off_st1; + VX_priority_encoder #( + .N (WORD_SIZE) + ) byte_off_enc ( + .data_in (byteen_st1), + .index_out (byte_off_st1), + `UNUSED_PIN (valid_out), + `UNUSED_PIN (onehot_out) + ); + wire [BIT_OFF_BITS-1:0] bit_off_st1 = BIT_OFF_BITS'({byte_off_st1, 3'b0}); + + // Forward an in-flight (or just-fired) writeback on the same line back + // into the operand: chained same-line AMOs are paced one cycle apart + // (chain_stall) so the prior result already sits in wb_data_r while + // read_word_st1 may still be stale. + // Forward the newest in-flight value for this line: scan the queue + // newest-first, then the just-drained (settling) entry, else the array. + wire fwd_q1 = (wbq_count > 1) && (wbq_addr[1] == addr_st1); + wire fwd_q0 = (wbq_count > 0) && (wbq_addr[0] == addr_st1); + wire fwd_pw = post_wb_valid && (post_wb_addr == addr_st1); + wire [WORD_WIDTH-1:0] line_word_st1 = fwd_q1 ? wbq_data[1] + : fwd_q0 ? wbq_data[0] + : fwd_pw ? post_wb_data + : read_word_st1; + wire [WORD_WIDTH-1:0] line_word_shifted_st1 = line_word_st1 >> bit_off_st1; + wire [WORD_WIDTH-1:0] rhs_word_shifted_st1 = write_word_st1 >> bit_off_st1; + + // width from byteen popcount (.W -> 4 bytes, .D -> 8); operands top at .D. + wire [1:0] width_st1 = ($countones(byteen_st1) == 8) ? 2'd3 : 2'd2; + wire [63:0] rhs_st1 = (width_st1 == 2'd2) + ? 64'({32'h0, rhs_word_shifted_st1[31:0]}) + : 64'(rhs_word_shifted_st1[AMO_OLD_BITS-1:0]); + wire [63:0] old_st1 = (width_st1 == 2'd2) + ? 64'({32'h0, line_word_shifted_st1[31:0]}) + : 64'(line_word_shifted_st1[AMO_OLD_BITS-1:0]); + if (WORD_WIDTH > 64) begin : g_upper_unused + `UNUSED_VAR (line_word_shifted_st1[WORD_WIDTH-1:64]) + `UNUSED_VAR (rhs_word_shifted_st1[WORD_WIDTH-1:64]) + end + + wire res_check; + + // commit conditions (from the original AMO at S1; amo_st1.hart_id is + // valid there, not on the compute/writeback cycle). + wire amo_hit_w = amo_st1.amo_valid && is_hit_st1 && valid_st1 && is_creq_st1; + wire sc_fail_st1 = (amo_st1.amo_op == AMO_OP_SC) && ~res_check; + wire do_store_st1 = amo_hit_w && (amo_st1.amo_op != AMO_OP_LR) && ~sc_fail_st1; + wire do_store_st0 = amo_st0.amo_valid && valid_st0 && is_creq_st0 && is_hit_st0 + && (amo_st0.amo_op != AMO_OP_LR); + + wire res_reserve = amo_hit_w && (amo_st1.amo_op == AMO_OP_LR); + wire res_clear = amo_hit_w && (amo_st1.amo_op == AMO_OP_SC); + // any committed write to the line breaks other harts' reservations; + // AMOs ride the load path (rw=0) so do_write_st1 is plain stores only. + wire res_invalidate = do_store_st1 || do_write_st1; + + // RMW ALU runs on the registered compute-stage operands (off the S1 + // path); the reservation table is driven from S1 so the SC outcome is + // ready for the response. ret_word is unused — the response old value + // comes straight from S1 (no ALU). + wire [63:0] new_word; + wire [63:0] ret_word_unused; + VX_amo_unit #( + .NUM_RES_ENTRIES (NUM_RES_ENTRIES), + .LINE_ADDR_BITS (LINE_ADDR_BITS), + .DATA_WIDTH (AMO_OLD_BITS) // 32-bit word cache -> 32-bit RMW datapath + ) amo_unit ( + .clk (clk), + .reset (reset), + .compute_op (cmp_op), + .compute_unsigned (cmp_unsigned), + .compute_width (cmp_width), + .compute_old (cmp_old), + .compute_rhs (cmp_rhs), + .compute_new_word (new_word), + .compute_ret_word (ret_word_unused), + .res_reserve (res_reserve), + .res_clear (res_clear), + .res_invalidate(res_invalidate), + .res_hart_id (amo_st1.hart_id), + .res_line_addr (addr_st1), + .res_check (res_check) + ); + `UNUSED_VAR (ret_word_unused) + + // place the computed word at its byte offset within the cache word + wire [WORD_WIDTH-1:0] wb_data_w = WORD_WIDTH'(new_word) << cmp_bit_off; + + // Compute finished this cycle (result ready to enqueue): the compute + // stage is occupied and not being reloaded by a fresh latch. + wire wb_push = cmp_valid && ~(do_store_st1 && ~pipe_stall); + // A same-line result coalesces into its existing entry (only the latest + // value must reach the array; earlier ones are forwarded), so a same-line + // burst stays at a single entry. A new-line result enqueues at the tail. + // The head cannot be coalesced into the cycle it drains. + wire wb_coal0 = (wbq_count > 0) && (wbq_addr[0] == cmp_addr) && ~wb_fire; + wire wb_coal1 = (wbq_count > 1) && (wbq_addr[1] == cmp_addr); + wire wb_coalesce = wb_coal1 || wb_coal0; + wire [WBQ_IDXW-1:0] wb_coal_idx = wb_coal1 ? WBQ_IDXW'(1) : WBQ_IDXW'(0); + // New entry lands at the post-pop tail; a coalesce slot shifts down on a pop. + wire [WBQ_IDXW-1:0] wb_new_idx = WBQ_IDXW'(wb_fire ? (wbq_count - WBQ_CNTW'(1)) : wbq_count); + wire [WBQ_IDXW-1:0] wb_slot = wb_coalesce ? WBQ_IDXW'(wb_fire ? (wb_coal_idx - WBQ_IDXW'(1)) : wb_coal_idx) + : wb_new_idx; + + always @(posedge clk) begin + if (reset) begin + cmp_valid <= 1'b0; + wbq_count <= '0; + post_wb_age <= 2'd0; + end else begin + if (wb_fire) begin + post_wb_age <= 2'd2; + post_wb_addr <= wbq_addr[0]; + post_wb_data <= wbq_data[0]; + end else if (post_wb_valid) begin + post_wb_age <= post_wb_age - 2'd1; + end + + // Compute stage (single): latch a new AMO, else retire the result. + if (do_store_st1 && ~pipe_stall) begin + cmp_valid <= 1'b1; + cmp_old <= old_st1; + cmp_rhs <= rhs_st1; + cmp_op <= amo_st1.amo_op; + cmp_width <= width_st1; + cmp_unsigned <= amo_st1.amo_unsigned; + cmp_bit_off <= bit_off_st1; + cmp_addr <= addr_st1; + cmp_byteen <= byteen_st1; + cmp_wsel <= word_idx_st1; + cmp_tag <= tag_st1; + cmp_idx <= req_idx_st1; + cmp_attr <= attr_st1; + end else if (cmp_valid) begin + cmp_valid <= 1'b0; + end + + // Writeback queue: a drain (wb_fire) shifts the head out; a + // completed compute (wb_push) enqueues at the tail. The push is + // written after the shift so it wins when both hit the same slot. + if (wb_fire) begin + wbq_addr[0] <= wbq_addr[1]; + wbq_wsel[0] <= wbq_wsel[1]; + wbq_byteen[0] <= wbq_byteen[1]; + wbq_data[0] <= wbq_data[1]; + wbq_tag[0] <= wbq_tag[1]; + wbq_idx[0] <= wbq_idx[1]; + wbq_attr[0] <= wbq_attr[1]; + end + if (wb_push) begin + wbq_addr[wb_slot] <= cmp_addr; + wbq_wsel[wb_slot] <= cmp_wsel; + wbq_byteen[wb_slot] <= cmp_byteen; + wbq_data[wb_slot] <= wb_data_w; + wbq_tag[wb_slot] <= cmp_tag; + wbq_idx[wb_slot] <= cmp_idx; + wbq_attr[wb_slot] <= cmp_attr; + end + // Count grows only on a new (non-coalescing) enqueue; a coalesce + // updates in place. Pop removes the head. + if (wb_push && ~wb_coalesce && ~wb_fire) + wbq_count <= wbq_count + WBQ_CNTW'(1); + else if (~(wb_push && ~wb_coalesce) && wb_fire) + wbq_count <= wbq_count - WBQ_CNTW'(1); + end + end + + // response (fired at S1): SC -> 0/1; other -> old value (LSU sexts). + // The old value is available at S1 directly, no ALU needed. + wire [63:0] rsp_word = (amo_st1.amo_op == AMO_OP_SC) ? {63'h0, sc_fail_st1} : old_st1; + if (WORD_WIDTH < 64) begin : g_rsp_upper_unused + `UNUSED_VAR (rsp_word[63:WORD_WIDTH]) + end + + assign amo_hit_st1 = amo_hit_w; + assign rsp_data = WORD_WIDTH'(rsp_word) << bit_off_st1; + // Commit in flight: holds off new core-request admission from the S0 + // prediction through the compute stage and the writeback. Replays are + // NOT blocked (the MSHR streams coalesced same-line AMOs back to back); + // those are paced instead by chain_stall. + assign commit_busy = do_store_st0 || do_store_st1 || cmp_valid || wb_pending_r; + // Pace any same-line request sitting behind an in-flight compute by one + // cycle, so the result lands in wb_data_r and forwards cleanly. Gated on + // cmp_valid (an AMO is computing), so it never fires for baseline traffic. + assign chain_stall = cmp_valid && valid_st1 && is_creq_st1 && (cmp_addr == addr_st1); + + // Invariants: a store-bearing AMO is only ever accepted into a free + // compute stage (the queue absorbs different-line writebacks behind it), + // and the writeback queue must never overflow. + `RUNTIME_ASSERT (~(do_store_st1 && ~pipe_stall && cmp_valid), + ("%t: AMO compute-stage overwrite (addr=0x%0h)", $time, addr_st1)) + `RUNTIME_ASSERT (~(wb_push && ~wb_coalesce && ~wb_fire && (wbq_count == WBQ_CNTW'(WBQ_SIZE))), + ("%t: AMO writeback queue overflow (addr=0x%0h)", $time, cmp_addr)) + assign wb_pending = wb_pending_r; + assign wb_addr = wb_addr_r; + assign wb_word_idx = wb_word_idx_r; + assign wb_byteen = wb_byteen_r; + assign wb_data = wb_data_r; + assign wb_tag = wb_tag_r; + assign wb_idx = wb_idx_r; + assign wb_attr = wb_attr_r; + + // passthrough outputs unused in this role + assign is_amo_fwd_st0 = 1'b0; + assign is_amo_fwd_st1 = 1'b0; + assign is_amo_replay_st1 = 1'b0; + assign is_passthru_fill_sel = 1'b0; + assign amo_ptw_word_st1 = '0; + assign req_input_defer = 1'b0; + + `UNUSED_VAR (amo_st0) // only amo_valid/amo_op are consumed at S0 + `UNUSED_VAR (is_replay_st0) + `UNUSED_VAR (is_replay_st1) + `UNUSED_VAR (word_idx_st0) + `UNUSED_VAR (addr_st0) + `UNUSED_VAR (mshr_allocate_st0) + `UNUSED_VAR (mshr_alloc_id_st0) + `UNUSED_VAR (mshr_id_st1) + `UNUSED_VAR (mem_rsp_fire) + `UNUSED_VAR (mem_rsp_id) + `UNUSED_VAR (mem_rsp_data) + `UNUSED_VAR (is_fill_sel) + `UNUSED_VAR (core_req_valid) + `UNUSED_VAR (core_req_is_amo) + `UNUSED_VAR (core_req_rw) + `UNUSED_VAR (core_req_addr) + `UNUSED_VAR (rw_st0) + `UNUSED_VAR (mshr_probe_pending_ld) + `UNUSED_VAR (mshr_probe_pending_amo) + end else begin : g_passthru + // ---------------------------------------------------------------- + // Non-LLC passthrough: forward downstream, replay the result word + // ---------------------------------------------------------------- + assign is_amo_fwd_st0 = amo_st0.amo_valid && valid_st0 && is_creq_st0 && ~is_replay_st0; + assign is_amo_fwd_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && ~is_replay_st1; + assign is_amo_replay_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && is_replay_st1; + + reg [MSHR_SIZE-1:0] ptw_flag; // entry awaits a passthru fill + reg [WORD_SEL_WIDTH-1:0] ptw_wsel [MSHR_SIZE]; + reg [WORD_WIDTH-1:0] ptw_word [MSHR_SIZE]; + + wire [WORDS_PER_LINE-1:0][WORD_WIDTH-1:0] mem_rsp_words = mem_rsp_data; + + assign is_passthru_fill_sel = is_fill_sel && ptw_flag[mem_rsp_id]; + assign amo_ptw_word_st1 = ptw_word[mshr_id_st1]; + + always @(posedge clk) begin + if (reset) begin + ptw_flag <= '0; + end else begin + // mark the AMO's MSHR entry on allocation + if (is_amo_fwd_st0 && mshr_allocate_st0 && ~pipe_stall) begin + ptw_flag[mshr_alloc_id_st0] <= 1'b1; + ptw_wsel[mshr_alloc_id_st0] <= word_idx_st0; + end + // latch the result word on the passthru fill, clear the flag + if (mem_rsp_fire && ptw_flag[mem_rsp_id]) begin + ptw_word[mem_rsp_id] <= mem_rsp_words[ptw_wsel[mem_rsp_id]]; + ptw_flag[mem_rsp_id] <= 1'b0; + end + end + end + + // catch a same-line request mid-allocation at S0 (not yet visible to + // the MSHR probe in the window between admit and allocate). + wire alloc_same_line = mshr_allocate_st0 && ~pipe_stall && (addr_st0 == core_req_addr); + wire st0_ld_alloc = alloc_same_line && ~amo_st0.amo_valid && ~rw_st0; + wire st0_amo_alloc = alloc_same_line && amo_st0.amo_valid; + + wire amo_input_defer = core_req_valid && core_req_is_amo + && (mshr_probe_pending_ld || st0_ld_alloc); + wire load_input_defer = core_req_valid && ~core_req_is_amo && ~core_req_rw + && (mshr_probe_pending_amo || st0_amo_alloc); + assign req_input_defer = amo_input_defer || load_input_defer; + + // commit outputs unused in this role + assign amo_hit_st1 = 1'b0; + assign commit_busy = 1'b0; + assign chain_stall = 1'b0; + assign wb_pending = 1'b0; + assign rsp_data = '0; + assign wb_addr = '0; + assign wb_word_idx = '0; + assign wb_byteen = '0; + assign wb_data = '0; + assign wb_tag = '0; + assign wb_idx = '0; + assign wb_attr = '0; + + `UNUSED_VAR (amo_st0) // only amo_valid gates the passthru path + `UNUSED_VAR (amo_st1) + `UNUSED_VAR (is_hit_st0) + `UNUSED_VAR (is_hit_st1) + `UNUSED_VAR (do_write_st1) + `UNUSED_VAR (read_word_st1) + `UNUSED_VAR (byteen_st1) + `UNUSED_VAR (write_word_st1) + `UNUSED_VAR (word_idx_st1) + `UNUSED_VAR (addr_st1) + `UNUSED_VAR (tag_st1) + `UNUSED_VAR (req_idx_st1) + `UNUSED_VAR (attr_st1) + `UNUSED_VAR (wb_fire) + end + +endmodule diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index 1a24470c38..8a6f75b327 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -62,12 +62,10 @@ module VX_cache_bank import VX_gpu_pkg::*; #( // Memory request output buffer (TO_OUT_BUF_* encoding) parameter MEM_OUT_BUF = 0, - // AMO: this bank belongs to the last-level cache. Reservation - // table + AMO commit logic synthesize only when 1. + // This bank is the last-level cache (AMOs commit locally here). parameter IS_LLC = 0, - // AMO: this bank supports atomic ops. Default 0 — AMO is generated - // away. Pulled in from the cache wrapper hierarchy. + // This bank supports atomic ops (AMO logic synthesizes only when 1). parameter AMO_ENABLE = 0, parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE), @@ -81,6 +79,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE output wire perf_read_miss, output wire perf_write_miss, + output wire perf_evictions, output wire perf_mshr_stall, `endif @@ -93,7 +92,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id) input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array - input wire [`UP(MEM_ATTR_WIDTH)-1:0] core_req_attr, + input wire [`UP(MEM_ATTR_WIDTH)-1:0] core_req_attr, output wire core_req_ready, // Core Response @@ -110,7 +109,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( output wire [LINE_SIZE-1:0] mem_req_byteen, output wire [`CS_LINE_WIDTH-1:0] mem_req_data, output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, - output wire [`UP(MEM_ATTR_WIDTH)-1:0] mem_req_attr, + output wire [`UP(MEM_ATTR_WIDTH)-1:0] mem_req_attr, input wire mem_req_ready, // Memory response @@ -131,8 +130,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( // straight through, so it is unused locally. `UNUSED_PARAM (MRSQ_SIZE) - // Extract the AMO sideband from the attr field at the fixed offset. - // amo_req_t is always defined; AMO_ENABLE gates the actual AMO logic. + // AMO sideband, extracted from the attr field (gated by AMO_ENABLE). amo_req_t core_req_amo; assign core_req_amo = AMO_ENABLE ? amo_req_t'(core_req_attr[MEM_ATTR_AMO_OFFS +: AMO_REQ_BITS]) @@ -190,15 +188,29 @@ module VX_cache_bank import VX_gpu_pkg::*; #( wire is_hit_st0, is_hit_st1; wire [`UP(MEM_ATTR_WIDTH)-1:0] attr_sel, attr_st0, attr_st1; amo_req_t amo_sel, amo_st0, amo_st1; - // Forward decls — assigned in the IS_LLC block / S1 commit logic. - // Needed earlier (in wb_fire gating) to suppress the synthetic - // writeback fire on cycles where another AMO is committing at S1 - // (chain or fresh-latch). Without this gate, wb_fire latches a - // stale amo_wb_data_r into pipe_reg0 while the chain branch updates - // it, losing the chained AMO's effect. - wire amo_hit_st1; - wire sc_fail_st1; - wire amo_do_store_st1; + + // AMO interconnect (driven by the VX_cache_amo engine, tied off when the + // bank carries no AMO logic). Declared here because the input arbitration + // and sel mux consume them ahead of the instantiation. + wire amo_hit_st1; // AMO commits locally at S1 (LLC) + wire amo_commit_busy; // LLC commit in flight + wire amo_chain_stall; // pace same-line chained AMO + wire amo_wb_pending; // synthetic writeback request live + wire [`CS_WORD_WIDTH-1:0] amo_rsp_data; // LLC AMO response word + wire [`CS_LINE_ADDR_WIDTH-1:0] amo_wb_addr; + wire [WORD_SEL_WIDTH-1:0] amo_wb_word_idx; + wire [WORD_SIZE-1:0] amo_wb_byteen; + wire [`CS_WORD_WIDTH-1:0] amo_wb_data; + wire [TAG_WIDTH-1:0] amo_wb_tag; + wire [REQ_SEL_WIDTH-1:0] amo_wb_idx; + wire [`UP(MEM_ATTR_WIDTH)-1:0] amo_wb_attr; + wire is_amo_fwd_st0; // non-LLC AMO first pass (S0) + wire is_amo_fwd_st1; // non-LLC AMO first pass (S1) + wire is_amo_replay_st1; // non-LLC AMO result replay + wire is_passthru_fill_sel; + wire [`CS_WORD_WIDTH-1:0] amo_ptw_word_st1; + wire req_input_defer; // non-LLC age-ordering hold + wire mshr_pending_raw_st0; wire mshr_pending_st0, mshr_pending_st1; wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1; @@ -235,7 +247,10 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .bank_empty (no_pending_req) ); - wire pipe_stall = crsp_queue_stall; + // amo_chain_stall paces a same-line AMO behind an in-flight commit by one + // cycle so the prior result reaches the writeback register; it is 0 for all + // non-AMO traffic, so the baseline pipe is unaffected. + wire pipe_stall = crsp_queue_stall || amo_chain_stall; // inputs arbitration: // mshr replay has highest priority to maximize utilization since there is no miss. @@ -251,43 +266,12 @@ module VX_cache_bank import VX_gpu_pkg::*; #( wire flush_enable = flush_grant && flush_valid; wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable; - // creq fires from real core_req OR from a pending AMO writeback - // (the synthetic write produced after an AMO commit at S1). The - // two paths are mutually exclusive: when amo_wb_pending=1 the sel - // mux feeds wb data, so a simultaneous real core_req cannot make - // it into pipe_reg0 — gate accordingly to avoid letting a stuck - // upstream core_req_valid pull the wb data into the pipe twice. - // When AMO_ENABLE=0 the writeback machinery is tied off so amo_wb_path=0. - // - // Age-ordering (non-LLC AMO configs only). Two symmetric holds at the - // input, both releasing when the in-flight entry retires: - // * an incoming AMO waits while a load fill is pending for its line, - // so its local invalidate lands on the installed (else stale) line; - // * an incoming plain load waits while an AMO passthrough is pending - // for its line, so the load observes that AMO (same-hart same- - // address program order). - // Also catch a same-line request that was just admitted and is - // allocating its MSHR entry at S0 this cycle — it is not yet visible - // to the MSHR probe (a 1-cycle window between admit and allocate). - wire amo_cfg = (AMO_ENABLE != 0) && (IS_LLC == 0); - wire st0_alloc_same_line = mshr_allocate_st0 && ~pipe_stall && (addr_st0 == core_req_addr); - wire st0_ld_alloc = st0_alloc_same_line && ~amo_st0.amo_valid && ~rw_st0; - wire st0_amo_alloc = st0_alloc_same_line && amo_st0.amo_valid; - wire amo_input_defer = amo_cfg && core_req_valid && core_req_amo.amo_valid - && (mshr_probe_pending_ld || st0_ld_alloc); - wire load_input_defer = amo_cfg && core_req_valid && ~core_req_amo.amo_valid && ~core_req_rw - && (mshr_probe_pending_amo || st0_amo_alloc); - wire req_input_defer = amo_input_defer || load_input_defer; - // The LLC AMO-commit writeback is single-outstanding. An AMO that will - // commit occupies the S0->S1->writeback path exclusively; admitting a - // second AMO (to another line) during that window lets it reach S1 while - // the writeback is busy, dropping its store. Block new admissions across - // the whole window: a committing AMO at S0 (predicted from its hit, non-LR), - // at S1 (do_store), or its writeback in flight. All terms are 0 at non-LLC - // banks (amo_hit/do_store gated to IS_LLC), so this only gates the LLC. - wire amo_do_store_st0 = (IS_LLC != 0) && amo_st0.amo_valid && valid_st0 - && is_creq_st0 && is_hit_st0 && (amo_st0.amo_op != AMO_OP_LR); - wire amo_commit_busy = amo_wb_pending || amo_do_store_st1 || amo_do_store_st0; + // creq fires from a real core_req or from a pending LLC AMO writeback + // (the synthetic write injected after a commit); the two are mutually + // exclusive. amo_commit_busy holds off new admits while a single- + // outstanding LLC commit is in flight; req_input_defer enforces non-LLC + // age-ordering. Both, plus amo_wb_pending/amo_hit_st1, are driven by the + // AMO engine below and tie to 0 when the bank carries no AMO logic. wire amo_creq_path = core_req_valid && ~amo_commit_busy && ~req_input_defer; wire amo_wb_path = amo_wb_pending && ~amo_hit_st1; wire creq_enable = creq_grant && (amo_creq_path || amo_wb_path); @@ -316,13 +300,8 @@ module VX_cache_bank import VX_gpu_pkg::*; #( wire replay_fire = replay_valid && replay_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; wire flush_fire = flush_valid && flush_ready; - // wb fire: held off when another AMO is committing at S1 this - // cycle (either chain or different-line). Without this gate the - // wb would latch the stale pre-chain amo_wb_data_r into pipe_reg0 - // while the chain branch updates the register, losing the chained - // AMO. core_req_fire uses the SAME gate so the wb either fires or - // stays out of the pipe — never gets captured while wb is - // suppressed (which would inject ghost wb's). + // amo_wb_path already excludes the cycle a fresh AMO commits at S1 + // (amo_hit_st1), so the writeback never races the chain update. wire amo_wb_fire = amo_wb_path && creq_grant && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall; wire core_req_fire = (amo_creq_path || amo_wb_path) && creq_grant @@ -350,70 +329,29 @@ module VX_cache_bank import VX_gpu_pkg::*; #( assign flush_tag = '0; end - // ============================================================ - // AMO writeback state machine - // ============================================================ - // When AMO commits at S1 with do_store, we need to write the - // computed new_word back into the cache line. cache_data writes - // fire at st0 (write addr/data arrive at st0, commit on next clk - // edge), but new_word isn't ready until st1. We close the timing - // gap by injecting a synthetic core-req-like write at sel that - // flows through st0/st1 normally. The bank stalls accepting new - // core_req during the writeback (1-2 cycles) so wb commits before - // the next AMO can race. - reg amo_wb_pending; - reg [`CS_LINE_ADDR_WIDTH-1:0] amo_wb_addr_r; - reg [WORD_SEL_WIDTH-1:0] amo_wb_word_idx_r; - reg [WORD_SIZE-1:0] amo_wb_byteen_r; - reg [`CS_WORD_WIDTH-1:0] amo_wb_data_r; - reg [TAG_WIDTH-1:0] amo_wb_tag_r; - reg [REQ_SEL_WIDTH-1:0] amo_wb_idx_r; - reg [`UP(MEM_ATTR_WIDTH)-1:0] amo_wb_attr_r; - // Post-wb forwarding: cache_data writes at S0 of the wb, so an AMO - // whose S0-read fired before the wb's S0 still has pre-wb data in - // read_data_st1. Keep amo_wb_data_r/addr_r forwardable for 2 more - // cycles after amo_wb_fire so those in-flight AMOs see the fresh - // value, and re-arm pending on chain so a follow-up wb commits the - // chained value to BRAM. - reg [1:0] amo_post_wb_age; - wire amo_post_wb_valid = (amo_post_wb_age != 2'd0); - `ifndef SC_FAIL_ST1_DECLARED - `define SC_FAIL_ST1_DECLARED - `endif - + // Input arbitration mux. The AMO writeback fields tie to 0 when no LLC + // commit engine is present, so the wb arms prune away for non-AMO banks. assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; - // The amo_wb_* paths collapse to 0 when AMO_ENABLE=0 (the writeback - // FSM never fires), so the same mux works whether or not AMOs are - // enabled — the synthesizer prunes the dead arms. assign rw_sel = replay_valid ? replay_rw : (amo_wb_pending ? 1'b1 : core_req_rw); assign byteen_sel = replay_valid ? replay_byteen - : (amo_wb_pending ? amo_wb_byteen_r : core_req_byteen); + : (amo_wb_pending ? amo_wb_byteen : core_req_byteen); assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : - (amo_wb_pending ? amo_wb_addr_r : core_req_addr))); + (amo_wb_pending ? amo_wb_addr : core_req_addr))); assign word_idx_sel= replay_valid ? replay_wsel - : (amo_wb_pending ? amo_wb_word_idx_r : core_req_wsel); + : (amo_wb_pending ? amo_wb_word_idx : core_req_wsel); assign req_idx_sel = replay_valid ? replay_idx - : (amo_wb_pending ? amo_wb_idx_r : core_req_idx); + : (amo_wb_pending ? amo_wb_idx : core_req_idx); assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) : (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : - (amo_wb_pending ? amo_wb_tag_r : core_req_tag))); - assign attr_sel = amo_wb_pending ? amo_wb_attr_r + (amo_wb_pending ? amo_wb_tag : core_req_tag))); + assign attr_sel = amo_wb_pending ? amo_wb_attr : (core_req_valid ? core_req_attr : '0); - // AMO sideband at sel: - // * writeback synthetic creq: amo.valid=0 (plain write, no - // double-commit at S1). - // * replay: amo carried through MSHR — allocate-time amo_st0 is - // restored on dequeue, so the AMO re-enters the pipe with the - // same op/width/rhs/hart_id and commits on the now-warm line. - // * real core_req: amo from the LSU sideband. - // - // Priority must match the sel mux's source priority (replay > wb > - // core_req): replay can fire during a pending wb (chained AMO - // replays from MSHR after a fill), so replay_amo must NOT be - // clobbered to zero by amo_wb_pending. A pending wb only zeros - // amo_sel when it actually wins arbitration (no replay/core_req). + // AMO sideband priority must match the sel mux (replay > wb > core_req): + // a replay can fire during a pending wb (chained AMO replays from MSHR + // after a fill), so it must not be cleared by amo_wb_pending. The + // synthetic writeback carries amo.valid=0 so it never re-commits at S1. assign amo_sel = replay_valid ? replay_amo : (amo_wb_pending ? amo_req_t'('0) : (core_req_valid ? core_req_amo : amo_req_t'('0))); @@ -423,7 +361,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( if (i < `CS_WORD_WIDTH) begin : g_lo assign data_sel[i] = replay_valid ? replay_data[i] : (mem_rsp_valid ? mem_rsp_data[i] : - (amo_wb_pending ? amo_wb_data_r[i] : core_req_data[i])); + (amo_wb_pending ? amo_wb_data[i] : core_req_data[i])); end else begin : g_hi assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel end @@ -432,9 +370,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( assign data_sel = mem_rsp_data; `UNUSED_VAR (core_req_data) `UNUSED_VAR (replay_data) - // icache (WRITE_ENABLE=0) doesn't read amo_wb_data_r since the - // wb data only flows through the writable g_data_sel branch. - `UNUSED_VAR (amo_wb_data_r) + `UNUSED_VAR (amo_wb_data) // read-only banks have no writeback data end if (UUID_WIDTH != 0) begin : g_req_uuid_sel @@ -450,19 +386,14 @@ module VX_cache_bank import VX_gpu_pkg::*; #( wire is_replay_sel = replay_enable; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH - + AMO_REQ_BITS), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + AMO_REQ_BITS), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, is_passthru_fill_sel, attr_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id - , amo_sel - }), - .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_passthru_fill_st0, attr_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0 - , amo_st0 - }) + .data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, is_passthru_fill_sel, attr_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id, amo_sel}), + .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_passthru_fill_st0, attr_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0, amo_st0}) ); if (UUID_WIDTH != 0) begin : g_req_uuid_st0 @@ -567,19 +498,14 @@ module VX_cache_bank import VX_gpu_pkg::*; #( assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_WORD_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 - + AMO_REQ_BITS), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_WORD_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + AMO_REQ_BITS), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, attr_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, write_word_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0 - , amo_st0 - }), - .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, attr_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, write_word_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1 - , amo_st1 - }) + .data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, attr_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, write_word_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0, amo_st0}), + .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, attr_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, write_word_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1, amo_st1}) ); if (UUID_WIDTH != 0) begin : g_req_uuid_st1 @@ -674,8 +600,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( .MSHR_SIZE (MSHR_SIZE), .WRITEBACK (WRITEBACK), .AMO_ENABLE ((AMO_ENABLE != 0) && (IS_LLC == 0)), - .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH - + AMO_REQ_BITS) + .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH + AMO_REQ_BITS) ) cache_mshr ( .clk (clk), .reset (reset), @@ -691,16 +616,14 @@ module VX_cache_bank import VX_gpu_pkg::*; #( // probe: pending entries for the incoming request's line, by type. .probe_addr (core_req_addr), - .probe_pending_ld (mshr_probe_pending_ld), + .probe_pending_ld (mshr_probe_pending_ld), .probe_pending_amo (mshr_probe_pending_amo), // dequeue .dequeue_valid (replay_valid), .dequeue_addr (replay_addr), .dequeue_rw (replay_rw), - .dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx - , replay_amo - }), + .dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx, replay_amo}), .dequeue_id (replay_id), .dequeue_ready (replay_ready), @@ -712,9 +635,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( // round-trip). At the LLC, same-line AMOs coalesce and serialize // their commits on the single filled line. .allocate_is_amo((AMO_ENABLE && !IS_LLC) ? amo_st0.amo_valid : 1'b0), - .allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0 - , amo_st0 - }), + .allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0, amo_st0}), .allocate_id (mshr_alloc_id_st0), .allocate_pending(mshr_pending_raw_st0), .allocate_previd(mshr_previd_st0), @@ -729,190 +650,102 @@ module VX_cache_bank import VX_gpu_pkg::*; #( ); // ============================================================ - // Non-LLC AMO passthrough - // ============================================================ - // An AMO at a non-LLC bank is forwarded downstream (the LLC performs - // the RMW) and its result word is routed back to the requester — it is - // never committed locally. We reuse the miss->fill->replay path with - // three twists, all gated to non-LLC AMO entries: - // 1) the AMO never coalesces (MSHR amo_table), - // 2) its fill does NOT install a line (the response is a result - // word, not a cacheable line), and - // 3) the replay emits the latched result word as the core response. - wire is_amo_fwd_st0; // AMO first pass at a non-LLC bank (st0) - wire is_amo_fwd_st1; // AMO first pass at a non-LLC bank (st1) - wire is_amo_replay_st1; // replay of a forwarded AMO (carries result) - wire is_passthru_fill_sel; // incoming fill targets a passthru entry - wire [`CS_WORD_WIDTH-1:0] amo_ptw_word_st1; // latched result word @ st1 - - if ((AMO_ENABLE != 0) && (IS_LLC == 0)) begin : g_amo_ptw - assign is_amo_fwd_st0 = amo_st0.amo_valid && valid_st0 && is_creq_st0 && ~is_replay_st0; - assign is_amo_fwd_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && ~is_replay_st1; - assign is_amo_replay_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && is_replay_st1; - - reg [MSHR_SIZE-1:0] amo_ptw_flag; // entry awaits a passthru fill - reg [WORD_SEL_WIDTH-1:0] amo_ptw_wsel [MSHR_SIZE]; - reg [`CS_WORD_WIDTH-1:0] amo_ptw_word [MSHR_SIZE]; - - wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] mem_rsp_words = mem_rsp_data; - - assign is_passthru_fill_sel = is_fill_sel && amo_ptw_flag[mem_rsp_id]; - assign amo_ptw_word_st1 = amo_ptw_word[mshr_id_st1]; - - always @(posedge clk) begin - if (reset) begin - amo_ptw_flag <= '0; - end else begin - // mark the AMO's MSHR entry on allocation - if (is_amo_fwd_st0 && mshr_allocate_st0 && ~pipe_stall) begin - amo_ptw_flag[mshr_alloc_id_st0] <= 1'b1; - amo_ptw_wsel[mshr_alloc_id_st0] <= word_idx_st0; - end - // latch the result word on the passthru fill, clear the flag - if (mem_rsp_fire && amo_ptw_flag[mem_rsp_id]) begin - amo_ptw_word[mem_rsp_id] <= mem_rsp_words[amo_ptw_wsel[mem_rsp_id]]; - amo_ptw_flag[mem_rsp_id] <= 1'b0; - end - end - end - end else begin : g_no_amo_ptw - assign is_amo_fwd_st0 = 1'b0; - assign is_amo_fwd_st1 = 1'b0; - assign is_amo_replay_st1 = 1'b0; - assign is_passthru_fill_sel = 1'b0; - assign amo_ptw_word_st1 = '0; - end - - // Force the AMO requester non-pending so it never coalesces onto a - // prior (plain-read) entry for the same line — each atomic must take - // its own downstream round-trip. - assign mshr_pending_st0 = mshr_pending_raw_st0 && ~is_amo_fwd_st0; - - // Passthru replay is treated as a hit (its line was never installed): - // fires the core response, allocates no mreq, releases the MSHR entry. - wire eff_hit_st1 = is_hit_st1 || is_amo_replay_st1; - - // ============================================================ - // AMO commit path + // AMO engine // ============================================================ - // - // VX_amo_unit instantiated only at the LLC bank. The compute - // kernel is combinational (read line word at S1 → ret_word / - // new_word). `res_check` from the reservation table drives the SC - // outcome. Compute outputs feed the response data mux below. - // - // Response data mux: when amo.valid && hit at S1, override - // crsp_queue_data with amo_alu.ret_word (LR/AMO* old value at - // byte offset) or with 0/1 for SC. - wire [63:0] amo_compute_new; - wire [63:0] amo_compute_ret; - wire amo_res_check; - // Bit-offset of the AMO target within the cache word. - // Set inside the IS_LLC generate; tied to 0 for non-LLC banks. - wire [`CLOG2(`CS_WORD_WIDTH)-1:0] amo_bit_off_st1; - // Forward in-flight wb back into compute_old when a new AMO targets - // the same line. Driven inside the IS_LLC generate. - wire amo_fwd_active_st1; - - if (IS_LLC) begin : g_amo_unit - // Byte-offset alignment: the AMO target may sit at a non-zero - // byte offset within the cache word. byteen_st1 has the targeted - // bytes set; the lowest set bit gives byte_off. We shift the - // cache word right by byte_off*8 to expose the operand at bit 0, - // and shift compute results back to that offset for the response - // and writeback. - localparam BYTE_OFF_BITS = `CLOG2(WORD_SIZE); - localparam BIT_OFF_BITS = `CLOG2(`CS_WORD_WIDTH); - localparam AMO_OLD_BITS = (`CS_WORD_WIDTH < 64) ? `CS_WORD_WIDTH : 64; - - wire [`UP(BYTE_OFF_BITS)-1:0] amo_byte_off_st1; - VX_priority_encoder #( - .N (WORD_SIZE) - ) amo_byte_off_enc ( - .data_in (byteen_st1), - .index_out (amo_byte_off_st1), - `UNUSED_PIN (valid_out), - `UNUSED_PIN (onehot_out) - ); - assign amo_bit_off_st1 = BIT_OFF_BITS'({amo_byte_off_st1, 3'b0}); - - // Forward amo_wb_data_r when an in-flight writeback (or a - // recently-fired one) targets the same line. cache_data isn't - // updated until the synthetic wb fires at S0; an AMO whose S0 - // read pre-dates the wb's S0 still sees pre-wb data in - // read_data_st1. amo_post_wb_valid extends forwarding for 2 - // cycles past amo_wb_fire to cover that window. - assign amo_fwd_active_st1 = (amo_wb_pending || amo_post_wb_valid) - && (amo_wb_addr_r == addr_st1); - wire [`CS_WORD_WIDTH-1:0] amo_line_word_raw_st1 = read_data_st1[word_idx_st1]; - wire [`CS_WORD_WIDTH-1:0] amo_line_word_st1 = - amo_fwd_active_st1 ? amo_wb_data_r : amo_line_word_raw_st1; - wire [`CS_WORD_WIDTH-1:0] amo_line_word_shifted_st1 = - amo_line_word_st1 >> amo_bit_off_st1; - // Width / rhs are no longer carried in amo_req_t (slim form). Derive - // width from byteen popcount (.W → 4 bytes set, .D → 8) and rhs by - // shifting the request data word by the AMO bit-offset like the - // old line word. AMO operands top out at .D (64 bits); upper bits - // of the shifted cache word are unused. - wire [1:0] amo_width_st1 = ($countones(byteen_st1) == 8) ? 2'd3 : 2'd2; - wire [`CS_WORD_WIDTH-1:0] amo_rhs_word_shifted_st1 = write_word_st1 >> amo_bit_off_st1; - wire [63:0] amo_rhs_st1 = (amo_width_st1 == 2'd2) - ? 64'({32'h0, amo_rhs_word_shifted_st1[31:0]}) - : 64'(amo_rhs_word_shifted_st1[AMO_OLD_BITS-1:0]); - wire [63:0] amo_compute_old = (amo_width_st1 == 2'd2) - ? 64'({32'h0, amo_line_word_shifted_st1[31:0]}) - : 64'(amo_line_word_shifted_st1[AMO_OLD_BITS-1:0]); - if (`CS_WORD_WIDTH > 64) begin : g_amo_upper_unused - `UNUSED_VAR (amo_line_word_shifted_st1[`CS_WORD_WIDTH-1:64]) - `UNUSED_VAR (amo_rhs_word_shifted_st1[`CS_WORD_WIDTH-1:64]) - end + wire [`CS_WORD_WIDTH-1:0] read_word_st1 = read_data_st1[word_idx_st1]; - // Drive reservation activity from S1 commit conditions. - // All three fire from the original AMO at S1 (amo_st1.hart_id - // is valid only there, not on the writeback cycle). - wire amo_res_reserve_w = amo_hit_st1 - && (amo_st1.amo_op == AMO_OP_LR); - wire amo_res_clear_w = amo_hit_st1 - && (amo_st1.amo_op == AMO_OP_SC); - // Break other harts' reservations on every committed write to the - // line, not just AMO stores: a plain store from another hart must - // fail a racing SC. AMOs ride the load path (rw=0), so do_write_st1 - // captures plain stores only; amo_st1.hart_id is the requester's id - // (the LSU sets it for every lane) so the writer is excluded. - wire amo_res_invalidate_w = amo_do_store_st1 || do_write_st1; - - VX_amo_unit #( + if (AMO_ENABLE) begin : g_amo + VX_cache_amo #( + .IS_LLC (IS_LLC), .NUM_RES_ENTRIES (`VX_CFG_AMO_RS_SIZE), - .LINE_ADDR_BITS (`CS_LINE_ADDR_WIDTH) - ) amo_unit ( - .clk (clk), - .reset (reset), - .compute_op (amo_st1.amo_op), - .compute_amo_unsigned (amo_st1.amo_unsigned), - .compute_width (amo_width_st1), - .compute_old (amo_compute_old), - .compute_rhs (amo_rhs_st1), - .compute_new_word (amo_compute_new), - .compute_ret_word (amo_compute_ret), - .res_reserve (amo_res_reserve_w), - .res_clear (amo_res_clear_w), - .res_invalidate(amo_res_invalidate_w), - .res_hart_id (amo_st1.hart_id), - .res_line_addr (addr_st1), - .res_check (amo_res_check) + .LINE_ADDR_BITS (`CS_LINE_ADDR_WIDTH), + .WORD_WIDTH (`CS_WORD_WIDTH), + .WORD_SIZE (WORD_SIZE), + .WORD_SEL_WIDTH (WORD_SEL_WIDTH), + .TAG_WIDTH (TAG_WIDTH), + .REQ_SEL_WIDTH (REQ_SEL_WIDTH), + .ATTR_WIDTH (`UP(MEM_ATTR_WIDTH)), + .MSHR_SIZE (MSHR_SIZE), + .MSHR_ADDR_WIDTH (MSHR_ADDR_WIDTH), + .WORDS_PER_LINE (`CS_WORDS_PER_LINE) + ) amo ( + .clk (clk), + .reset (reset), + .pipe_stall (pipe_stall), + .amo_st0 (amo_st0), + .valid_st0 (valid_st0), + .is_creq_st0 (is_creq_st0), + .is_hit_st0 (is_hit_st0), + .is_replay_st0 (is_replay_st0), + .amo_st1 (amo_st1), + .valid_st1 (valid_st1), + .is_creq_st1 (is_creq_st1), + .is_hit_st1 (is_hit_st1), + .is_replay_st1 (is_replay_st1), + .do_write_st1 (do_write_st1), + .read_word_st1 (read_word_st1), + .byteen_st1 (byteen_st1), + .write_word_st1 (write_word_st1), + .word_idx_st0 (word_idx_st0), + .word_idx_st1 (word_idx_st1), + .addr_st0 (addr_st0), + .addr_st1 (addr_st1), + .tag_st1 (tag_st1), + .req_idx_st1 (req_idx_st1), + .attr_st1 (attr_st1), + .wb_fire (amo_wb_fire), + .mshr_allocate_st0 (mshr_allocate_st0), + .mshr_alloc_id_st0 (mshr_alloc_id_st0), + .mshr_id_st1 (mshr_id_st1), + .mem_rsp_fire (mem_rsp_fire), + .mem_rsp_id (mem_rsp_id), + .mem_rsp_data (mem_rsp_data), + .is_fill_sel (is_fill_sel), + .core_req_valid (core_req_valid), + .core_req_is_amo (core_req_amo.amo_valid), + .core_req_rw (core_req_rw), + .core_req_addr (core_req_addr), + .rw_st0 (rw_st0), + .mshr_probe_pending_ld (mshr_probe_pending_ld), + .mshr_probe_pending_amo (mshr_probe_pending_amo), + .amo_hit_st1 (amo_hit_st1), + .commit_busy (amo_commit_busy), + .chain_stall (amo_chain_stall), + .wb_pending (amo_wb_pending), + .rsp_data (amo_rsp_data), + .wb_addr (amo_wb_addr), + .wb_word_idx (amo_wb_word_idx), + .wb_byteen (amo_wb_byteen), + .wb_data (amo_wb_data), + .wb_tag (amo_wb_tag), + .wb_idx (amo_wb_idx), + .wb_attr (amo_wb_attr), + .is_amo_fwd_st0 (is_amo_fwd_st0), + .is_amo_fwd_st1 (is_amo_fwd_st1), + .is_amo_replay_st1 (is_amo_replay_st1), + .is_passthru_fill_sel (is_passthru_fill_sel), + .amo_ptw_word_st1 (amo_ptw_word_st1), + .req_input_defer (req_input_defer) ); - end else begin : g_no_amo_unit - // Non-LLC banks: AMO commit machinery is not synthesized. - // Sink the conditionally-unused inputs so lint stays clean. - assign amo_compute_new = '0; - assign amo_compute_ret = '0; - assign amo_res_check = 1'b0; - assign amo_bit_off_st1 = '0; - assign amo_fwd_active_st1 = 1'b0; + end else begin : g_no_amo + assign {amo_hit_st1, amo_commit_busy, amo_wb_pending, amo_chain_stall} = '0; + assign {amo_rsp_data, amo_wb_addr, amo_wb_word_idx, amo_wb_byteen} = '0; + assign {amo_wb_data, amo_wb_tag, amo_wb_idx, amo_wb_attr} = '0; + assign {is_amo_fwd_st0, is_amo_fwd_st1, is_amo_replay_st1} = '0; + assign {is_passthru_fill_sel, amo_ptw_word_st1, req_input_defer} = '0; `UNUSED_VAR (amo_st1) - `UNUSED_VAR (addr_st1) + `UNUSED_VAR (amo_wb_fire) + `UNUSED_VAR (mshr_probe_pending_ld) + `UNUSED_VAR (mshr_probe_pending_amo) end - `UNUSED_VAR (amo_compute_new) + + // Force the AMO requester non-pending so it never coalesces onto a prior + // entry for the same line — each atomic takes its own downstream trip. + assign mshr_pending_st0 = mshr_pending_raw_st0 && ~is_amo_fwd_st0; + + // Passthru replay counts as a hit (its line was never installed): fires + // the core response, allocates no mreq, releases the MSHR entry. + wire eff_hit_st1 = is_hit_st1 || is_amo_replay_st1; // schedule core response @@ -921,90 +754,20 @@ module VX_cache_bank import VX_gpu_pkg::*; #( wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx; wire [TAG_WIDTH-1:0] crsp_queue_tag; - // Fire response for AMO requests on hit at S1. AMO bits are carried - // through the MSHR so a replay re-enters with valid amo fields and - // commits on the freshly filled line. - // Local AMO commit happens only at the LLC. Non-LLC banks forward the - // AMO downstream (passthrough) and never commit/writeback it locally. - assign amo_hit_st1 = amo_st1.amo_valid && is_hit_st1 && valid_st1 && is_creq_st1 && (IS_LLC != 0); - assign sc_fail_st1 = (amo_st1.amo_op == AMO_OP_SC) && ~amo_res_check; - - // do_store: AMO commit other than LR or sc_fail. Triggers the - // writeback state machine to inject a synthetic write at sel. - assign amo_do_store_st1 = amo_hit_st1 - && (amo_st1.amo_op != AMO_OP_LR) - && ~sc_fail_st1; - - // Writeback data: byteen is the AMO request's own byteen, - // already at the correct offset within the cache word. - wire [WORD_SIZE-1:0] amo_wb_byteen_w = byteen_st1; - // Place new_word at the byte-offset slot within the cache word. - wire [`CS_WORD_WIDTH-1:0] amo_wb_data_w = - `CS_WORD_WIDTH'(amo_compute_new) << amo_bit_off_st1; - - always @(posedge clk) begin - if (reset) begin - amo_wb_pending <= 1'b0; - amo_post_wb_age <= 2'd0; - end else begin - // amo_post_wb_age counts down from 2 after amo_wb_fire, - // keeping amo_fwd_active alive across the BRAM settle window. - if (amo_wb_fire) begin - amo_post_wb_age <= 2'd2; - end else if (amo_post_wb_valid) begin - amo_post_wb_age <= amo_post_wb_age - 2'd1; - end - - if (amo_do_store_st1 && ~pipe_stall && ~amo_wb_pending && ~amo_fwd_active_st1) begin - // Fresh wb: no in-flight or recently-fired wb on this line. - amo_wb_pending <= 1'b1; - amo_wb_addr_r <= addr_st1; - amo_wb_word_idx_r <= word_idx_st1; - amo_wb_byteen_r <= amo_wb_byteen_w; - amo_wb_data_r <= amo_wb_data_w; - amo_wb_tag_r <= tag_st1; - amo_wb_idx_r <= req_idx_st1; - amo_wb_attr_r <= attr_st1; - end else if (amo_do_store_st1 && ~pipe_stall && amo_fwd_active_st1) begin - // Chain into existing wb on the same line. amo_compute_new - // was computed against the forwarded value, so overwriting - // folds this AMO's effect into the next wb. Re-arm pending - // so the chained value commits to BRAM. - amo_wb_data_r <= amo_wb_data_w; - amo_wb_pending <= 1'b1; - end else if (amo_wb_fire) begin - amo_wb_pending <= 1'b0; - end - end - end - // crsp_queue fires for reads and AMO commits at S1 on hit, but not // for the synthetic writeback write (rw=1). A non-LLC AMO's first // pass forwards downstream and must NOT respond locally; its result // returns later via the passthru replay (eff_hit covers that replay). - assign crsp_queue_valid = do_read_st1 && eff_hit_st1 && ~is_amo_fwd_st1; + // Suppress the response while a same-line AMO is chain-stalled at S1, so a + // read held for the extra pacing cycle enqueues its response exactly once + // (it fires when the op advances). amo_chain_stall is 0 for non-AMO traffic. + assign crsp_queue_valid = do_read_st1 && eff_hit_st1 && ~is_amo_fwd_st1 && ~amo_chain_stall; assign crsp_queue_idx = req_idx_st1; - // Response data mux: - // * AMO-SC at hit: 0 (success) or 1 (fail). - // * AMO-other at hit: compute_ret_word (old value). LSU sexts to XLEN. - // * Plain load: read_data_st1[word_idx_st1]. - // When AMO_ENABLE=0, amo_hit_st1 is tied off and the AMO arms collapse. - // amo_rsp_word is 64-bit wide for .D AMOs; upper bits truncated by cast. - wire [63:0] amo_rsp_word = (amo_st1.amo_op == AMO_OP_SC) - ? {63'h0, sc_fail_st1} - : amo_compute_ret; - if (`CS_WORD_WIDTH < 64) begin : g_amo_rsp_upper_unused - `UNUSED_VAR (amo_rsp_word[63:`CS_WORD_WIDTH]) - end - // Place the AMO response at the byte-offset slot the LSU extracts. - // Without this shift the response only lands correctly for line-aligned addresses. - wire [`CS_WORD_WIDTH-1:0] amo_rsp_aligned = - `CS_WORD_WIDTH'(amo_rsp_word) << amo_bit_off_st1; - // A passthru-AMO replay returns the result word latched from the - // downstream response (no installed line to read). + // Response data: passthru replay returns the latched downstream result, + // an LLC AMO commit returns its formatted result word, else plain load. assign crsp_queue_data = is_amo_replay_st1 ? amo_ptw_word_st1 - : (amo_hit_st1 ? amo_rsp_aligned - : read_data_st1[word_idx_st1]); + : (amo_hit_st1 ? amo_rsp_data + : read_word_st1); assign crsp_queue_tag = tag_st1; VX_elastic_buffer #( @@ -1088,8 +851,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #( end end else begin : g_mreq_queue_ro // issue a fill request on a read miss - assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) - && ~pipe_stall; + assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) && ~pipe_stall; assign mreq_queue_addr = addr_st1; assign mreq_queue_rw = 0; assign mreq_queue_data = '0; @@ -1138,10 +900,10 @@ module VX_cache_bank import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE assign perf_read_miss = do_read_st1 && ~is_hit_st1; assign perf_write_miss = do_write_st1 && ~is_hit_st1; + assign perf_evictions = do_writeback_st1; // dirty-line writeback eviction assign perf_mshr_stall = mshr_alm_full; `endif - `ifdef DBG_TRACE_CACHE wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready; wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid) diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 6988f4b7f7..42772ca02b 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -65,6 +65,7 @@ `PERF_COUNTER_ADD (dst, src, writes, PERF_CTR_BITS, count, (count > 1)) \ `PERF_COUNTER_ADD (dst, src, read_misses, PERF_CTR_BITS, count, (count > 1)) \ `PERF_COUNTER_ADD (dst, src, write_misses, PERF_CTR_BITS, count, (count > 1)) \ + `PERF_COUNTER_ADD (dst, src, evictions, PERF_CTR_BITS, count, (count > 1)) \ `PERF_COUNTER_ADD (dst, src, bank_stalls, PERF_CTR_BITS, count, (count > 1)) \ `PERF_COUNTER_ADD (dst, src, mshr_stalls, PERF_CTR_BITS, count, (count > 1)) \ `PERF_COUNTER_ADD (dst, src, mem_stalls, PERF_CTR_BITS, count, (count > 1)) \ diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 3f4fdcb061..f5511cc2fd 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -262,6 +262,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( assign cache_perf.writes = perf_core_writes; assign cache_perf.read_misses = '0; assign cache_perf.write_misses = '0; + assign cache_perf.evictions = '0; assign cache_perf.bank_stalls = '0; assign cache_perf.mshr_stalls = '0; assign cache_perf.mem_stalls = perf_mem_stalls; diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 4856437b00..696441f858 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -293,9 +293,7 @@ import VX_fpu_pkg::*; // PERF: branches `CSR_READ_64(`VX_CSR_MPM_BRANCHES, read_data_ro_w, pipeline_perf.sched.branches); `CSR_READ_64(`VX_CSR_MPM_DIVERGENCE, read_data_ro_w, pipeline_perf.sched.divergence); - // PERF: memory - `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads); - `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes); + // PERF: memory (core-issued requests; DRAM traffic is in the MEM class) `CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches); `CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads); `CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores); @@ -304,57 +302,71 @@ import VX_fpu_pkg::*; default:; endcase end - `ifdef VX_CFG_VM_ENABLE - `VX_DCR_MPM_CLASS_VM: begin + `VX_DCR_MPM_CLASS_ICACHE: begin case (read_addr) - // PERF: VM/MMU (icache + dcache MMU summed) - `CSR_READ_64(`VX_CSR_MPM_TLB_READS, read_data_ro_w, pipeline_perf.mmu.tlb_reads); - `CSR_READ_64(`VX_CSR_MPM_TLB_HITS, read_data_ro_w, pipeline_perf.mmu.tlb_hits); - `CSR_READ_64(`VX_CSR_MPM_TLB_MISSES, read_data_ro_w, pipeline_perf.mmu.tlb_misses); - `CSR_READ_64(`VX_CSR_MPM_TLB_EVICTS, read_data_ro_w, pipeline_perf.mmu.tlb_evictions); - `CSR_READ_64(`VX_CSR_MPM_PTW_WALKS, read_data_ro_w, pipeline_perf.mmu.ptw_walks); - `CSR_READ_64(`VX_CSR_MPM_PTW_LATENCY, read_data_ro_w, pipeline_perf.mmu.ptw_latency); + `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads); + `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses); + `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls); default:; endcase end - `endif - `VX_DCR_MPM_CLASS_MEM: begin + `VX_DCR_MPM_CLASS_DCACHE: begin case (read_addr) - // PERF: icache - `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads); - `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses); - `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls); - // PERF: dcache `CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads); `CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses); + `CSR_READ_64(`VX_CSR_MPM_DCACHE_EVICTS, read_data_ro_w, sysmem_perf.dcache.evictions); `CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls); - // PERF: lmem - `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads); - `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes); - `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls); - // PERF: l2cache + default:; + endcase + end + `VX_DCR_MPM_CLASS_L2CACHE: begin + case (read_addr) `CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_EVICTS, read_data_ro_w, sysmem_perf.l2cache.evictions); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls); - // PERF: l3cache + default:; + endcase + end + `VX_DCR_MPM_CLASS_L3CACHE: begin + case (read_addr) `CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_EVICTS, read_data_ro_w, sysmem_perf.l3cache.evictions); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls); - // PERF: memory + default:; + endcase + end + `VX_DCR_MPM_CLASS_MEM: begin + case (read_addr) + // PERF: off-chip memory `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads); `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes); `CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency); + // PERF: lmem + `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads); + `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes); + `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls); // PERF: coalescer `CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses); + `ifdef VX_CFG_VM_ENABLE + // PERF: VM/MMU (icache + dcache MMU summed) + `CSR_READ_64(`VX_CSR_MPM_TLB_READS, read_data_ro_w, pipeline_perf.mmu.tlb_reads); + `CSR_READ_64(`VX_CSR_MPM_TLB_HITS, read_data_ro_w, pipeline_perf.mmu.tlb_hits); + `CSR_READ_64(`VX_CSR_MPM_TLB_MISSES, read_data_ro_w, pipeline_perf.mmu.tlb_misses); + `CSR_READ_64(`VX_CSR_MPM_TLB_EVICTS, read_data_ro_w, pipeline_perf.mmu.tlb_evictions); + `CSR_READ_64(`VX_CSR_MPM_PTW_WALKS, read_data_ro_w, pipeline_perf.mmu.ptw_walks); + `CSR_READ_64(`VX_CSR_MPM_PTW_LATENCY, read_data_ro_w, pipeline_perf.mmu.ptw_latency); + `endif default:; endcase end diff --git a/hw/scripts/saif_filter.py b/hw/scripts/saif_filter.py new file mode 100755 index 0000000000..9b36f14a15 --- /dev/null +++ b/hw/scripts/saif_filter.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +# Extract an instance subtree from a Verilator master SAIF and re-root it under a +# synthesis top module, so the result can be annotated directly by the hw/syn flows +# (Vivado read_saif, Synopsys read_saif, OpenSTA read_saif). +# +# The master SAIF produced by Verilator (--trace-saif) is rooted at the simulation +# top (e.g. TOP/.../core/execute/tcu_unit/...) and contains the full design. The +# synthesis DUT is a standalone wrapper (e.g. VX_tcu_unit_top) whose top-level child +# instance matches the extracted scope. This tool slices the requested subtree +# verbatim -- preserving every nested generate scope and net, regardless of tile size +# or NUM_THREADS -- and wraps it under the named top module. +# +# Usage: +# saif_filter.py --instance [--top ] [-o out.saif] master.saif +# saif_filter.py --list [--list-depth N] master.saif +# +# --instance slash-separated instance path to extract. Matched as a suffix of the +# full hierarchy path, so "execute/tcu_unit" or just "tcu_unit" both work. +# --top wrapper instance name placed above the extracted subtree (and written +# as the SAIF DESIGN). Omit to emit the extracted instance as the root. +# --all extract every match instead of only the first (names disambiguated). +# --list print the instance hierarchy (for discovering paths) and exit. + +import argparse +import re +import sys + +INSTANCE_RE = re.compile(r'^(\s*)\(INSTANCE\s+(\S+)\s*$') +HEADER_KEYS = ('DIVIDER', 'TIMESCALE', 'DURATION', 'DIRECTION', 'SAIFVERSION') + + +def paren_delta(line): + """Net change in paren depth on a line, ignoring anything inside double quotes.""" + delta = 0 + in_str = False + for c in line: + if c == '"': + in_str = not in_str + elif in_str: + continue + elif c == '(': + delta += 1 + elif c == ')': + delta -= 1 + return delta + + +def iter_instances(lines): + """Yield (start_idx, path) for every (INSTANCE ...) opener, tracking hierarchy. + + Depth is the running paren balance *before* the line. An instance opens one paren + on its own line, so its body lives one level deeper and closes when the balance + returns to the opener's level. + """ + balance = 0 + stack = [] # (name, level) for currently-open instances + for idx, line in enumerate(lines): + while stack and balance <= stack[-1][1]: + stack.pop() + m = INSTANCE_RE.match(line) + if m: + name = m.group(2) + path = [s[0] for s in stack] + [name] + yield idx, path + stack.append((name, balance)) + balance += paren_delta(line) + + +def capture_subtree(lines, start_idx): + """Return the list of lines forming the instance subtree opened at start_idx.""" + depth = 0 + out = [] + for line in lines[start_idx:]: + out.append(line) + depth += paren_delta(line) + if depth == 0: + break + else: + raise ValueError("unterminated INSTANCE subtree (unbalanced parens)") + return out + + +def dedent(block, pad): + """Strip the leading indentation of the root line from every line, then re-pad.""" + root_indent = len(block[0]) - len(block[0].lstrip()) + out = [] + for line in block: + stripped = line[root_indent:] if line[:root_indent].isspace() else line.lstrip() + out.append(pad + stripped if stripped.strip() else stripped) + return out + + +def grab_header(lines): + """Collect header fields to carry over into the emitted SAIF.""" + hdr = {} + for line in lines: + s = line.strip() + if s.startswith('(INSTANCE'): + break + for key in HEADER_KEYS: + if s.startswith('(' + key): + hdr[key] = s + return hdr + + +def emit(out, header, top, subtrees): + w = out.write + w('(SAIFILE\n') + w(header.get('SAIFVERSION', '(SAIFVERSION "2.0")') + '\n') + w(header.get('DIRECTION', '(DIRECTION "backward")') + '\n') + if top: + w('(DESIGN "%s")\n' % top) + w('(VENDOR "Verilator")\n') + w('(PROGRAM_NAME "Verilator")\n') + w(header.get('DIVIDER', '(DIVIDER / )') + '\n') + w(header.get('TIMESCALE', '(TIMESCALE 1ps)') + '\n') + w(header.get('DURATION', '(DURATION 0)') + '\n') + if top: + w(' (INSTANCE %s\n' % top) + pad = ' ' + else: + pad = ' ' + for block in subtrees: + for line in dedent(block, pad): + w(line if line.endswith('\n') else line + '\n') + if top: + w(' )\n') + w(')\n') + + +def main(): + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('saif', help='master SAIF produced by Verilator --trace-saif') + ap.add_argument('--instance', help='slash-separated instance path to extract (suffix match)') + ap.add_argument('--top', help='wrapper top-module/instance name for the extracted subtree') + ap.add_argument('--all', action='store_true', help='extract every match, not just the first') + ap.add_argument('-o', '--output', help='output SAIF path (default: stdout)') + ap.add_argument('--list', action='store_true', help='print the instance hierarchy and exit') + ap.add_argument('--list-depth', type=int, default=6, help='max depth for --list (default 6)') + args = ap.parse_args() + + with open(args.saif) as f: + lines = f.readlines() + + if args.list: + for idx, path in iter_instances(lines): + if len(path) <= args.list_depth: + print(' ' * (len(path) - 1) + path[-1]) + return 0 + + if not args.instance: + ap.error('--instance is required unless --list is given') + + target = [p for p in args.instance.strip('/').split('/') if p] + n = len(target) + matches = [idx for idx, path in iter_instances(lines) if path[-n:] == target] + if not matches: + sys.stderr.write('ERROR: instance path not found: %s\n' % args.instance) + return 1 + if not args.all: + matches = matches[:1] + + subtrees = [capture_subtree(lines, idx) for idx in matches] + header = grab_header(lines) + + out = open(args.output, 'w') if args.output else sys.stdout + try: + emit(out, header, args.top, subtrees) + finally: + if args.output: + out.close() + + fedp = sum(1 for b in subtrees for ln in b if INSTANCE_RE.match(ln) and INSTANCE_RE.match(ln).group(2) == 'fedp') + sys.stderr.write('saif_filter: extracted %d instance(s) under "%s"; %d fedp leaf instance(s)\n' + % (len(subtrees), args.top or target[-1], fedp)) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/hw/syn/synopsys/Makefile b/hw/syn/synopsys/Makefile index f5cc84b343..b3337681f5 100644 --- a/hw/syn/synopsys/Makefile +++ b/hw/syn/synopsys/Makefile @@ -24,6 +24,13 @@ ASAP7_LIB_ROOT := $(ASAP7_ROOT)/asap7sc7p5t_28/LIB/NLDM ASAP7_SRAM_LIB := $(ASAP7_ROOT)/ASAP7_SRAM_0p0/generated/LIB/srambank_128x4x32_6t122.lib ASAP7_SRAM_DB := $(ASAP7_ROOT)/ASAP7_SRAM_0p0/generated/LIB/srambank_128x4x32_6t122.db ASAP7_NAME := "ASAP7 (7nm)" +# physical libs for layout/images (fc_shell): 4x-scaled LEF + .tf are 4000 DBU/um +ASAP7_TF := $(ASAP7_ROOT)/asap7_snps/icc/asap07_icc.tf +ASAP7_LEFS := $(wildcard $(ASAP7_ROOT)/asap7sc7p5t_28/LEF/scaled/asap7sc7p5t_28_*_4x_220121a.lef) +ASAP7_TLU := $(ASAP7_ROOT)/asap7_snps/starrc/asap07.tluplus +ASAP7_DBS := $(wildcard $(ASAP7_ROOT)/asap7sc7p5t_28/LIB/NLDM/*_TT_nldm.db) +ASAP7_PSRAM_DBS := $(wildcard $(ASAP7_ROOT)/ASAP7_SRAM_0p0/generated/LIB/*.db) +ASAP7_SCALE := 4000 SAED14_ROOT := /mnt/nas0/eda.libs/saed14/EDK_03_2025 SAED14_LIB_ROOT := $(SAED14_ROOT)/SAED14nm_EDK_STD_SLVT/liberty/nldm @@ -46,6 +53,15 @@ ifeq ($(LIB_NAME),) $(error Invalid LIB_TYPE=$(LIB_TYPE).) endif +# physical PDK selection for the layout/images flow (empty for libs not yet wired) +PDK_TF := $($(LIB_TYPE)_TF) +PDK_LEFS := $($(LIB_TYPE)_LEFS) +PDK_TLU := $($(LIB_TYPE)_TLU) +PDK_DBS := $($(LIB_TYPE)_DBS) +PDK_SRAM_DBS := $($(LIB_TYPE)_PSRAM_DBS) +PDK_SCALE := $(or $($(LIB_TYPE)_SCALE),10000) +IMG_UTIL ?= 0.60 + $(info [Make] Using library $(LIB_NAME)) DEFAULT_LIB := $(LIB_DIR)/NanGate_15nm_OCL.db @@ -176,7 +192,7 @@ CFLAGS += -DVX_CFG_XLEN=$(XLEN) -DVX_CFG_XLEN_$(XLEN) CFLAGS += $(CONFIGS) CFLAGS += $(RTL_INCLUDE) -.PHONY: clean gen-sources synthesis-nosram synthesis-estsram synthesis +.PHONY: clean gen-sources synthesis-nosram synthesis-estsram synthesis images power-grid gds-render all: synthesis @@ -234,5 +250,50 @@ synthesis-estsram: $(BUILD_DIR)/sources.txt WALL_IGNORE=$(WALL_IGNORE) \ dc_shell -f $(SRC_DIR)/project.tcl +# Layout images (placement, hierarchy, cell-density + power-density heat maps). +# Requires a completed synthesis (out/$(TOP).mapped.v). Uses fc_shell under Xvfb since +# icc2_shell is not installed. SAIF_FILE is optional -> power falls back to vectorless. +images: + cd $(BUILD_DIR); \ + TOP=$(TOP_LEVEL_ENTITY) \ + NETLIST=out/$(TOP_LEVEL_ENTITY).mapped.v \ + SDC=out/$(TOP_LEVEL_ENTITY).post_compile.sdc \ + OUT_DIR=images \ + SAIF_FILE="$(SAIF_FILE)" \ + SAIF_INST="$(SAIF_INST)" \ + PDK_TF="$(PDK_TF)" \ + PDK_LEFS="$(PDK_LEFS)" \ + PDK_TLU="$(PDK_TLU)" \ + PDK_DBS="$(PDK_DBS)" \ + PDK_SRAM_DBS="$(PDK_SRAM_DBS)" \ + PDK_SCALE="$(PDK_SCALE)" \ + UTIL="$(IMG_UTIL)" \ + GDS="$(GDS)" \ + GDS_PLACE_OPT="$(GDS_PLACE_OPT)" \ + xvfb-run -a fc_shell -f $(SRC_DIR)/run_icc2.tcl > images.log 2>&1 + +# Spatial grid power-density map (paper/EDA-tool style): dump per-cell power from the +# placed block (images/.dlib, so run 'make images' first), then grid + render. +# SAIF_FILE drives activity (else vectorless). Tune grid with POWER_GRID_N. +POWER_GRID_N ?= 64 +power-grid: + cd $(BUILD_DIR); \ + TOP=$(TOP_LEVEL_ENTITY) \ + OUT_DIR=images \ + SAIF_FILE="$(SAIF_FILE)" \ + SAIF_INST="$(SAIF_INST)" \ + PDK_DBS="$(PDK_DBS)" \ + CSV=images/cellpower.csv \ + xvfb-run -a fc_shell -f $(SRC_DIR)/dump_power.tcl > power_grid.log 2>&1 + cd $(BUILD_DIR); \ + python3 $(SRC_DIR)/powergrid.py images/cellpower.csv images/power_grid.png $(POWER_GRID_N) "$(TOP_LEVEL_ENTITY)" >> power_grid.log 2>&1 + +# Render a streamed GDS to layout images (full-chip overview + zoom) and a complete, +# self-contained GDS (ASAP7 cell polygons merged in). Needs the GDS from 'make images GDS=1'. +GDS_ZOOM_UM ?= 30 +gds-render: + cd $(BUILD_DIR); \ + python3 $(SRC_DIR)/gds_render.py images/$(TOP_LEVEL_ENTITY).gds gdsimg $(GDS_ZOOM_UM) > gds_render.log 2>&1 + clean: $(RMDIR) $(BUILD_DIR) diff --git a/hw/syn/synopsys/dump_power.tcl b/hw/syn/synopsys/dump_power.tcl new file mode 100644 index 0000000000..0984ed391c --- /dev/null +++ b/hw/syn/synopsys/dump_power.tcl @@ -0,0 +1,36 @@ +## dump_power.tcl -- reopen a placed block, annotate power (SAIF or vectorless), +## and dump per-cell (x, y, total_power, area) to CSV for a spatial power-density grid. +## Pairs with powergrid.py. Run: xvfb-run -a fc_shell -f dump_power.tcl +## env: OUT_DIR (dir with .dlib), SAIF_FILE, SAIF_INST, CSV, PDK_DBS +proc ev {n d} { if {[info exists ::env($n)] && [string trim $::env($n)] ne ""} { return $::env($n) }; return $d } +set OUT_DIR [ev OUT_DIR images] +set TOP [ev TOP ""] +set SAIF [ev SAIF_FILE ""] +set SI [ev SAIF_INST ""] +set CSV [ev CSV $OUT_DIR/cellpower.csv] +set DBS [ev PDK_DBS ""] +if {$DBS eq ""} { set DBS [glob -nocomplain /mnt/nas0/eda.libs/asap7/asap7sc7p5t_28/LIB/NLDM/*_TT_nldm.db] } +set_app_var link_library [concat "*" $DBS] +if {$TOP ne "" && [file isdirectory $OUT_DIR/${TOP}.dlib]} { + open_lib $OUT_DIR/${TOP}.dlib +} else { + open_lib [lindex [glob $OUT_DIR/*.dlib] 0] +} +open_block [lindex [get_object_name [get_blocks]] 0] +link_block +if {$SAIF ne "" && [file exists $SAIF]} { if {[catch {read_saif $SAIF -strip_path $SI} e]} { puts "SAIFERR $e" } } +catch { report_power > $OUT_DIR/power.rpt } +set bb "" +foreach a {boundary_bbox bbox} { if {$bb eq ""} { catch { set bb [get_attribute [current_block] $a] } } } +set fh [open $CSV w] +puts $fh "#bbox [lindex [lindex $bb 0] 0] [lindex [lindex $bb 0] 1] [lindex [lindex $bb 1] 0] [lindex [lindex $bb 1] 1]" +puts $fh "x,y,power,area" +set cnt 0 +foreach_in_collection c [get_cells -hierarchical -filter "is_hierarchical==false"] { + set o [get_attribute $c origin]; set p [get_attribute $c total_power]; set ar [get_attribute $c area] + if {$o eq "" || $p eq ""} continue + puts $fh "[lindex $o 0],[lindex $o 1],$p,$ar"; incr cnt +} +close $fh +puts "DUMP DONE: $cnt cells -> $CSV" +exit diff --git a/hw/syn/synopsys/gds_render.py b/hw/syn/synopsys/gds_render.py new file mode 100644 index 0000000000..5b562d7ff5 --- /dev/null +++ b/hw/syn/synopsys/gds_render.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +"""gds_render.py - complete + render an FC placement GDS. + +Fusion Compiler streams the design GDS with std-cell PLACEMENTS only (SREFs by +name); the ASAP7 std-cell polygons are not embedded (LEF-only physical lib). This +script merges the ASAP7 cell GDS (drawn at the same 4x scale as the 4x LEF used +for placement, so coordinates align) to produce a complete, self-contained GDS, +then rasterizes a full-chip overview and a zoomed crop with pycairo. + +Usage: gds_render.py [zoom_um] +Outputs in : .complete.gds, gds_full.png, gds_zoom.png +""" +import sys, os, math +import gdstk, cairo + +DESIGN = sys.argv[1] +OUT = sys.argv[2] +ZOOM_UM = float(sys.argv[3]) if len(sys.argv) > 3 else 30.0 # window size (4x um) + +GDSDIR = os.environ.get("ASAP7_GDS_DIR", "/mnt/nas0/eda.libs/asap7/asap7sc7p5t_28/GDS") +CELLGDS = ["asap7sc7p5t_28_R_220121a.gds", "asap7sc7p5t_28_L_220121a.gds", + "asap7sc7p5t_28_SL_220121a.gds", "asap7sc7p5t_28_SRAM_220121a.gds"] +# ASAP7 ships GDS at 1x but the 4x-scaled LEF was used for placement, so cells +# must be magnified to match the 4x placement coordinates. +MAG = float(os.environ.get("GDS_CELL_MAG", "4")) + +os.makedirs(OUT, exist_ok=True) + +# ---- load std-cell defs ---- +cellmap = {} +for f in CELLGDS: + p = os.path.join(GDSDIR, f) + if not os.path.exists(p): + continue + for c in gdstk.read_gds(p).cells: + cellmap.setdefault(c.name, c) +print("[gds_render] ASAP7 cell defs: %d" % len(cellmap)) + +# ---- load design ---- +d = gdstk.read_gds(DESIGN) +top = d.top_level()[0] +refs = top.references +print("[gds_render] top %s: %d placements" % (top.name, len(refs))) + +# ---- merge: add referenced cell defs -> complete GDS ---- +used = {} +for r in refs: + n = r.cell if isinstance(r.cell, str) else r.cell.name + used[n] = used.get(n, 0) + 1 +resolved = [n for n in used if n in cellmap] +missing = [n for n in used if n not in cellmap] +print("[gds_render] distinct cells %d resolved %d missing %d" % (len(used), len(resolved), len(missing))) +if missing: + print("[gds_render] missing:", sorted(missing)[:20]) +# scale each cell's geometry by MAG and add under the same name, so the design's +# SREFs (magnification 1) resolve to correctly-sized cells -> consistent GDS. +for n in resolved: + src = cellmap[n] + nc = gdstk.Cell(n) + nc.add(*[p.scale(MAG, center=(0, 0)) for p in src.get_polygons(depth=None)]) + d.add(nc) +complete = os.path.join(OUT, top.name + ".complete.gds") +d.write_gds(complete) +print("[gds_render] wrote %s (%d bytes)" % (complete, os.path.getsize(complete))) + +# ---- layer color map (by GDS layer number) ---- +LAYER_RGB = {} +_palette = [(0.20,0.45,0.95),(0.95,0.30,0.25),(0.20,0.75,0.35),(0.95,0.80,0.20), + (0.30,0.80,0.85),(0.85,0.35,0.85),(0.95,0.55,0.20),(0.70,0.70,0.75), + (0.55,0.85,0.45),(0.45,0.55,0.90),(0.90,0.50,0.55),(0.60,0.40,0.85)] +def layer_color(layer): + if layer not in LAYER_RGB: + LAYER_RGB[layer] = _palette[len(LAYER_RGB) % len(_palette)] + return LAYER_RGB[layer] + +def vt_color(name): + if name.endswith("_SRAM"): return (0.95,0.55,0.20) + if name.endswith("_SL"): return (0.95,0.30,0.25) + if name.endswith("_L"): return (0.20,0.75,0.35) + return (0.20,0.45,0.95) # _R + +(x0,y0),(x1,y1) = top.bounding_box() +W = x1-x0; H = y1-y0 + +# ---- full-chip overview: cell footprints colored by Vt flavor ---- +def render_full(png, px=2000): + sc = px/max(W,H) + wpx, hpx = int(W*sc)+2, int(H*sc)+2 + surf = cairo.ImageSurface(cairo.FORMAT_ARGB32, wpx, hpx) + cr = cairo.Context(surf) + cr.set_source_rgb(0.06,0.06,0.08); cr.paint() + # cache cell footprint sizes + size = {} + for n in resolved: + (a,b),(c,e) = cellmap[n].bounding_box() + size[n] = ((c-a)*MAG, (e-b)*MAG) + for r in refs: + n = r.cell if isinstance(r.cell,str) else r.cell.name + if n not in size: continue + w,h = size[n] + rot = (r.rotation or 0.0) + if abs(math.sin(rot)) > 0.5: w,h = h,w # 90/270 swap + ox,oy = r.origin + X = (ox-x0)*sc; Y = hpx-(oy-y0)*sc - h*sc # flip Y + cr.set_source_rgb(*vt_color(n)) + cr.rectangle(X, Y, max(w*sc,0.4), max(h*sc,0.4)); cr.fill() + surf.write_to_png(png) + print("[gds_render] wrote %s (%dx%d)" % (png, wpx, hpx)) + +# ---- zoom: real polygons in a central window, colored by layer ---- +def render_zoom(png, win_um, px=1600): + cx,cy = (x0+x1)/2.0, (y0+y1)/2.0 + wx0,wy0,wx1,wy1 = cx-win_um/2, cy-win_um/2, cx+win_um/2, cy+win_um/2 + sub = gdstk.Cell("__zoom__") + cnt = 0 + for r in refs: + ox,oy = r.origin + if wx0-5 <= ox <= wx1+5 and wy0-5 <= oy <= wy1+5: + n = r.cell if isinstance(r.cell,str) else r.cell.name + if n in cellmap: + sub.add(gdstk.Reference(cellmap[n], origin=r.origin, + rotation=r.rotation or 0, x_reflection=bool(r.x_reflection), + magnification=MAG)) + cnt += 1 + sub.flatten() + polys = sub.get_polygons(depth=None) + # top-level routing (paths/wires live in the top cell, already in 4x space) + route = [] + try: + for p in top.get_polygons(depth=0, include_paths=True): + (px0,py0),(px1,py1) = p.bounding_box() + if px1 >= wx0 and px0 <= wx1 and py1 >= wy0 and py0 <= wy1: + route.append(p) + except Exception as e: + print("[gds_render] top-level routing: %s" % e) + polys = polys + route + print("[gds_render] zoom window %.1fum: %d cells, %d cell-polys + %d routing-polys" % (win_um, cnt, len(polys)-len(route), len(route))) + sc = px/win_um + surf = cairo.ImageSurface(cairo.FORMAT_ARGB32, px, px) + cr = cairo.Context(surf) + cr.set_source_rgb(0.04,0.04,0.05); cr.paint() + # draw lower layers first (sort by layer) + polys.sort(key=lambda p: p.layer) + for p in polys: + pts = p.points + if len(pts) < 3: continue + cr.set_source_rgba(*layer_color(p.layer), 0.78) + cr.move_to((pts[0][0]-wx0)*sc, px-(pts[0][1]-wy0)*sc) + for (ux,uy) in pts[1:]: + cr.line_to((ux-wx0)*sc, px-(uy-wy0)*sc) + cr.close_path(); cr.fill() + surf.write_to_png(png) + print("[gds_render] wrote %s (%dx%d)" % (png, px, px)) + +render_full(os.path.join(OUT, "gds_full.png")) +render_zoom(os.path.join(OUT, "gds_zoom.png"), ZOOM_UM) +print("[gds_render] DONE") diff --git a/hw/syn/synopsys/powergrid.py b/hw/syn/synopsys/powergrid.py new file mode 100644 index 0000000000..63f8f7edab --- /dev/null +++ b/hw/syn/synopsys/powergrid.py @@ -0,0 +1,24 @@ +import sys, numpy as np, matplotlib; matplotlib.use('Agg'); import matplotlib.pyplot as plt +csv, out = sys.argv[1], sys.argv[2] +N = int(sys.argv[3]) if len(sys.argv) > 3 else 64 +title = sys.argv[4] if len(sys.argv) > 4 else "" +bb=None; xs=[]; ys=[]; ps=[] +for ln in open(csv): + if ln.startswith("#bbox"): bb=[float(v) for v in ln.split()[1:5]]; continue + if ln.startswith("x,") or not ln.strip(): continue + x,y,p,a = ln.split(","); xs.append(float(x)); ys.append(float(y)); ps.append(float(p)) +xs=np.array(xs); ys=np.array(ys); ps=np.array(ps) +x0,y0,x1,y1 = bb if bb else [xs.min(),ys.min(),xs.max(),ys.max()] +grid=np.zeros((N,N)) +ix=np.clip(((xs-x0)/(x1-x0)*N).astype(int),0,N-1) +iy=np.clip(((ys-y0)/(y1-y0)*N).astype(int),0,N-1) +np.add.at(grid,(iy,ix),ps) +tile_area=((x1-x0)/N)*((y1-y0)/N) +dens=grid/tile_area # power per unit area +fig,ax=plt.subplots(figsize=(7,6)) +vmax=np.percentile(dens[dens>0],99) if (dens>0).any() else dens.max() +im=ax.imshow(dens,vmin=0,vmax=vmax,origin='lower',extent=[x0,x1,y0,y1],cmap='jet',interpolation='bilinear',aspect='equal') +cb=fig.colorbar(im,ax=ax,fraction=0.046,pad=0.04); cb.set_label('power density (pW/um^2, 4x); clipped @ 99th pct') +ax.set_xlabel('x (um, 4x)'); ax.set_ylabel('y (um, 4x)') +ax.set_title(title+f" ({N}x{N} grid, sum P per tile)") +fig.tight_layout(); fig.savefig(out,dpi=130); print("wrote",out,"grid sum=%.3g max_tile=%.3g"%(grid.sum(),dens.max())) diff --git a/hw/syn/synopsys/project.tcl b/hw/syn/synopsys/project.tcl index ae3d8cb930..01cf154f84 100755 --- a/hw/syn/synopsys/project.tcl +++ b/hw/syn/synopsys/project.tcl @@ -896,6 +896,7 @@ if {[sizeof_collection $unmapped_cells] > 0} { # ---------------- reports ---------------- report_qor > [file join $RPT_DIR "qor.rpt"] report_area -hier -nosplit > [file join $RPT_DIR "area.rpt"] +report_timing -delay_type max -path_type summary -max_paths 50 > [file join $RPT_DIR "timing_summary.rpt"] report_timing -delay_type max -path_type full_clock -max_paths 50 -nets -transition_time -capacitance > [file join $RPT_DIR "timing_max.rpt"] report_timing -delay_type min -path_type full_clock -max_paths 50 -nets -transition_time -capacitance > [file join $RPT_DIR "timing_min.rpt"] report_clock -skew > [file join $RPT_DIR "clock_skew.rpt"] diff --git a/hw/syn/synopsys/run_icc2.tcl b/hw/syn/synopsys/run_icc2.tcl new file mode 100644 index 0000000000..fc2f4f6571 --- /dev/null +++ b/hw/syn/synopsys/run_icc2.tcl @@ -0,0 +1,287 @@ +## run_icc2.tcl +## Fusion Compiler placement + layout images for a synthesized netlist. +## Produces: placement, hierarchy-colored, cell-density heat map, and power-density +## heat map PNGs. Power uses SAIF activity if provided, else vectorless. +## +## NOTE: icc2_shell is not installed on this host; fc_shell shares the same ICC2 P&R +## engine / NDM database / physical libraries. Run headless via Xvfb: +## xvfb-run -a fc_shell -f run_icc2.tcl +## +## Driven entirely by environment variables (see the Makefile 'images' target): +## TOP top module name (required) +## NETLIST gate-level netlist (default out/$TOP.mapped.v) +## SDC constraints (default out/$TOP.post_compile.sdc) +## OUT_DIR output directory for the .dlib + PNGs (default images) +## SAIF_FILE switching activity (optional -> vectorless) +## SAIF_INST instance scope of SAIF in the netlist (optional) +## PDK_TF technology file (.tf) (required) +## PDK_LEFS cell/tech LEF files (space separated) (required) +## PDK_DBS timing/power .db libs (space separated) (required for power) +## PDK_SRAM_DBS SRAM .db libs (space separated) (optional) +## PDK_TLU TLUPlus parasitic file (optional) +## PDK_SCALE internal length precision / scale factor (default 10000) +## UTIL core utilization for the floorplan (default 0.60) + +proc ev {name def} { + if {[info exists ::env($name)] && [string trim $::env($name)] ne ""} { return $::env($name) } + return $def +} +proc note {m} { puts "\[run_icc2\] $m" } + +set TOP [ev TOP ""] +if {$TOP eq ""} { error "run_icc2: TOP not set" } +set NETLIST [ev NETLIST "out/${TOP}.mapped.v"] +set SDC [ev SDC "out/${TOP}.post_compile.sdc"] +set OUT_DIR [ev OUT_DIR "images"] +set SAIF_FILE [ev SAIF_FILE ""] +set SAIF_INST [ev SAIF_INST ""] +set TF [ev PDK_TF ""] +set LEFS [ev PDK_LEFS ""] +set DBS [ev PDK_DBS ""] +set SRAM_DBS [ev PDK_SRAM_DBS ""] +set TLU [ev PDK_TLU ""] +set SCALE [ev PDK_SCALE 10000] +set UTIL [ev UTIL 0.60] +set GDS_MODE [ev GDS 0] +## GDS_PLACE_OPT=1 (default) uses timing-driven place_opt+CTS; =0 uses create_placement only +## (a safe fallback — place_opt can segfault optimizing large DesignWare datapath cells). +set GDS_PLACE_OPT [ev GDS_PLACE_OPT 1] +set route_done 0 + +if {![file exists $NETLIST]} { error "run_icc2: netlist not found: $NETLIST (run synthesis first)" } +if {$TF eq "" || $LEFS eq ""} { error "run_icc2: PDK_TF / PDK_LEFS not set (unsupported LIB_TYPE for layout?)" } +file mkdir $OUT_DIR +set_host_options -max_cores 8 + +## ---------------- 1. Library (timing + physical) ---------------- +## The .db are 1x native and LEF/.tf may be Nx scaled; forcing -scale_factor avoids the +## LIB-055 precision clash. NDM cell libs are cached under ./CLIBs after the first run. +if {$DBS ne ""} { set_app_var link_library [concat "*" $DBS $SRAM_DBS] } +set DLIB $OUT_DIR/${TOP}.dlib +file delete -force $DLIB +catch { set_app_options -name lib.setting.use_tech_scale_factor -value true } +create_lib -technology $TF -ref_libs $LEFS -scale_factor $SCALE $DLIB + +## ---------------- 2. Netlist ---------------- +read_verilog -top $TOP $NETLIST +link_block + +## ---------------- 3. Parasitics + constraints ---------------- +if {$TLU ne "" && [file exists $TLU]} { + set tlu_use $TLU + ## ASAP7 ships asap07.tluplus gzip-compressed; decompress to OUT_DIR if needed. + if {![catch {exec file -L $TLU} ft] && [string match -nocase "*gzip*" $ft]} { + set tlu_use $OUT_DIR/parasitic.tluplus + if {[catch {exec sh -c "gunzip -c [list $TLU] > [list $tlu_use]"} ge]} { note "WARN gunzip tlu: $ge"; set tlu_use $TLU } + } + if {[catch {read_parasitic_tech -tlup $tlu_use -name nominal} e]} { note "WARN parasitic: $e" } \ + else { catch { set_parasitic_parameters -late_spec nominal -early_spec nominal } } +} +if {[file exists $SDC]} { if {[catch {read_sdc $SDC} e]} { note "WARN sdc: $e" } } + +## ---------------- 4. Floorplan + placement (+ optional route for GDS) ---------------- +initialize_floorplan -core_utilization $UTIL -core_offset 5 +if {[catch {connect_pg_net -automatic} e]} { note "WARN pg: $e" } +if {[catch {place_pins -self} e]} { note "WARN pins: $e" } + +if {$GDS_MODE == 1 && $DBS ne ""} { + ## ---- full place-and-route to produce a routed GDS ---- + if {$GDS_PLACE_OPT == 1} { + note "GDS mode: place_opt -> CTS -> route" + if {[catch {place_opt} e]} { note "WARN place_opt: $e"; catch {create_placement}; catch {legalize_placement} } + if {[catch {clock_opt} e]} { note "WARN clock_opt: $e" } + } else { + note "GDS mode (safe): create_placement -> route (skip place_opt/CTS)" + if {[catch {create_placement} e]} { note "WARN place: $e" } + if {[catch {legalize_placement} e]} { note "WARN legalize: $e" } + } + if {[catch {route_auto} e]} { note "WARN route_auto: $e" } else { set route_done 1; note "routing complete" } +} else { + if {[catch {create_placement} e]} { note "WARN place: $e" } + if {[catch {legalize_placement} e]} { note "WARN legalize: $e" } +} +catch { redirect $OUT_DIR/placement.rpt {report_placement} } +catch { save_block } + +## ---------------- 5. Power (SAIF if given, else vectorless) ---------------- +set have_power 0 +if {$DBS ne ""} { + if {$SAIF_FILE ne "" && [file exists $SAIF_FILE]} { + note "power: SAIF activity from $SAIF_FILE" + if {$SAIF_INST ne ""} { + catch { read_saif $SAIF_FILE -strip_path $SAIF_INST } + } else { + catch { read_saif $SAIF_FILE } + } + } else { + note "power: no SAIF -> vectorless estimation" + } + if {[catch {redirect $OUT_DIR/power.rpt {report_power}} e]} { note "WARN power: $e" } else { set have_power 1 } +} else { + note "power: PDK_DBS not set -> skipping power map" +} + +## ---------------- 5b. GDS ---------------- +if {$GDS_MODE == 1} { + if {[catch {write_gds $OUT_DIR/${TOP}.gds} e]} { + note "WARN write_gds: $e" + } else { + note "wrote GDS ([expr {$route_done ? {routed} : {placed-only}}]): $OUT_DIR/${TOP}.gds" + } +} + +## ---------------- 6. Layout images ---------------- +## FC's native GUI map modes (powerDensityMap/cellDensityMap/Hierarchy) do not compute +## in a headless/batch session (no interactive event loop). We instead render each figure +## by coloring leaf cells per bin with gui_change_highlight, which works in batch. +gui_start +set WIN [lindex [gui_get_window_ids] 0] +if {$WIN eq ""} { set WIN BlockWindow.1 } +set VIEW Layout.1 +gui_set_active_window -window $WIN +catch { gui_zoom -window $WIN -full } + +set HEAT {blue cyan green yellow orange red} ;# low -> high +set PALETTE {red green blue yellow cyan magenta orange white salmon} +set LEAF [get_cells -hierarchical -filter "is_hierarchical==false"] +note "leaf cells: [sizeof_collection $LEAF]" + +proc snap {name} { + global OUT_DIR WIN VIEW + catch { gui_zoom -window $WIN -full } + if {[catch {gui_write_window_image -window $VIEW -format png -file $OUT_DIR/${name}.png}]} { + catch { gui_write_window_image -window $WIN -format png -file $OUT_DIR/${name}.png } + } +} +proc clear_hl {} { global LEAF; catch { gui_change_highlight -remove -all_colors -collection $LEAF } } + +## plain placement (no coloring) +clear_hl +snap placement + +## A/C-style attribute heat map: QUANTILE bins (≈equal cells per color) by $attr. +## Quantile (rank-based) edges instead of linear, so a few high-power outliers don't +## collapse 99% of cells into the lowest bin — the spatial distribution stays legible. +proc heat_attr {attr name} { + global LEAF HEAT OUT_DIR + set s [sort_collection $LEAF $attr] + set names [get_object_name $s] + set n [llength $names] + if {$n == 0} { note "$name: no leaf cells (skip)"; return 0 } + set vmax [get_attribute [index_collection $s [expr {$n-1}]] $attr] + if {$vmax eq "" || $vmax <= 0} { note "$name: no positive '$attr' (skip)"; return 0 } + set nb [llength $HEAT] + set lf [open $OUT_DIR/${name}_legend.txt w] + puts $lf "# ${name}.png - '$attr' per leaf cell, QUANTILE bins (~equal cells/color), low(blue) -> high(red)" + puts $lf "# color ${attr}_range cells" + clear_hl + for {set i 0} {$i < $nb} {incr i} { + set lo [expr {$n*$i/$nb}]; set hi [expr {$n*($i+1)/$nb}] + if {$hi <= $lo} continue + set sub [get_cells [lrange $names $lo [expr {$hi-1}]]] + set loval [get_attribute [index_collection $s $lo] $attr] + set hival [get_attribute [index_collection $s [expr {$hi-1}]] $attr] + set col [lindex $HEAT $i] + note [format " %s %-6s \[%.4g, %.4g]: %d cells" $name $col $loval $hival [expr {$hi-$lo}]] + puts $lf [format "%-8s \[%.4g, %.4g\] %d" $col $loval $hival [expr {$hi-$lo}]] + catch { gui_change_highlight -add -color $col -collection $sub } + } + close $lf + snap $name + clear_hl + return 1 +} + +## B: color leaf cells by their hierarchical block (first 2 path levels; flat glue grouped) +proc color_hier {name} { + global LEAF PALETTE OUT_DIR + array unset grp + foreach_in_collection c $LEAF { + set f [get_object_name $c] + set parts [split $f "/"] + if {[llength $parts] <= 1} { set g "_top_glue_" } \ + elseif {[llength $parts] == 2} { set g [lindex $parts 0] } \ + else { set g "[lindex $parts 0]/[lindex $parts 1]" } + lappend grp($g) $f + } + set lf [open $OUT_DIR/${name}_legend.txt w] + puts $lf "# ${name}.png - color -> hierarchical block (leaf cells)." + puts $lf "# Colors assigned by ALPHABETICAL block name from palette (cyclic; positional, not semantic):" + puts $lf "# $PALETTE" + puts $lf "# color block cells" + clear_hl + set i 0 + foreach g [lsort [array names grp]] { + set col [lindex $PALETTE [expr {$i % [llength $PALETTE]}]] + note " hier $col <- $g ([llength $grp($g)] cells)" + puts $lf [format "%-8s %-36s %d" $col $g [llength $grp($g)]] + catch { gui_change_highlight -add -color $col -collection [get_cells $grp($g)] } + incr i + } + close $lf + snap $name + clear_hl +} + +## C: placement cell-density heat map. Grid the core by cell origin, color cells by the +## area-occupancy of their grid bin. +proc density_map {name {N 40}} { + global LEAF HEAT OUT_DIR + set bb "" + foreach a {boundary_bbox bbox} { if {$bb eq ""} { catch { set bb [get_attribute [current_block] $a] } } } + if {$bb eq ""} { note "$name: no block bbox (skip)"; return 0 } + set x0 [lindex [lindex $bb 0] 0]; set y0 [lindex [lindex $bb 0] 1] + set x1 [lindex [lindex $bb 1] 0]; set y1 [lindex [lindex $bb 1] 1] + set dx [expr {($x1-$x0)/double($N)}]; set dy [expr {($y1-$y0)/double($N)}] + if {$dx <= 0 || $dy <= 0} { note "$name: bad bbox (skip)"; return 0 } + array unset barea; array unset bcells + foreach_in_collection c $LEAF { + set o [get_attribute $c origin]; set a [get_attribute $c area] + if {$o eq "" || $a eq ""} continue + set bx [expr {int(([lindex $o 0]-$x0)/$dx)}]; set by [expr {int(([lindex $o 1]-$y0)/$dy)}] + if {$bx < 0} {set bx 0}; if {$bx >= $N} {set bx [expr {$N-1}]} + if {$by < 0} {set by 0}; if {$by >= $N} {set by [expr {$N-1}]} + set k "$bx.$by" + set barea($k) [expr {([info exists barea($k)] ? $barea($k) : 0)+$a}] + lappend bcells($k) [get_object_name $c] + } + set binA [expr {$dx*$dy}]; set dmax 0 + foreach k [array names barea] { set d [expr {$barea($k)/$binA}]; if {$d > $dmax} {set dmax $d} } + if {$dmax <= 0} { note "$name: zero density (skip)"; return 0 } + set n [llength $HEAT]; array unset lvl + foreach k [array names barea] { + set li [expr {int(($barea($k)/$binA)/$dmax*$n)}]; if {$li >= $n} {set li [expr {$n-1}]} + foreach cn $bcells($k) { lappend lvl($li) $cn } + } + set lf [open $OUT_DIR/${name}_legend.txt w] + puts $lf "# ${name}.png - cell-area occupancy per ${N}x${N} grid bin (fraction of bin area)" + puts $lf "# gradient low(blue) -> high(red); dmax = [format %.3f $dmax]" + puts $lf "# color occupancy_range" + clear_hl + for {set i 0} {$i < $n} {incr i} { + puts $lf [format "%-8s \[%.3f, %.3f)" [lindex $HEAT $i] [expr {$dmax*$i/$n}] [expr {$dmax*($i+1)/$n}]] + if {[info exists lvl($i)]} { catch { gui_change_highlight -add -color [lindex $HEAT $i] -collection [get_cells $lvl($i)] } } + } + close $lf + note " density max = [format %.3f $dmax] (grid ${N}x${N})" + snap $name + clear_hl + return 1 +} + +## B: hierarchy-colored floorplan +color_hier hierarchy + +## C: cell-density heat map +density_map cell_density 40 + +## A: power heat map (vectorless or SAIF); fall back to leakage if no dynamic +if {$have_power} { + if {![heat_attr total_power power_density]} { heat_attr leakage_power power_density } +} else { + note "skipping power_density image (no power data)" +} + +gui_stop +note "DONE. Images in $OUT_DIR/" +exit diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index 37653446cd..b3cda2c0f0 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -158,9 +158,15 @@ ifneq (,$(filter -DVX_CFG_EXT_OM_ENABLE, $(XCONFIGS))) RTL_INCLUDE += -I$(RTL_DIR)/om endif +# Kernel clock target frequency (MHz). The U55C platform default is 300; +# lowered to 250 for timing-closure margin. Override on the command line +# (e.g. KERNEL_FREQ=300) to retarget. +KERNEL_FREQ ?= 250 + # Kernel compiler global settings VPP_FLAGS += --link --target $(TARGET) --platform $(PLATFORM) --save-temps --no_ip_cache VPP_FLAGS += --vivado.synth.jobs $(JOBS) --vivado.impl.jobs $(JOBS) +VPP_FLAGS += --kernel_frequency $(KERNEL_FREQ) # register compilation hooks VPP_FLAGS += --xp "vivado_prop:run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${SRC_DIR}/pre_opt_hook.tcl" diff --git a/hw/unittest/cache/VX_cache_top.sv b/hw/unittest/cache/VX_cache_top.sv index e4854187d2..dc51ae59c7 100644 --- a/hw/unittest/cache/VX_cache_top.sv +++ b/hw/unittest/cache/VX_cache_top.sv @@ -57,6 +57,9 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Core response output buffer parameter CORE_OUT_BUF = 3, + // Enable AMO support (tracks the A extension by default) + parameter AMO_ENABLE = `VX_CFG_EXT_A_ENABLED, + // Memory request output buffer parameter MEM_OUT_BUF = 3, @@ -167,6 +170,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), .DIRTY_BYTES (DIRTY_BYTES), + .AMO_ENABLE (AMO_ENABLE), .CORE_OUT_BUF (CORE_OUT_BUF), .MEM_OUT_BUF (MEM_OUT_BUF) ) cache ( diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp index bf825e30dc..0996a2734c 100644 --- a/sim/opaesim/opae_sim.cpp +++ b/sim/opaesim/opae_sim.cpp @@ -38,6 +38,8 @@ #include #include +#include +#include #include #include #include @@ -174,8 +176,19 @@ class opae_sim::Impl { // launch execution thread future_ = std::async(std::launch::async, [&]{ while (!stop_) { - std::lock_guard guard(mutex_); - this->tick(); + // Give host-side MMIO/mem calls absolute priority: while any host op + // is pending, fully back off (don't even contend for mutex_). Without + // this the free-running ticker re-acquires the lock in a tight loop and + // starves the host thread, hanging the run (it stalled at device init). + if (host_waiters_.load(std::memory_order_acquire) != 0) { + std::this_thread::yield(); + continue; + } + { + std::lock_guard guard(mutex_); + this->tick(); + } + std::this_thread::yield(); } }); @@ -221,7 +234,7 @@ class opae_sim::Impl { } void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) { - std::lock_guard guard(mutex_); + HostLock guard(*this); // simulate CPU-GPU latency for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) { @@ -248,7 +261,7 @@ class opae_sim::Impl { } void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) { - std::lock_guard guard(mutex_); + HostLock guard(*this); // simulate CPU-GPU latency for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) { @@ -266,8 +279,8 @@ class opae_sim::Impl { } void copy(uint64_t dest, uint64_t src, uint64_t size) { - - std::lock_guard guard(mutex_); + + HostLock guard(*this); ram_->copy(dest, src, size); } @@ -546,6 +559,24 @@ class opae_sim::Impl { std::list cci_writes_; std::mutex mutex_; + // Count of host threads waiting on / holding mutex_. The free-running sim + // ticker backs off whenever this is non-zero so host MMIO/mem ops are never + // starved by the background ticker. + std::atomic host_waiters_{0}; + + // RAII guard for every host-side entry point: registers intent (so the + // ticker yields) before acquiring mutex_, and clears it after releasing. + struct HostLock { + opae_sim::Impl& impl_; + explicit HostLock(opae_sim::Impl& impl) : impl_(impl) { + impl_.host_waiters_.fetch_add(1, std::memory_order_acquire); + impl_.mutex_.lock(); + } + ~HostLock() { + impl_.mutex_.unlock(); + impl_.host_waiters_.fetch_sub(1, std::memory_order_release); + } + }; std::queue dram_queue_; diff --git a/sim/simx/csr_unit.cpp b/sim/simx/csr_unit.cpp index f49813c844..b39f679842 100644 --- a/sim/simx/csr_unit.cpp +++ b/sim/simx/csr_unit.cpp @@ -173,8 +173,6 @@ Word CsrUnit::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) { #ifdef VX_CFG_EXT_TCU_ENABLE CSR_READ_64(VX_CSR_MPM_INSTR_TCU, core_perf.tcu_instrs); #endif - CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads); - CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes); CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches); CSR_READ_64(VX_CSR_MPM_IFETCH_LT, core_perf.ifetch_latency); CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads); @@ -182,52 +180,64 @@ Word CsrUnit::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) { CSR_READ_64(VX_CSR_MPM_LOAD_LT, core_perf.load_latency); } } break; - case VX_DCR_MPM_CLASS_MEM: { - auto cluster_perf = core_->socket()->cluster()->perf_stats(); + case VX_DCR_MPM_CLASS_ICACHE: { auto socket_perf = core_->socket()->perf_stats(); - auto lmem_perf = core_->local_mem()->perf_stats(); - - uint64_t coalescer_misses = 0; - for (uint i = 0; i < VX_CFG_NUM_LSU_BLOCKS; ++i) { - coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses; - } - switch (addr) { CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads); CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses); CSR_READ_64(VX_CSR_MPM_ICACHE_MSHR_ST, socket_perf.icache.mshr_stalls); - + } + } break; + case VX_DCR_MPM_CLASS_DCACHE: { + auto socket_perf = core_->socket()->perf_stats(); + switch (addr) { CSR_READ_64(VX_CSR_MPM_DCACHE_READS, socket_perf.dcache.reads); CSR_READ_64(VX_CSR_MPM_DCACHE_WRITES, socket_perf.dcache.writes); CSR_READ_64(VX_CSR_MPM_DCACHE_MISS_R, socket_perf.dcache.read_misses); CSR_READ_64(VX_CSR_MPM_DCACHE_MISS_W, socket_perf.dcache.write_misses); + CSR_READ_64(VX_CSR_MPM_DCACHE_EVICTS, socket_perf.dcache.evictions); CSR_READ_64(VX_CSR_MPM_DCACHE_BANK_ST, socket_perf.dcache.bank_stalls); CSR_READ_64(VX_CSR_MPM_DCACHE_MSHR_ST, socket_perf.dcache.mshr_stalls); - + } + } break; + case VX_DCR_MPM_CLASS_L2CACHE: { + auto cluster_perf = core_->socket()->cluster()->perf_stats(); + switch (addr) { CSR_READ_64(VX_CSR_MPM_L2CACHE_READS, cluster_perf.l2cache.reads); CSR_READ_64(VX_CSR_MPM_L2CACHE_WRITES, cluster_perf.l2cache.writes); CSR_READ_64(VX_CSR_MPM_L2CACHE_MISS_R, cluster_perf.l2cache.read_misses); CSR_READ_64(VX_CSR_MPM_L2CACHE_MISS_W, cluster_perf.l2cache.write_misses); + CSR_READ_64(VX_CSR_MPM_L2CACHE_EVICTS, cluster_perf.l2cache.evictions); CSR_READ_64(VX_CSR_MPM_L2CACHE_BANK_ST, cluster_perf.l2cache.bank_stalls); CSR_READ_64(VX_CSR_MPM_L2CACHE_MSHR_ST, cluster_perf.l2cache.mshr_stalls); - + } + } break; + case VX_DCR_MPM_CLASS_L3CACHE: { + switch (addr) { CSR_READ_64(VX_CSR_MPM_L3CACHE_READS, proc_perf.l3cache.reads); CSR_READ_64(VX_CSR_MPM_L3CACHE_WRITES, proc_perf.l3cache.writes); CSR_READ_64(VX_CSR_MPM_L3CACHE_MISS_R, proc_perf.l3cache.read_misses); CSR_READ_64(VX_CSR_MPM_L3CACHE_MISS_W, proc_perf.l3cache.write_misses); + CSR_READ_64(VX_CSR_MPM_L3CACHE_EVICTS, proc_perf.l3cache.evictions); CSR_READ_64(VX_CSR_MPM_L3CACHE_BANK_ST, proc_perf.l3cache.bank_stalls); CSR_READ_64(VX_CSR_MPM_L3CACHE_MSHR_ST, proc_perf.l3cache.mshr_stalls); - + } + } break; + case VX_DCR_MPM_CLASS_MEM: { + auto lmem_perf = core_->local_mem()->perf_stats(); + uint64_t coalescer_misses = 0; + for (uint i = 0; i < VX_CFG_NUM_LSU_BLOCKS; ++i) { + coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses; + } + switch (addr) { CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads); CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes); CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency); CSR_READ_64(VX_CSR_MPM_MEM_BANK_ST, proc_perf.memsim.bank_stalls); - - CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses); - CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads); CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes); CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls); + CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses); } } break; #ifdef VX_CFG_EXT_TCU_ENABLE diff --git a/sim/simx/mem/cache.cpp b/sim/simx/mem/cache.cpp index e4c98cf93b..112f08a505 100644 --- a/sim/simx/mem/cache.cpp +++ b/sim/simx/mem/cache.cpp @@ -112,11 +112,13 @@ struct line_t { uint32_t lru_ctr; bool valid; bool dirty; + uint64_t dirty_mask; // per-byte dirty bits (DIRTY_BYTES model) std::shared_ptr data; // line bytes void reset() { valid = false; dirty = false; + dirty_mask = 0; lru_ctr = 0; data.reset(); } @@ -423,7 +425,14 @@ class MSHR { mshr_entry_t &replay(uint32_t id) { auto &root_entry = entries_.at(id); assert(root_entry.bank_req.type == bank_req_t::Core); - assert(ready_reqs_ == 0); + // A prior fill's replay batch may still be draining: a stalled fill + // (writeback egress backpressure) can sit in the pipe while a second + // fill is admitted behind it, so two fills' replays can be live at once. + // This is safe — the MSHR coalesces misses, so the two fills are for + // distinct lines and this replay marks only its own line's waiters; the + // accumulated ready_reqs_ stays correct and dequeue preserves per-line + // read-before-write ordering. Double-replaying the same fill is caught by + // the Core-type assert above. for (auto &entry : entries_) { if (entry.bank_req.type == bank_req_t::Core && entry.set_id == root_entry.set_id && entry.addr_tag == root_entry.addr_tag) { entry.bank_req.type = bank_req_t::Replay; @@ -768,6 +777,7 @@ class CacheBank : public SimObject { line_merge(hit_line, store_block, byteen); if (config_.write_back) { hit_line.dirty = true; + hit_line.dirty_mask |= byteen; } else { // Write-through: emit a write of the merged word downstream. MemReq w; @@ -848,7 +858,7 @@ class CacheBank : public SimObject { wb.hart_id = bank_req.hart_id; wb.uuid = bank_req.uuid; wb.data = line.data; - wb.byteen = ~uint64_t(0) >> (64 - VX_CFG_MEM_BLOCK_SIZE); + wb.byteen = line.dirty_mask; this->mem_req_out.send(wb); DT(3, this->name() << " amo-probe-wb: " << wb); ++perf_stats_.evictions; @@ -857,6 +867,7 @@ class CacheBank : public SimObject { auto &line = set.lines.at(hit_id); line.valid = false; line.dirty = false; + line.dirty_mask = 0; } // Forward AMO downstream. Tag is rewritten so the response @@ -911,7 +922,7 @@ class CacheBank : public SimObject { wb.hart_id = bank_req.hart_id; wb.uuid = bank_req.uuid; wb.data = victim_line.data; - wb.byteen = ~uint64_t(0) >> (64 - VX_CFG_MEM_BLOCK_SIZE); + wb.byteen = victim_line.dirty_mask; this->mem_req_out.send(wb); DT(3, this->name() << " writeback: " << wb); ++perf_stats_.evictions; @@ -920,6 +931,7 @@ class CacheBank : public SimObject { victim_line.tag = addr_tag; victim_line.lru_ctr = 0; victim_line.dirty = false; + victim_line.dirty_mask = 0; victim_line.data = bank_req.data; mshr_.replay(bank_req.mshr_id); pipe_req_->pop(); @@ -994,8 +1006,10 @@ class CacheBank : public SimObject { assert(bank_req.skip_core_rsp && "WT replay without pre-sent store"); } line_merge(hit_line, bank_req.data, bank_req.byteen); - if (config_.write_back) + if (config_.write_back) { hit_line.dirty = true; + hit_line.dirty_mask |= bank_req.byteen; + } #if VX_CFG_EXT_A_ENABLED // Write-back write-miss replay reaching the LLC tag array: // break other harts' reservations. For the WT wt-merge replay, @@ -1055,6 +1069,7 @@ class CacheBank : public SimObject { line_merge(hit_line, bank_req.data, bank_req.byteen); if (config_.write_back) { hit_line.dirty = true; + hit_line.dirty_mask |= bank_req.byteen; } else { MemReq w; w.addr = params_.mem_addr(bank_id_, set_id, addr_tag); @@ -1198,11 +1213,12 @@ class CacheBank : public SimObject { mem_req.addr = params_.mem_addr(bank_id_, flush_set_idx_, line.tag); mem_req.op = MemOp::ST; mem_req.data = line.data; - mem_req.byteen = ~uint64_t(0) >> (64 - VX_CFG_MEM_BLOCK_SIZE); + mem_req.byteen = line.dirty_mask; this->mem_req_out.send(mem_req); DT(3, this->name() << " flush-wb: " << mem_req); ++perf_stats_.evictions; line.dirty = false; + line.dirty_mask = 0; } ++flush_way_idx_; } diff --git a/sim/simx/mem/mmu_tlb.h b/sim/simx/mem/mmu_tlb.h index ba9aca8a82..f751a7cdd6 100644 --- a/sim/simx/mem/mmu_tlb.h +++ b/sim/simx/mem/mmu_tlb.h @@ -17,7 +17,7 @@ namespace vortex { // Per-core TLB. Small fully-associative CAM of {vpn → ppn} translations -// with MRU-style eviction. Tracks MMU perf counters (VX_DCR_MPM_CLASS_VM). +// with MRU-style eviction. Tracks MMU perf counters (VX_DCR_MPM_CLASS_MEM). class Tlb { public: explicit Tlb(uint32_t size = VX_CFG_TLB_SIZE); diff --git a/sim/xrtsim/xrt_sim.cpp b/sim/xrtsim/xrt_sim.cpp index 4b48214e9b..21b9bff275 100644 --- a/sim/xrtsim/xrt_sim.cpp +++ b/sim/xrtsim/xrt_sim.cpp @@ -36,6 +36,8 @@ #include #include +#include +#include #include #include #include @@ -262,8 +264,20 @@ class xrt_sim::Impl { // launch execution thread future_ = std::async(std::launch::async, [&]{ while (!stop_) { - std::lock_guard guard(mutex_); - this->tick(); + // Give host-side MMIO/mem calls absolute priority: while any host op + // is pending, fully back off (don't even contend for mutex_). Without + // this the free-running ticker re-acquires the lock in a tight loop and + // starves the host thread — it could not complete cp_init/cp_submit and + // the sim ticked forever (dumping VCD unboundedly), hanging the run. + if (host_waiters_.load(std::memory_order_acquire) != 0) { + std::this_thread::yield(); + continue; + } + { + std::lock_guard guard(mutex_); + this->tick(); + } + std::this_thread::yield(); } }); @@ -283,7 +297,7 @@ class xrt_sim::Impl { } int mem_write(uint32_t bank_id, uint64_t addr, uint64_t size, const void* data) { - std::lock_guard guard(mutex_); + HostLock guard(*this); if (bank_id >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS) return -1; @@ -293,7 +307,7 @@ class xrt_sim::Impl { } int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data) { - std::lock_guard guard(mutex_); + HostLock guard(*this); if (bank_id >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS) return -1; @@ -303,7 +317,7 @@ class xrt_sim::Impl { } int mem_copy(uint32_t bank_id_dest , uint32_t bank_id_src, uint64_t dest_addr, uint64_t src_addr, uint64_t size) { - std::lock_guard guard(mutex_); + HostLock guard(*this); if( bank_id_dest >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS || bank_id_src >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS) return -1; uint64_t dest_base_addr = bank_id_dest * mem_bank_size_ + dest_addr; @@ -315,29 +329,29 @@ class xrt_sim::Impl { // ----- Host memory (XRT host-only BOs; reached by m_axi_host) ----- int host_mem_alloc(uint64_t size, uint64_t* addr) { - std::lock_guard guard(mutex_); + HostLock guard(*this); return host_alloc_->allocate(size, addr); } int host_mem_free(uint64_t addr) { - std::lock_guard guard(mutex_); + HostLock guard(*this); return host_alloc_->release(addr); } int host_mem_write(uint64_t addr, uint64_t size, const void* data) { - std::lock_guard guard(mutex_); + HostLock guard(*this); host_ram_->write(data, addr, size); return 0; } int host_mem_read(uint64_t addr, uint64_t size, void* data) { - std::lock_guard guard(mutex_); + HostLock guard(*this); host_ram_->read(data, addr, size); return 0; } int register_write(uint32_t offset, uint32_t value) { - std::lock_guard guard(mutex_); + HostLock guard(*this); // write address device_->s_axi_ctrl_awvalid = 1; @@ -369,7 +383,7 @@ class xrt_sim::Impl { } int register_read(uint32_t offset, uint32_t* value) { - std::lock_guard guard(mutex_); + HostLock guard(*this); // read address device_->s_axi_ctrl_arvalid = 1; device_->s_axi_ctrl_araddr = offset; @@ -798,6 +812,24 @@ class xrt_sim::Impl { bool stop_; std::mutex mutex_; + // Count of host threads waiting on / holding mutex_. The free-running sim + // ticker backs off whenever this is non-zero so host MMIO/mem ops are never + // starved by the background ticker. + std::atomic host_waiters_{0}; + + // RAII guard for every host-side entry point: registers intent (so the + // ticker yields) before acquiring mutex_, and clears it after releasing. + struct HostLock { + Impl& impl_; + explicit HostLock(Impl& impl) : impl_(impl) { + impl_.host_waiters_.fetch_add(1, std::memory_order_acquire); + impl_.mutex_.lock(); + } + ~HostLock() { + impl_.mutex_.unlock(); + impl_.host_waiters_.fetch_sub(1, std::memory_order_release); + } + }; std::list pending_mem_reqs_[VX_CFG_PLATFORM_MEMORY_NUM_BANKS]; diff --git a/sw/runtime/common/legacy_perf.cpp b/sw/runtime/common/legacy_perf.cpp index b3d49630c0..b002a6c318 100644 --- a/sw/runtime/common/legacy_perf.cpp +++ b/sw/runtime/common/legacy_perf.cpp @@ -141,15 +141,6 @@ struct CoreCounters { // memory uint64_t mem_reads = 0; uint64_t mem_writes = 0; - - // VM (icache + dcache MMU summed in hardware) — fields always present; - // populated only when the device reports VM (VX_CAPS_VM_SUPPORT). - uint64_t tlb_reads = 0; - uint64_t tlb_hits = 0; - uint64_t tlb_misses = 0; - uint64_t tlb_evicts = 0; - uint64_t ptw_walks = 0; - uint64_t ptw_latency = 0; }; struct CacheCounters { @@ -288,17 +279,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) { CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LOAD_LT, core_id, &c.load_lt), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_STORES, core_id, &c.stores), { return err; }); - if (vm_enabled) { - // VM/MMU lives in its own perf class (CLASS_VM), independent of CORE/MEM. - // Hardware sums icache + dcache MMU counters into one bank. - CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_READS, core_id, &c.tlb_reads), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_HITS, core_id, &c.tlb_hits), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_MISSES, core_id, &c.tlb_misses), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_EVICTS, core_id, &c.tlb_evicts), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_PTW_WALKS, core_id, &c.ptw_walks), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_PTW_LATENCY, core_id, &c.ptw_latency), { return err; }); - } - if (num_cores > 1) { // Per-Core report const uint64_t cycles = c.cycles; @@ -349,13 +329,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) { perf_print_core(stream, core_id, "memory: ifetch_lat=%.2f, load_lat=%.2f, loads=%" PRIu64 ", stores=%" PRIu64, ifetch_avg_lt, load_avg_lt, c.loads, c.stores); - if (vm_enabled) { - const int tlb_hit_pct = calc_percent(c.tlb_hits, c.tlb_reads); - const double ptw_avg_lt = safe_div((double)c.ptw_latency, (double)c.ptw_walks); - perf_print_core(stream, core_id, "vm: tlb_reads=%" PRIu64 ", hit=%d%%, evicts=%" PRIu64 ", ptw_walks=%" PRIu64 ", ptw_avg_lat=%.2f", - c.tlb_reads, tlb_hit_pct, c.tlb_evicts, c.ptw_walks, ptw_avg_lt); - } - perf_print_core(stream, core_id, "instrs=%" PRIu64 ", cycles=%" PRIu64 ", IPC=%.3f", c.instrs, c.cycles, ipc); } @@ -392,21 +365,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) { tot.loads += c.loads; tot.load_lt += c.load_lt; tot.stores += c.stores; - - if (vm_enabled) { - tot.tlb_reads += c.tlb_reads; - tot.tlb_hits += c.tlb_hits; - tot.tlb_misses += c.tlb_misses; - tot.tlb_evicts += c.tlb_evicts; - tot.ptw_walks += c.ptw_walks; - tot.ptw_latency += c.ptw_latency; - } } - // Query global MPM counters - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_READS, 0, &tot.mem_reads), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_WRITES, 0, &tot.mem_writes), { return err; }); - // Core Summary uint64_t tot_cycles_wide = tot.cycles * issue_width; @@ -451,138 +411,131 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) { // Memory report const double tot_ifetch_avg_lt = safe_div((double)tot.ifetch_lt, (double)tot.ifetches); const double tot_load_avg_lt = safe_div((double)tot.load_lt, (double)tot.loads); - uint64_t read_bytes = tot.mem_reads * CACHE_BLOCK_SIZE; - uint64_t write_bytes = tot.mem_writes * CACHE_BLOCK_SIZE; - perf_print(stream, "memory: ifetch_lat=%.2f, load_lat=%.2f, loads=%" PRIu64 ", stores=%" PRIu64 ", read_bytes=%" PRIu64 ", write_bytes=%" PRIu64, - tot_ifetch_avg_lt, tot_load_avg_lt, tot.loads, tot.stores, read_bytes, write_bytes); - - if (vm_enabled) { - const int tot_tlb_hit_pct = calc_percent(tot.tlb_hits, tot.tlb_reads); - const double tot_ptw_avg_lt = safe_div((double)tot.ptw_latency, (double)tot.ptw_walks); - perf_print(stream, "vm: tlb_reads=%" PRIu64 ", hit=%d%%, evicts=%" PRIu64 ", ptw_walks=%" PRIu64 ", ptw_avg_lat=%.2f", - tot.tlb_reads, tot_tlb_hit_pct, tot.tlb_evicts, tot.ptw_walks, tot_ptw_avg_lt); - } + perf_print(stream, "memory: ifetch_lat=%.2f, load_lat=%.2f, loads=%" PRIu64 ", stores=%" PRIu64, + tot_ifetch_avg_lt, tot_load_avg_lt, tot.loads, tot.stores); } break; - case VX_DCR_MPM_CLASS_MEM: { - const uint64_t num_sockets = (num_cores + socket_size - 1) / socket_size; - const uint64_t cores_per_cluster = (num_cores + num_clusters - 1) / num_clusters; - - CacheCounters icache_tot, dcache_tot, l2_tot, l3_tot; - CacheCounters lmem_tot; - uint64_t coalescer_miss_tot = 0; - - // Per-Core Local Memory & Coalescer (Print Core First) - for (uint32_t core_id = 0; core_id < num_cores; ++core_id) { - if (lmem_en) { - uint64_t r = 0, w = 0, bst = 0; - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_READS, core_id, &r), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_WRITES, core_id, &w), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_BANK_ST, core_id, &bst), { return err; }); - - lmem_tot.reads += r; - lmem_tot.writes += w; - lmem_tot.bank_st += bst; - perf_print_core(stream, core_id, "lmem: reqs=%" PRIu64 ", bank_stalls=%" PRIu64 " (utility=%d%%)", - r + w, bst, calc_utility(r + w, bst)); - } - - if (dcache_en) { - uint64_t cm = 0; - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_COALESCER_MISS, core_id, &cm), { return err; }); - coalescer_miss_tot += cm; - perf_print_core(stream, core_id, "coalescer: misses=%" PRIu64, cm); - } - } - - // Per-Socket L1 - for (uint32_t s = 0; s < num_sockets; ++s) { - uint32_t rep_core = s * (uint32_t)socket_size; - if (rep_core >= num_cores) - continue; - - if (icache_en) { + // Each cache level is now its own MPM class; --perf selects which to dump. + case VX_DCR_MPM_CLASS_ICACHE: { + if (icache_en) { + const uint64_t num_sockets = (num_cores + socket_size - 1) / socket_size; + for (uint32_t s = 0; s < num_sockets; ++s) { + uint32_t rep_core = s * (uint32_t)socket_size; + if (rep_core >= num_cores) + continue; uint64_t r = 0, m = 0, st = 0; CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_ICACHE_READS, rep_core, &r), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_ICACHE_MISS_R, rep_core, &m), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_ICACHE_MSHR_ST, rep_core, &st), { return err; }); - - icache_tot.reads += r; - icache_tot.miss_r += m; - icache_tot.mshr_st += st; - perf_print_core(stream, rep_core, "icache: reads=%" PRIu64 ", miss=%" PRIu64 " (hit=%d%%), mshr_st=%" PRIu64 " (utility=%d%%)", r, m, calc_ratio(m, r), st, calc_utility(m, st)); } - if (dcache_en) { - uint64_t r = 0, w = 0, mr = 0, mw = 0, bst = 0, mst = 0; + } else { + perf_print(stream, "icache: disabled"); + } + } break; + + case VX_DCR_MPM_CLASS_DCACHE: { + if (dcache_en) { + const uint64_t num_sockets = (num_cores + socket_size - 1) / socket_size; + for (uint32_t s = 0; s < num_sockets; ++s) { + uint32_t rep_core = s * (uint32_t)socket_size; + if (rep_core >= num_cores) + continue; + uint64_t r = 0, w = 0, mr = 0, mw = 0, ev = 0, bst = 0, mst = 0; CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_READS, rep_core, &r), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_WRITES, rep_core, &w), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_MISS_R, rep_core, &mr), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_MISS_W, rep_core, &mw), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_EVICTS, rep_core, &ev), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_BANK_ST, rep_core, &bst), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_MSHR_ST, rep_core, &mst), { return err; }); - - dcache_tot.reads += r; - dcache_tot.writes += w; - dcache_tot.miss_r += mr; - dcache_tot.miss_w += mw; - dcache_tot.bank_st += bst; - dcache_tot.mshr_st += mst; - - perf_print_core(stream, rep_core, "dcache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), bank_st=%" PRIu64 " (utility=%d%%)", - r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), bst, calc_utility(r + w, bst)); + perf_print_core(stream, rep_core, "dcache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), evicts=%" PRIu64 ", bank_st=%" PRIu64 " (utility=%d%%)", + r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), ev, bst, calc_utility(r + w, bst)); } + } else { + perf_print(stream, "dcache: disabled"); } + } break; - // Per-Cluster L2 + case VX_DCR_MPM_CLASS_L2CACHE: { if (l2_en) { + const uint64_t cores_per_cluster = (num_cores + num_clusters - 1) / num_clusters; for (uint32_t c = 0; c < num_clusters; ++c) { uint32_t rep_core = c * (uint32_t)cores_per_cluster; if (rep_core >= num_cores) continue; - uint64_t r = 0, w = 0, mr = 0, mw = 0, bst = 0, mst = 0; + uint64_t r = 0, w = 0, mr = 0, mw = 0, ev = 0, bst = 0, mst = 0; CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_READS, rep_core, &r), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_WRITES, rep_core, &w), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_MISS_R, rep_core, &mr), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_MISS_W, rep_core, &mw), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_EVICTS, rep_core, &ev), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_BANK_ST, rep_core, &bst), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_MSHR_ST, rep_core, &mst), { return err; }); - - l2_tot.reads += r; - l2_tot.writes += w; - l2_tot.miss_r += mr; - l2_tot.miss_w += mw; - l2_tot.bank_st += bst; - l2_tot.mshr_st += mst; - - perf_print_core(stream, rep_core, "l2cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%)", - r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w)); + perf_print_core(stream, rep_core, "l2cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), evicts=%" PRIu64, + r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), ev); } + } else { + perf_print(stream, "l2cache: disabled"); } + } break; - // Global L3 + case VX_DCR_MPM_CLASS_L3CACHE: { if (l3_en) { - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_READS, 0, &l3_tot.reads), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_WRITES, 0, &l3_tot.writes), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_R, 0, &l3_tot.miss_r), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_W, 0, &l3_tot.miss_w), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_BANK_ST, 0, &l3_tot.bank_st), { return err; }); - CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MSHR_ST, 0, &l3_tot.mshr_st), { return err; }); - - perf_print(stream, "l3cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%)", - l3_tot.reads + l3_tot.writes, l3_tot.miss_r, calc_ratio(l3_tot.miss_r, l3_tot.reads), - l3_tot.miss_w, calc_ratio(l3_tot.miss_w, l3_tot.writes)); + uint64_t r = 0, w = 0, mr = 0, mw = 0, ev = 0, bst = 0, mst = 0; + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_READS, 0, &r), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_WRITES, 0, &w), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_R, 0, &mr), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_W, 0, &mw), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_EVICTS, 0, &ev), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_BANK_ST, 0, &bst), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MSHR_ST, 0, &mst), { return err; }); + perf_print(stream, "l3cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), evicts=%" PRIu64, + r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), ev); + } else { + perf_print(stream, "l3cache: disabled"); } + } break; - // Global DRAM + case VX_DCR_MPM_CLASS_MEM: { + // Per-core local memory + coalescer + VM/MMU. + for (uint32_t core_id = 0; core_id < num_cores; ++core_id) { + if (lmem_en) { + uint64_t r = 0, w = 0, bst = 0; + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_READS, core_id, &r), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_WRITES, core_id, &w), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_BANK_ST, core_id, &bst), { return err; }); + perf_print_core(stream, core_id, "lmem: reqs=%" PRIu64 ", bank_stalls=%" PRIu64 " (utility=%d%%)", + r + w, bst, calc_utility(r + w, bst)); + } + // The coalescer lives on the LSU->memory path; it exists when + // NUM_LSU_LANES > 1 && LSU_LINE_SIZE > XLENB, independent of the dcache. + // The counter is hard-zeroed in hardware when no coalescer is present. + { + uint64_t cm = 0; + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_COALESCER_MISS, core_id, &cm), { return err; }); + perf_print_core(stream, core_id, "coalescer: misses=%" PRIu64, cm); + } + // VM/MMU (per-core; hardware sums icache + dcache MMU counters). + if (vm_enabled) { + uint64_t reads = 0, hits = 0, evicts = 0, walks = 0, lat = 0; + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_TLB_READS, core_id, &reads), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_TLB_HITS, core_id, &hits), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_TLB_EVICTS, core_id, &evicts), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_PTW_WALKS, core_id, &walks), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_PTW_LATENCY, core_id, &lat), { return err; }); + perf_print_core(stream, core_id, "vm: tlb_reads=%" PRIu64 ", hit=%d%%, evicts=%" PRIu64 ", ptw_walks=%" PRIu64 ", ptw_avg_lat=%.2f", + reads, calc_percent(hits, reads), evicts, walks, safe_div((double)lat, (double)walks)); + } + } + // Global off-chip memory. { uint64_t r = 0, w = 0, lat = 0, bst = 0; CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_READS, 0, &r), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_WRITES, 0, &w), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_LT, 0, &lat), { return err; }); CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_BANK_ST, 0, &bst), { return err; }); - double avg_lat = safe_div((double)lat, (double)r); perf_print(stream, "memory: reqs=%" PRIu64 " (r=%" PRIu64 ", w=%" PRIu64 "), lat=%.1f cyc, bank_st=%" PRIu64 " (utility=%d%%)", r + w, r, w, avg_lat, bst, calc_utility(r + w, bst)); diff --git a/tests/hip/Makefile b/tests/hip/Makefile index 725b122ece..a6ee89b05e 100644 --- a/tests/hip/Makefile +++ b/tests/hip/Makefile @@ -2,10 +2,13 @@ ROOT_DIR := $(realpath ../..) include $(ROOT_DIR)/config.mk # --- master list ------------------------------------------------------ -TESTS := vecadd sgemm +TESTS := vecadd sgemm histogram atomicreduce # --- common exclude list --------------------------------------------- -EXCLUDE := +# histogram/atomicreduce use atomicAdd (RVA amo*.w); run them only with the +# A extension (CONFIGS="-DVX_CFG_EXT_A_ENABLE"), so exclude them from the +# default sweep which builds for the no-atomics config. +EXCLUDE := histogram atomicreduce # --- per-backend exclude lists --------------------------------------- EXCLUDE_simx := diff --git a/tests/hip/atomicreduce/Makefile b/tests/hip/atomicreduce/Makefile new file mode 100644 index 0000000000..af9bb8686f --- /dev/null +++ b/tests/hip/atomicreduce/Makefile @@ -0,0 +1,15 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := atomicreduce + +SRC_DIR := $(VORTEX_HOME)/tests/hip/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +OPTS ?= -n1024 + +include ../common.mk diff --git a/tests/hip/atomicreduce/common.h b/tests/hip/atomicreduce/common.h new file mode 100644 index 0000000000..2a816d643b --- /dev/null +++ b/tests/hip/atomicreduce/common.h @@ -0,0 +1,4 @@ +#ifndef COMMON_H +#define COMMON_H + +#endif // COMMON_H diff --git a/tests/hip/atomicreduce/main.cpp b/tests/hip/atomicreduce/main.cpp new file mode 100644 index 0000000000..9d863b4ecb --- /dev/null +++ b/tests/hip/atomicreduce/main.cpp @@ -0,0 +1,112 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define KERNEL_NAME "atomicreduce" + +#define HIP_CHECK(_expr) \ + do { \ + hipError_t _err = (_expr); \ + if (_err != hipSuccess) { \ + fprintf(stderr, "HIP Error: '%s' returned %d (%s)\n", \ + #_expr, (int)_err, hipGetErrorString(_err)); \ + exit(-1); \ + } \ + } while (0) + +// Every thread accumulates its element into a single global counter. +// Maximum-contention atomicAdd, lowering to a hardware RVA amoadd.w on +// Vortex (requires the A extension). +__global__ void atomicreduce(const int* data, int* result, int N) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid < N) { + atomicAdd(&result[0], data[gid]); + } +} + +static uint32_t size = 1024; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char** argv) { + int c; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case 'n': size = atoi(optarg); break; + case 'h': show_usage(); exit(0); + default: show_usage(); exit(-1); + } + } + printf("Workload size=%u\n", size); +} + +int main(int argc, char** argv) { + parse_args(argc, argv); + + std::vector h_data(size); + int h_ref = 0; + + srand(50); + for (uint32_t i = 0; i < size; ++i) { + h_data[i] = rand() % 1000; + h_ref += h_data[i]; + } + int h_result = 0; + + printf("Allocate device buffers\n"); + int *d_data = nullptr, *d_result = nullptr; + HIP_CHECK(hipMalloc((void**)&d_data, size * sizeof(int))); + HIP_CHECK(hipMalloc((void**)&d_result, sizeof(int))); + + printf("Upload source buffers\n"); + HIP_CHECK(hipMemcpy(d_data, h_data.data(), size * sizeof(int), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_result, &h_result, sizeof(int), hipMemcpyHostToDevice)); + + printf("Execute the kernel '%s'\n", KERNEL_NAME); + int dev_id = 0; + HIP_CHECK(hipGetDevice(&dev_id)); + hipDeviceProp_t dev_props{}; + HIP_CHECK(hipGetDeviceProperties(&dev_props, dev_id)); + uint32_t block_size = 64; + if ((int)block_size > dev_props.maxThreadsPerBlock) + block_size = (uint32_t)dev_props.maxThreadsPerBlock; + const uint32_t grid_size = (size + block_size - 1) / block_size; + printf("block_size=%u (device max=%d)\n", block_size, dev_props.maxThreadsPerBlock); + + auto t0 = std::chrono::high_resolution_clock::now(); + atomicreduce<<>>(d_data, d_result, (int)size); + HIP_CHECK(hipDeviceSynchronize()); + auto t1 = std::chrono::high_resolution_clock::now(); + printf("Elapsed time: %lld ms\n", + (long long)std::chrono::duration_cast(t1 - t0).count()); + + printf("Download destination buffer\n"); + HIP_CHECK(hipMemcpy(&h_result, d_result, sizeof(int), hipMemcpyDeviceToHost)); + + printf("Verify result\n"); + int errors = 0; + if (h_result != h_ref) { + printf("*** error: expected=%d, actual=%d\n", h_ref, h_result); + ++errors; + } + + HIP_CHECK(hipFree(d_data)); + HIP_CHECK(hipFree(d_result)); + + if (errors == 0) { + printf("PASSED!\n"); + } else { + printf("FAILED! - %d errors\n", errors); + } + return errors; +} diff --git a/tests/hip/histogram/Makefile b/tests/hip/histogram/Makefile new file mode 100644 index 0000000000..4c6eb2c5dc --- /dev/null +++ b/tests/hip/histogram/Makefile @@ -0,0 +1,15 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := histogram + +SRC_DIR := $(VORTEX_HOME)/tests/hip/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +OPTS ?= -n1024 + +include ../common.mk diff --git a/tests/hip/histogram/common.h b/tests/hip/histogram/common.h new file mode 100644 index 0000000000..af56d37b33 --- /dev/null +++ b/tests/hip/histogram/common.h @@ -0,0 +1,8 @@ +#ifndef COMMON_H +#define COMMON_H + +#ifndef NUM_BINS +#define NUM_BINS 16 +#endif + +#endif // COMMON_H diff --git a/tests/hip/histogram/main.cpp b/tests/hip/histogram/main.cpp new file mode 100644 index 0000000000..3155a05760 --- /dev/null +++ b/tests/hip/histogram/main.cpp @@ -0,0 +1,115 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define KERNEL_NAME "histogram" + +#define HIP_CHECK(_expr) \ + do { \ + hipError_t _err = (_expr); \ + if (_err != hipSuccess) { \ + fprintf(stderr, "HIP Error: '%s' returned %d (%s)\n", \ + #_expr, (int)_err, hipGetErrorString(_err)); \ + exit(-1); \ + } \ + } while (0) + +// Each thread atomically increments the bin its element maps to. +// atomicAdd on an int* lowers to a hardware RVA amoadd.w on Vortex +// (requires the A extension). +__global__ void histogram(const int* data, int* bins, int N) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid < N) { + atomicAdd(&bins[data[gid] % NUM_BINS], 1); + } +} + +static uint32_t size = 1024; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char** argv) { + int c; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case 'n': size = atoi(optarg); break; + case 'h': show_usage(); exit(0); + default: show_usage(); exit(-1); + } + } + printf("Workload size=%u, bins=%d\n", size, NUM_BINS); +} + +int main(int argc, char** argv) { + parse_args(argc, argv); + + std::vector h_data(size); + std::vector h_bins(NUM_BINS, 0); + std::vector h_ref(NUM_BINS, 0); + + srand(50); + for (uint32_t i = 0; i < size; ++i) { + h_data[i] = rand() % 1000; + h_ref[h_data[i] % NUM_BINS] += 1; + } + + printf("Allocate device buffers\n"); + int *d_data = nullptr, *d_bins = nullptr; + HIP_CHECK(hipMalloc((void**)&d_data, size * sizeof(int))); + HIP_CHECK(hipMalloc((void**)&d_bins, NUM_BINS * sizeof(int))); + + printf("Upload source buffers\n"); + HIP_CHECK(hipMemcpy(d_data, h_data.data(), size * sizeof(int), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemset(d_bins, 0, NUM_BINS * sizeof(int))); + + printf("Execute the kernel '%s'\n", KERNEL_NAME); + int dev_id = 0; + HIP_CHECK(hipGetDevice(&dev_id)); + hipDeviceProp_t dev_props{}; + HIP_CHECK(hipGetDeviceProperties(&dev_props, dev_id)); + uint32_t block_size = 64; + if ((int)block_size > dev_props.maxThreadsPerBlock) + block_size = (uint32_t)dev_props.maxThreadsPerBlock; + const uint32_t grid_size = (size + block_size - 1) / block_size; + printf("block_size=%u (device max=%d)\n", block_size, dev_props.maxThreadsPerBlock); + + auto t0 = std::chrono::high_resolution_clock::now(); + histogram<<>>(d_data, d_bins, (int)size); + HIP_CHECK(hipDeviceSynchronize()); + auto t1 = std::chrono::high_resolution_clock::now(); + printf("Elapsed time: %lld ms\n", + (long long)std::chrono::duration_cast(t1 - t0).count()); + + printf("Download destination buffer\n"); + HIP_CHECK(hipMemcpy(h_bins.data(), d_bins, NUM_BINS * sizeof(int), hipMemcpyDeviceToHost)); + + printf("Verify result\n"); + int errors = 0; + for (int i = 0; i < NUM_BINS; ++i) { + if (h_bins[i] != h_ref[i]) { + if (errors < 100) + printf("*** error: bin[%d] expected=%d, actual=%d\n", i, h_ref[i], h_bins[i]); + ++errors; + } + } + + HIP_CHECK(hipFree(d_data)); + HIP_CHECK(hipFree(d_bins)); + + if (errors == 0) { + printf("PASSED!\n"); + } else { + printf("FAILED! - %d errors\n", errors); + } + return errors; +} diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 2cc2ff1009..0c849432b1 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -5,15 +5,20 @@ include $(ROOT_DIR)/config.mk TESTS := \ vecadd sgemm conv3 psort saxpy sfilter sgemm2 sgemm3 psum oclprintf \ dotproduct transpose spmv stencil lbm nearn guassian kmeans \ - blackscholes bfs copybuf + blackscholes bfs copybuf histogram atomicreduce # --- common exclude list --------------------------------------------- +# histogram/atomicreduce use atomic_add (RVA amo*.w); run them only with the +# A extension (CONFIGS="-DVX_CFG_EXT_A_ENABLE"), so exclude them from the +# default sweep which builds for the no-atomics config. EXCLUDE := \ transpose \ kmeans \ bfs \ lbm \ - copybuf + copybuf \ + histogram \ + atomicreduce # --- per-backend exclude lists --------------------------------------- EXCLUDE_simx := diff --git a/tests/opencl/atomicreduce/Makefile b/tests/opencl/atomicreduce/Makefile new file mode 100644 index 0000000000..196382be22 --- /dev/null +++ b/tests/opencl/atomicreduce/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := atomicreduce + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= -n1024 + +include ../common.mk diff --git a/tests/opencl/atomicreduce/common.h b/tests/opencl/atomicreduce/common.h new file mode 100644 index 0000000000..2a816d643b --- /dev/null +++ b/tests/opencl/atomicreduce/common.h @@ -0,0 +1,4 @@ +#ifndef COMMON_H +#define COMMON_H + +#endif // COMMON_H diff --git a/tests/opencl/atomicreduce/kernel.cl b/tests/opencl/atomicreduce/kernel.cl new file mode 100644 index 0000000000..fd2f03e868 --- /dev/null +++ b/tests/opencl/atomicreduce/kernel.cl @@ -0,0 +1,10 @@ +#include "common.h" + +// Atomic reduction: every work-item accumulates its element into a single +// global counter. Maximum-contention atomic_add, lowering to a hardware RVA +// amoadd.w on Vortex (requires the A extension). +__kernel void atomicreduce(__global const int* data, + __global int* result) { + int gid = get_global_id(0); + atomic_add(&result[0], data[gid]); +} diff --git a/tests/opencl/atomicreduce/main.cc b/tests/opencl/atomicreduce/main.cc new file mode 100644 index 0000000000..7ff0f74fab --- /dev/null +++ b/tests/opencl/atomicreduce/main.cc @@ -0,0 +1,174 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define KERNEL_NAME "atomicreduce" + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + fseek(fp , 0 , SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t*)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue commandQueue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem data_memobj = NULL; +cl_mem result_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (commandQueue) clReleaseCommandQueue(commandQueue); + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + if (data_memobj) clReleaseMemObject(data_memobj); + if (result_memobj) clReleaseMemObject(result_memobj); + if (context) clReleaseContext(context); + if (device_id) clReleaseDevice(device_id); + if (kernel_bin) free(kernel_bin); +} + +uint32_t size = 1024; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } + printf("Workload size=%d\n", size); +} + +int main (int argc, char **argv) { + parse_args(argc, argv); + + cl_platform_id platform_id; + size_t kernel_size; + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + printf("Create context\n"); + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + printf("Allocate device buffers\n"); + size_t data_bytes = size * sizeof(int); + data_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, data_bytes, NULL, &_err)); + result_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &_err)); + + printf("Create program from kernel source\n"); + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); + + CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); + + kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&result_memobj)); + + // Generate input values and the reference sum. + std::vector h_data(size); + int h_ref = 0; + for (uint32_t i = 0; i < size; ++i) { + h_data[i] = rand() % 1000; + h_ref += h_data[i]; + } + int h_result = 0; + + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, data_memobj, CL_TRUE, 0, data_bytes, h_data.data(), 0, NULL, NULL)); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, result_memobj, CL_TRUE, 0, sizeof(int), &h_result, 0, NULL, NULL)); + + printf("Execute the kernel\n"); + size_t global_work_size[1] = {size}; + size_t local_work_size[1] = {1}; + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, result_memobj, CL_TRUE, 0, sizeof(int), &h_result, 0, NULL, NULL)); + + printf("Verify result\n"); + int errors = 0; + if (h_result != h_ref) { + printf("*** error: expected=%d, actual=%d\n", h_ref, h_result); + ++errors; + } + if (0 == errors) { + printf("PASSED!\n"); + } else { + printf("FAILED! - %d errors\n", errors); + } + + cleanup(); + + return errors; +} diff --git a/tests/opencl/histogram/Makefile b/tests/opencl/histogram/Makefile new file mode 100644 index 0000000000..65b58c1f18 --- /dev/null +++ b/tests/opencl/histogram/Makefile @@ -0,0 +1,20 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := histogram + +SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cc + +common.h: $(SRC_DIR)/common.h + cp $< $@ + +kernel.cl: $(SRC_DIR)/kernel.cl + cp $< $@ + +KERNEL_SRCS := kernel.cl common.h + +OPTS ?= -n1024 + +include ../common.mk diff --git a/tests/opencl/histogram/common.h b/tests/opencl/histogram/common.h new file mode 100644 index 0000000000..af56d37b33 --- /dev/null +++ b/tests/opencl/histogram/common.h @@ -0,0 +1,8 @@ +#ifndef COMMON_H +#define COMMON_H + +#ifndef NUM_BINS +#define NUM_BINS 16 +#endif + +#endif // COMMON_H diff --git a/tests/opencl/histogram/kernel.cl b/tests/opencl/histogram/kernel.cl new file mode 100644 index 0000000000..9b1421d7df --- /dev/null +++ b/tests/opencl/histogram/kernel.cl @@ -0,0 +1,11 @@ +#include "common.h" + +// Histogram: each work-item atomically increments the bin its element maps +// to. atomic_add on a __global int lowers to a hardware RVA amoadd.w on +// Vortex (requires the A extension). +__kernel void histogram(__global const int* data, + __global int* bins) { + int gid = get_global_id(0); + int bin = data[gid] % NUM_BINS; + atomic_add(&bins[bin], 1); +} diff --git a/tests/opencl/histogram/main.cc b/tests/opencl/histogram/main.cc new file mode 100644 index 0000000000..d6b1d3b879 --- /dev/null +++ b/tests/opencl/histogram/main.cc @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define KERNEL_NAME "histogram" + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + fseek(fp , 0 , SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t*)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue commandQueue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem data_memobj = NULL; +cl_mem bins_memobj = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (commandQueue) clReleaseCommandQueue(commandQueue); + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + if (data_memobj) clReleaseMemObject(data_memobj); + if (bins_memobj) clReleaseMemObject(bins_memobj); + if (context) clReleaseContext(context); + if (device_id) clReleaseDevice(device_id); + if (kernel_bin) free(kernel_bin); +} + +uint32_t size = 1024; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } + printf("Workload size=%d, bins=%d\n", size, NUM_BINS); +} + +int main (int argc, char **argv) { + parse_args(argc, argv); + + cl_platform_id platform_id; + size_t kernel_size; + + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + printf("Create context\n"); + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + printf("Allocate device buffers\n"); + size_t data_bytes = size * sizeof(int); + size_t bins_bytes = NUM_BINS * sizeof(int); + data_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, data_bytes, NULL, &_err)); + bins_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_WRITE, bins_bytes, NULL, &_err)); + + printf("Create program from kernel source\n"); + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); + + CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); + + kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bins_memobj)); + + // Generate input values and the reference histogram. + std::vector h_data(size); + std::vector h_bins(NUM_BINS, 0); + std::vector h_ref(NUM_BINS, 0); + for (uint32_t i = 0; i < size; ++i) { + h_data[i] = rand() % 1000; + h_ref[h_data[i] % NUM_BINS] += 1; + } + + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, data_memobj, CL_TRUE, 0, data_bytes, h_data.data(), 0, NULL, NULL)); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, bins_memobj, CL_TRUE, 0, bins_bytes, h_bins.data(), 0, NULL, NULL)); + + printf("Execute the kernel\n"); + size_t global_work_size[1] = {size}; + size_t local_work_size[1] = {1}; + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, bins_memobj, CL_TRUE, 0, bins_bytes, h_bins.data(), 0, NULL, NULL)); + + printf("Verify result\n"); + int errors = 0; + for (int i = 0; i < NUM_BINS; ++i) { + if (h_bins[i] != h_ref[i]) { + if (errors < 100) + printf("*** error: bin[%d] expected=%d, actual=%d\n", i, h_ref[i], h_bins[i]); + ++errors; + } + } + if (0 == errors) { + printf("PASSED!\n"); + } else { + printf("FAILED! - %d errors\n", errors); + } + + cleanup(); + + return errors; +} diff --git a/tests/regression/amo/kernel.cpp b/tests/regression/amo/kernel.cpp index 79cb750df6..a513261a66 100644 --- a/tests/regression/amo/kernel.cpp +++ b/tests/regression/amo/kernel.cpp @@ -9,11 +9,10 @@ typedef void (*PFN_Kernel)(kernel_arg_t* __UNIFORM__ arg); -// Globally-unique hart id matching the simulator's make_hart_id(): -// (cid * VX_CFG_NUM_WARPS + wid) * VX_CFG_NUM_THREADS + tid. +// Globally-unique hart id (mhartid): (cid * NUM_WARPS + wid) * NUM_THREADS +// + tid, matching the LLC reservation table's make_hart_id. static inline uint32_t hart_id() { - return (vx_core_id() * vx_num_warps() + vx_warp_id()) * vx_num_threads() - + vx_thread_id(); + return (uint32_t)vx_hart_id(); } // 1) AMOADD hammer.