diff --git a/VX_config.toml b/VX_config.toml
index d7cb2f0b2a..1072c1250d 100644
--- a/VX_config.toml
+++ b/VX_config.toml
@@ -200,7 +200,7 @@ VX_CFG_L1_MEM_PORTS = "expr: min($VX_CFG_DCACHE_NUM_BANKS, $VX_CFG_PLATFORM_MEMO
 [l2cache]
 VX_CFG_L2_CACHE_SIZE = 1048576
 VX_CFG_L2_NUM_WAYS = 8
-VX_CFG_L2_WRITEBACK = 0
+VX_CFG_L2_WRITEBACK = 1
 VX_CFG_L2_DIRTYBYTES = "expr: $VX_CFG_L2_WRITEBACK"
 VX_CFG_L2_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_L2_MSHR_SIZE = 16
@@ -214,7 +214,7 @@ VX_CFG_L2_MEM_PORTS = "expr: min($VX_CFG_L2_NUM_BANKS, $VX_CFG_PLATFORM_MEMORY_N
 [l3cache]
 VX_CFG_L3_CACHE_SIZE = 2097152
 VX_CFG_L3_NUM_WAYS = 8
-VX_CFG_L3_WRITEBACK = 0
+VX_CFG_L3_WRITEBACK = 1
 VX_CFG_L3_DIRTYBYTES = "expr: $VX_CFG_L3_WRITEBACK"
 VX_CFG_L3_REPL_POLICY = "expr: $__cache_repl_fifo"
 VX_CFG_L3_MSHR_SIZE = 16
diff --git a/VX_types.toml b/VX_types.toml
index f833a940bb..58e6c63b7b 100644
--- a/VX_types.toml
+++ b/VX_types.toml
@@ -390,15 +390,23 @@ VX_CSR_CTA_CLUSTER_SIZE = 0xCE0
 VX_CSR_CTA_ENTRY = 0xCE1  # kernel entry PC, supplied per-CTA by the KMU
 
 [dcr_mpm_class]
-VX_DCR_MPM_CLASS_BASE   = 0
-VX_DCR_MPM_CLASS_CORE   = 1
-VX_DCR_MPM_CLASS_MEM    = 2
-VX_DCR_MPM_CLASS_TEX    = 3
-VX_DCR_MPM_CLASS_RASTER = 4
-VX_DCR_MPM_CLASS_OM     = 5
-VX_DCR_MPM_CLASS_DXA    = 6
-VX_DCR_MPM_CLASS_TCU    = 7
-VX_DCR_MPM_CLASS_VM     = 8
+VX_DCR_MPM_CLASS_BASE    = 0
+VX_DCR_MPM_CLASS_CORE    = 1
+VX_DCR_MPM_CLASS_RESERVED1= 2
+VX_DCR_MPM_CLASS_ICACHE  = 3
+VX_DCR_MPM_CLASS_DCACHE  = 4
+VX_DCR_MPM_CLASS_L2CACHE = 5
+VX_DCR_MPM_CLASS_L3CACHE = 6
+VX_DCR_MPM_CLASS_MEM     = 7
+VX_DCR_MPM_CLASS_RESERVED2= 8
+VX_DCR_MPM_CLASS_RESERVED3= 9
+VX_DCR_MPM_CLASS_RESERVED4= 10
+VX_DCR_MPM_CLASS_TCU     = 11
+VX_DCR_MPM_CLASS_RASTER  = 12
+VX_DCR_MPM_CLASS_TEX     = 13
+VX_DCR_MPM_CLASS_OM      = 14
+VX_DCR_MPM_CLASS_RTU     = 15
+VX_DCR_MPM_CLASS_DXA     = 16
 
 [csr_mpm_base]
 VX_CSR_MCYCLE = 0xB00
@@ -476,87 +484,97 @@ VX_CSR_MPM_LOAD_LT_H    = 0xB9D
 VX_CSR_MPM_STORES       = 0xB1E # total LSU store requests
 VX_CSR_MPM_STORES_H     = 0xB9E
 
-[csr_mpm_vm]
-# PERF: VM (TLB/PTW). Hardware sums icache + dcache MMU counters.
-VX_CSR_MPM_TLB_READS    = 0xB03 # total TLB lookups
-VX_CSR_MPM_TLB_READS_H  = 0xB83
-VX_CSR_MPM_TLB_HITS     = 0xB04 # TLB hits
-VX_CSR_MPM_TLB_HITS_H   = 0xB84
-VX_CSR_MPM_TLB_MISSES   = 0xB05 # TLB misses (triggered PTW)
-VX_CSR_MPM_TLB_MISSES_H = 0xB85
-VX_CSR_MPM_TLB_EVICTS   = 0xB06 # TLB evictions on fill
-VX_CSR_MPM_TLB_EVICTS_H = 0xB86
-VX_CSR_MPM_PTW_WALKS    = 0xB07 # PTW walks completed
-VX_CSR_MPM_PTW_WALKS_H  = 0xB87
-VX_CSR_MPM_PTW_LATENCY  = 0xB08 # PTW total latency cycles
-VX_CSR_MPM_PTW_LATENCY_H = 0xB88
+# VM/MMU counters are part of the MEM class (see [csr_mpm_mem] below).
 
-[csr_mpm_mem]
-# PERF: icache
+# Each cache level is its own MPM class (re-based at 0xB03), so every level
+# gets the full standard hpmcounter window with room for evictions.
+[csr_mpm_icache]
 VX_CSR_MPM_ICACHE_READS = 0xB03  # total reads
 VX_CSR_MPM_ICACHE_READS_H = 0xB83
 VX_CSR_MPM_ICACHE_MISS_R = 0xB04  # read misses
 VX_CSR_MPM_ICACHE_MISS_R_H = 0xB84
 VX_CSR_MPM_ICACHE_MSHR_ST = 0xB05  # MSHR stalls
 VX_CSR_MPM_ICACHE_MSHR_ST_H = 0xB85
-# PERF: dcache
-VX_CSR_MPM_DCACHE_READS = 0xB06  # total reads
-VX_CSR_MPM_DCACHE_READS_H = 0xB86
-VX_CSR_MPM_DCACHE_WRITES = 0xB07  # total writes
-VX_CSR_MPM_DCACHE_WRITES_H = 0xB87
-VX_CSR_MPM_DCACHE_MISS_R = 0xB08  # read misses
-VX_CSR_MPM_DCACHE_MISS_R_H = 0xB88
-VX_CSR_MPM_DCACHE_MISS_W = 0xB09  # write misses
-VX_CSR_MPM_DCACHE_MISS_W_H = 0xB89
-VX_CSR_MPM_DCACHE_BANK_ST = 0xB0A  # bank conflicts
-VX_CSR_MPM_DCACHE_BANK_ST_H = 0xB8A
-VX_CSR_MPM_DCACHE_MSHR_ST = 0xB0B  # MSHR stalls
-VX_CSR_MPM_DCACHE_MSHR_ST_H = 0xB8B
-# PERF: l2cache
-VX_CSR_MPM_L2CACHE_READS = 0xB0C  # total reads
-VX_CSR_MPM_L2CACHE_READS_H = 0xB8C
-VX_CSR_MPM_L2CACHE_WRITES = 0xB0D  # total writes
-VX_CSR_MPM_L2CACHE_WRITES_H = 0xB8D
-VX_CSR_MPM_L2CACHE_MISS_R = 0xB0E  # read misses
-VX_CSR_MPM_L2CACHE_MISS_R_H = 0xB8E
-VX_CSR_MPM_L2CACHE_MISS_W = 0xB0F  # write misses
-VX_CSR_MPM_L2CACHE_MISS_W_H = 0xB8F
-VX_CSR_MPM_L2CACHE_BANK_ST = 0xB10  # bank conflicts
-VX_CSR_MPM_L2CACHE_BANK_ST_H = 0xB90
-VX_CSR_MPM_L2CACHE_MSHR_ST = 0xB11  # MSHR stalls
-VX_CSR_MPM_L2CACHE_MSHR_ST_H = 0xB91
-# PERF: l3cache
-VX_CSR_MPM_L3CACHE_READS = 0xB12  # total reads
-VX_CSR_MPM_L3CACHE_READS_H = 0xB92
-VX_CSR_MPM_L3CACHE_WRITES = 0xB13  # total writes
-VX_CSR_MPM_L3CACHE_WRITES_H = 0xB93
-VX_CSR_MPM_L3CACHE_MISS_R = 0xB14  # read misses
-VX_CSR_MPM_L3CACHE_MISS_R_H = 0xB94
-VX_CSR_MPM_L3CACHE_MISS_W = 0xB15  # write misses
-VX_CSR_MPM_L3CACHE_MISS_W_H = 0xB95
-VX_CSR_MPM_L3CACHE_BANK_ST = 0xB16  # bank conflicts
-VX_CSR_MPM_L3CACHE_BANK_ST_H = 0xB96
-VX_CSR_MPM_L3CACHE_MSHR_ST = 0xB17  # MSHR stalls
-VX_CSR_MPM_L3CACHE_MSHR_ST_H = 0xB97
-# PERF: memory
-VX_CSR_MPM_MEM_READS = 0xB18  # total reads
-VX_CSR_MPM_MEM_READS_H = 0xB98
-VX_CSR_MPM_MEM_WRITES = 0xB19  # total writes
-VX_CSR_MPM_MEM_WRITES_H = 0xB99
-VX_CSR_MPM_MEM_LT = 0xB1A  # memory latency
-VX_CSR_MPM_MEM_LT_H = 0xB9A
-VX_CSR_MPM_MEM_BANK_ST = 0xB1E  # bank conflicts
-VX_CSR_MPM_MEM_BANK_ST_H = 0xB9E
-# PERF: lmem
-VX_CSR_MPM_LMEM_READS = 0xB1B  # memory reads
-VX_CSR_MPM_LMEM_READS_H = 0xB9B
-VX_CSR_MPM_LMEM_WRITES = 0xB1C  # memory writes
-VX_CSR_MPM_LMEM_WRITES_H = 0xB9C
-VX_CSR_MPM_LMEM_BANK_ST = 0xB1D  # bank conflicts
-VX_CSR_MPM_LMEM_BANK_ST_H = 0xB9D
-# PERF: coalescer
-VX_CSR_MPM_COALESCER_MISS = 0xB1F  # coalescer misses
-VX_CSR_MPM_COALESCER_MISS_H = 0xB9F
+
+[csr_mpm_dcache]
+VX_CSR_MPM_DCACHE_READS = 0xB03  # total reads
+VX_CSR_MPM_DCACHE_READS_H = 0xB83
+VX_CSR_MPM_DCACHE_WRITES = 0xB04  # total writes
+VX_CSR_MPM_DCACHE_WRITES_H = 0xB84
+VX_CSR_MPM_DCACHE_MISS_R = 0xB05  # read misses
+VX_CSR_MPM_DCACHE_MISS_R_H = 0xB85
+VX_CSR_MPM_DCACHE_MISS_W = 0xB06  # write misses
+VX_CSR_MPM_DCACHE_MISS_W_H = 0xB86
+VX_CSR_MPM_DCACHE_EVICTS = 0xB07  # dirty-line evictions
+VX_CSR_MPM_DCACHE_EVICTS_H = 0xB87
+VX_CSR_MPM_DCACHE_BANK_ST = 0xB08  # bank conflicts
+VX_CSR_MPM_DCACHE_BANK_ST_H = 0xB88
+VX_CSR_MPM_DCACHE_MSHR_ST = 0xB09  # MSHR stalls
+VX_CSR_MPM_DCACHE_MSHR_ST_H = 0xB89
+
+[csr_mpm_l2cache]
+VX_CSR_MPM_L2CACHE_READS = 0xB03  # total reads
+VX_CSR_MPM_L2CACHE_READS_H = 0xB83
+VX_CSR_MPM_L2CACHE_WRITES = 0xB04  # total writes
+VX_CSR_MPM_L2CACHE_WRITES_H = 0xB84
+VX_CSR_MPM_L2CACHE_MISS_R = 0xB05  # read misses
+VX_CSR_MPM_L2CACHE_MISS_R_H = 0xB85
+VX_CSR_MPM_L2CACHE_MISS_W = 0xB06  # write misses
+VX_CSR_MPM_L2CACHE_MISS_W_H = 0xB86
+VX_CSR_MPM_L2CACHE_EVICTS = 0xB07  # dirty-line evictions
+VX_CSR_MPM_L2CACHE_EVICTS_H = 0xB87
+VX_CSR_MPM_L2CACHE_BANK_ST = 0xB08  # bank conflicts
+VX_CSR_MPM_L2CACHE_BANK_ST_H = 0xB88
+VX_CSR_MPM_L2CACHE_MSHR_ST = 0xB09  # MSHR stalls
+VX_CSR_MPM_L2CACHE_MSHR_ST_H = 0xB89
+
+[csr_mpm_l3cache]
+VX_CSR_MPM_L3CACHE_READS = 0xB03  # total reads
+VX_CSR_MPM_L3CACHE_READS_H = 0xB83
+VX_CSR_MPM_L3CACHE_WRITES = 0xB04  # total writes
+VX_CSR_MPM_L3CACHE_WRITES_H = 0xB84
+VX_CSR_MPM_L3CACHE_MISS_R = 0xB05  # read misses
+VX_CSR_MPM_L3CACHE_MISS_R_H = 0xB85
+VX_CSR_MPM_L3CACHE_MISS_W = 0xB06  # write misses
+VX_CSR_MPM_L3CACHE_MISS_W_H = 0xB86
+VX_CSR_MPM_L3CACHE_EVICTS = 0xB07  # dirty-line evictions
+VX_CSR_MPM_L3CACHE_EVICTS_H = 0xB87
+VX_CSR_MPM_L3CACHE_BANK_ST = 0xB08  # bank conflicts
+VX_CSR_MPM_L3CACHE_BANK_ST_H = 0xB88
+VX_CSR_MPM_L3CACHE_MSHR_ST = 0xB09  # MSHR stalls
+VX_CSR_MPM_L3CACHE_MSHR_ST_H = 0xB89
+
+# Off-chip memory + local memory + coalescer + VM/MMU (one memory-subsystem class).
+[csr_mpm_mem]
+VX_CSR_MPM_MEM_READS = 0xB03  # total reads
+VX_CSR_MPM_MEM_READS_H = 0xB83
+VX_CSR_MPM_MEM_WRITES = 0xB04  # total writes
+VX_CSR_MPM_MEM_WRITES_H = 0xB84
+VX_CSR_MPM_MEM_LT = 0xB05  # memory latency
+VX_CSR_MPM_MEM_LT_H = 0xB85
+VX_CSR_MPM_MEM_BANK_ST = 0xB06  # bank conflicts
+VX_CSR_MPM_MEM_BANK_ST_H = 0xB86
+VX_CSR_MPM_LMEM_READS = 0xB07  # local memory reads
+VX_CSR_MPM_LMEM_READS_H = 0xB87
+VX_CSR_MPM_LMEM_WRITES = 0xB08  # local memory writes
+VX_CSR_MPM_LMEM_WRITES_H = 0xB88
+VX_CSR_MPM_LMEM_BANK_ST = 0xB09  # bank conflicts
+VX_CSR_MPM_LMEM_BANK_ST_H = 0xB89
+VX_CSR_MPM_COALESCER_MISS = 0xB0A  # coalescer misses
+VX_CSR_MPM_COALESCER_MISS_H = 0xB8A
+# VM/MMU (per-core TLB/PTW). Hardware sums icache + dcache MMU counters.
+VX_CSR_MPM_TLB_READS    = 0xB0B # total TLB lookups
+VX_CSR_MPM_TLB_READS_H  = 0xB8B
+VX_CSR_MPM_TLB_HITS     = 0xB0C # TLB hits
+VX_CSR_MPM_TLB_HITS_H   = 0xB8C
+VX_CSR_MPM_TLB_MISSES   = 0xB0D # TLB misses (triggered PTW)
+VX_CSR_MPM_TLB_MISSES_H = 0xB8D
+VX_CSR_MPM_TLB_EVICTS   = 0xB0E # TLB evictions on fill
+VX_CSR_MPM_TLB_EVICTS_H = 0xB8E
+VX_CSR_MPM_PTW_WALKS    = 0xB0F # PTW walks completed
+VX_CSR_MPM_PTW_WALKS_H  = 0xB8F
+VX_CSR_MPM_PTW_LATENCY  = 0xB10 # PTW total latency cycles
+VX_CSR_MPM_PTW_LATENCY_H = 0xB90
 
 [csr_mpm_dxa]
 # PERF: DXA copy engine (cluster-level, same value on all cores in cluster)
diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index 886a8cdcef..271ddd1550 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -28,7 +28,8 @@ show_help()
     echo "  where"
     echo "--driver: gpu, simx, rtlsim, oape, xrt"
     echo "--app: any subfolder test under regression, graphics, mpi, opencl, or hip"
-    echo "--class: 0=disable, 1=pipeline, 2=memsys"
+    echo "--perf: 0=disable, 1=core, 3=icache, 4=dcache, 5=l2cache, 6=l3cache, 7=mem,"
+    echo "        11=tcu, 12=raster, 13=tex, 14=om, 15=rtu, 16=dxa"
     echo "--nohup: build and run in temp directory"
 }
 
diff --git a/ci/regression.sh.in b/ci/regression.sh.in
index be49db1000..2d03f7fffa 100755
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@@ -339,8 +339,9 @@ amo()
     # fill/probe ordering that single-core runs never hit.
     # 4x L1 -> 1x L2 (multi-core base configuration).
     CONFIGS="-DVX_CFG_EXT_A_ENABLE" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --args=-n8 --app=amo
-    # 4x L2 -> 1x L3 (two-level shared hierarchy).
-    CONFIGS="-DVX_CFG_EXT_A_ENABLE" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --l3cache --args=-n8 --app=amo
+    # 4x L2 -> 1x L3 (two-level shared hierarchy). L3 is the LLC where atomics
+    # resolve, so L2 (above it) must be write-through for AMO correctness.
+    CONFIGS="-DVX_CFG_EXT_A_ENABLE -DVX_CFG_L2_WRITEBACK=0" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --l3cache --args=-n8 --app=amo
 
     echo "amo tests done!"
 }
diff --git a/docs/designs/graphics_fixed_function_pipeline.md b/docs/designs/graphics_fixed_function_pipeline.md
index 30331b212e..67e0346363 100644
--- a/docs/designs/graphics_fixed_function_pipeline.md
+++ b/docs/designs/graphics_fixed_function_pipeline.md
@@ -73,9 +73,9 @@ arbiters, cores, caches, and DCR fan-out.
   blend-mode/func/const, logic-op, [`:255-276`](../../VX_types.toml#L255)).
   DCRs are broadcast to all cluster instances; each raster instance
   self-selects its tile stripe.
-- **Perf** MPM classes TEX=3, RASTER=4, OM=5
-  ([`VX_types.toml:392-394`](../../VX_types.toml#L392)); reported via
-  [`legacy_perf.cpp:229-231`](../../sw/runtime/common/legacy_perf.cpp#L229).
+- **Perf** MPM classes RASTER=12, TEX=13, OM=14
+  ([`VX_types.toml:393-409`](../../VX_types.toml#L393)); reported via
+  [`legacy_perf.cpp`](../../sw/runtime/common/legacy_perf.cpp).
 - **Counts** ([`VX_config.toml`](../../VX_config.toml)): `NUM_TEX_CORES`,
   `NUM_RASTER_CORES`, `NUM_OM_CORES`, and `NUM_{TCACHES,RCACHES,OCACHES}`.
 
diff --git a/docs/designs/virtual_memory_subsystem.md b/docs/designs/virtual_memory_subsystem.md
index 72e95b3611..f365e5d404 100644
--- a/docs/designs/virtual_memory_subsystem.md
+++ b/docs/designs/virtual_memory_subsystem.md
@@ -67,9 +67,10 @@ CP does.
 - **TLB sizing**: a single flat `VX_CFG_TLB_SIZE`-entry (32) fully-
   associative CAM, one per dcache MMU + one per icache MMU per core
   ([`VX_config.toml:160`](../../VX_config.toml#L160)). No L2/L3.
-- **Perf**: 6 VM perf CSRs `[csr_mpm_vm]` 0xB03–0xB08 (+_H mirrors),
-  class `VX_DCR_MPM_CLASS_VM = 8`
-  ([`VX_types.toml:475-488`](../../VX_types.toml#L475)).
+- **Perf**: 6 VM perf CSRs in the memory-subsystem class
+  `VX_DCR_MPM_CLASS_MEM = 7` at 0xB0B–0xB10 (+_H mirrors), alongside
+  off-chip memory / lmem / coalescer (`[csr_mpm_mem]` in
+  [VX_types.toml](../../VX_types.toml)).
 - **Runtime caps**: `VX_CAPS_VM_SUPPORT`, `VX_MEM_PHYS = 0x8`
   ([`vortex2.h:74,121`](../../sw/runtime/include/vortex2.h#L74)).
 
diff --git a/docs/proposals/mmu_optimization_proposal.md b/docs/proposals/mmu_optimization_proposal.md
index eeb5cfe768..4d56bff18a 100644
--- a/docs/proposals/mmu_optimization_proposal.md
+++ b/docs/proposals/mmu_optimization_proposal.md
@@ -660,7 +660,7 @@ Regression integration:
 
 Performance signals:
 
-- Perf counter dump (`VX_DCR_MPM_CLASS_VM`) reports per-level
+- Perf counter dump (VM counters in `VX_DCR_MPM_CLASS_MEM`) reports per-level
   reads/hits/misses, MSHR occupancy, walker occupancy, walk-cache hit
   rate. Compare against baseline (current per-core MMU) and document
   the gain.
diff --git a/docs/vm.md b/docs/vm.md
index 5486fa64c8..7e6c7b014b 100644
--- a/docs/vm.md
+++ b/docs/vm.md
@@ -80,9 +80,10 @@ falls back to sequential allocation so progress is guaranteed.
 
 ## Perf counters
 
-Six MMU-related counters live in their own MPM class
-(`VX_DCR_MPM_CLASS_VM`). The hardware sums the icache and dcache MMU
-counters into one bank exposed via `pipeline_perf.mmu` in
+Six MMU-related counters live in the memory-subsystem MPM class
+(`VX_DCR_MPM_CLASS_MEM`, alongside off-chip memory, lmem, and the
+coalescer). The hardware sums the icache and dcache MMU counters into one
+bank exposed via `pipeline_perf.mmu` in
 [VX_gpu_pkg.sv](../hw/rtl/VX_gpu_pkg.sv).
 
 | CSR | Meaning |
@@ -94,9 +95,10 @@ counters into one bank exposed via `pipeline_perf.mmu` in
 | `VX_CSR_MPM_PTW_WALKS` | Completed PTW walks |
 | `VX_CSR_MPM_PTW_LATENCY` | Total PTW latency in cycles (avg = LATENCY / WALKS) |
 
-[stub/perf.cpp](../sw/runtime/stub/perf.cpp) reads these and prints a
-`vm:` line in the per-core report when `--perf=1` (CORE class) is passed
-to `blackbox.sh`. Example:
+[common/legacy_perf.cpp](../sw/runtime/common/legacy_perf.cpp) reads these
+(from the `VX_DCR_MPM_CLASS_MEM` class) and prints a per-core `vm:` line in
+the memory report when `--perf=7` (MEM class) is passed to `blackbox.sh`.
+Example:
 
 ```
 PERF: vm: tlb_reads=96, hit=96%, evicts=0, ptw_walks=4, ptw_avg_lat=84.75
diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv
index 5b8da9cfcd..3131a99c72 100644
--- a/hw/rtl/VX_gpu_pkg.sv
+++ b/hw/rtl/VX_gpu_pkg.sv
@@ -1011,6 +1011,7 @@ package VX_gpu_pkg;
         logic [PERF_CTR_BITS-1:0] writes;
         logic [PERF_CTR_BITS-1:0] read_misses;
         logic [PERF_CTR_BITS-1:0] write_misses;
+        logic [PERF_CTR_BITS-1:0] evictions;
         logic [PERF_CTR_BITS-1:0] bank_stalls;
         logic [PERF_CTR_BITS-1:0] mshr_stalls;
         logic [PERF_CTR_BITS-1:0] mem_stalls;
diff --git a/hw/rtl/cache/VX_amo_alu.sv b/hw/rtl/cache/VX_amo_alu.sv
index 9425dc0753..981b1030a4 100644
--- a/hw/rtl/cache/VX_amo_alu.sv
+++ b/hw/rtl/cache/VX_amo_alu.sv
@@ -18,48 +18,65 @@
 //   - new_word: the value to write back for store-bearing AMOs (all except LR).
 //   - ret_word: the original loaded value, sign-extended into rd.
 //               For SC the bank overrides this with 0/1 outside this module.
-// width selects W (32-bit) or D (64-bit). Sign-extension at the word
-// boundary is needed for signed comparisons (MIN/MAX).
-module VX_amo_alu import VX_gpu_pkg::*; (
-    input  amo_op_e                 op,
-    input  wire                     amo_unsigned, // selects MIN/MAX variant
-    input  wire [1:0]               width,        // 2 = .W, 3 = .D
-    input  wire [63:0]              old_word,
-    input  wire [63:0]              rhs,
-    output reg  [63:0]              new_word,
-    output wire [63:0]              ret_word
+// DATA_WIDTH is the synthesized operand width (= the cache word width, capped
+// at 64): a 32-bit-word cache can only carry .W atomics, so the adder and
+// comparators are built 32-bit rather than 64-bit. width selects .W vs .D and
+// is only meaningful when DATA_WIDTH > 32.
+module VX_amo_alu import VX_gpu_pkg::*; #(
+    parameter DATA_WIDTH = 64
+) (
+    input  amo_op_e     op,
+    input  wire         is_unsigned, // selects MIN/MAX variant
+    input  wire [1:0]   width,        // 2 = .W, 3 = .D
+    input  wire [63:0]  old_word,
+    input  wire [63:0]  rhs,
+    output wire [63:0]  new_word,
+    output wire [63:0]  ret_word
 );
+    localparam AW = DATA_WIDTH;
 
-    wire is_w = (width == 2'd2);
+    // .W and .D only differ when the datapath is wider than 32 bits;
+    // a <= 32-bit operand width can only ever be a .W atomic.
+    wire is_w = (AW > 32) ? (width == 2'd2) : 1'b1;
+    if (AW <= 32) begin : g_w_only
+        `UNUSED_VAR (width)
+    end
+    if (AW < 64) begin : g_hi_unused
+        `UNUSED_VAR (old_word[63:AW])
+        `UNUSED_VAR (rhs[63:AW])
+    end
+
+    wire [AW-1:0] a = old_word[AW-1:0];
+    wire [AW-1:0] b = rhs[AW-1:0];
 
-    // Mask both inputs to width-sized values; sign-extend for MIN/MAX.
-    wire [63:0] a_u = is_w ? {32'h0, old_word[31:0]} : old_word;
-    wire [63:0] b_u = is_w ? {32'h0, rhs[31:0]}      : rhs;
-    wire signed [63:0] a_s = is_w ? {{32{old_word[31]}}, old_word[31:0]} : old_word;
-    wire signed [63:0] b_s = is_w ? {{32{rhs[31]}},      rhs[31:0]}      : rhs;
+    // Mask to width-sized values; sign-extend at the 32-bit boundary for MIN/MAX.
+    wire [AW-1:0] a_u = is_w ? {{(AW-32){1'b0}},  a[31:0]} : a;
+    wire [AW-1:0] b_u = is_w ? {{(AW-32){1'b0}},  b[31:0]} : b;
+    wire signed [AW-1:0] a_s = is_w ? {{(AW-32){a[31]}}, a[31:0]} : a;
+    wire signed [AW-1:0] b_s = is_w ? {{(AW-32){b[31]}}, b[31:0]} : b;
 
+    reg [AW-1:0] res;
     always @(*) begin
         case (op)
-            AMO_OP_LR:    new_word = a_u;
-            AMO_OP_SC:    new_word = b_u;
-            AMO_OP_SWAP:  new_word = b_u;
-            AMO_OP_ADD:   new_word = a_u + b_u;
-            AMO_OP_AND:   new_word = a_u & b_u;
-            AMO_OP_OR:    new_word = a_u | b_u;
-            AMO_OP_XOR:   new_word = a_u ^ b_u;
-            AMO_OP_MIN:   new_word = amo_unsigned
-                                  ? ((a_u < b_u) ? a_u : b_u)
-                                  : ((a_s < b_s) ? a_s : b_s);
-            AMO_OP_MAX:   new_word = amo_unsigned
-                                  ? ((a_u > b_u) ? a_u : b_u)
-                                  : ((a_s > b_s) ? a_s : b_s);
-            default:      new_word = a_u;
+            AMO_OP_LR:    res = a_u;
+            AMO_OP_SC:    res = b_u;
+            AMO_OP_SWAP:  res = b_u;
+            AMO_OP_ADD:   res = a_u + b_u;
+            AMO_OP_AND:   res = a_u & b_u;
+            AMO_OP_OR:    res = a_u | b_u;
+            AMO_OP_XOR:   res = a_u ^ b_u;
+            AMO_OP_MIN:   res = is_unsigned ? ((a_u < b_u) ? a_u : b_u)
+                                            : ((a_s < b_s) ? a_s : b_s);
+            AMO_OP_MAX:   res = is_unsigned ? ((a_u > b_u) ? a_u : b_u)
+                                            : ((a_s > b_s) ? a_s : b_s);
+            default:      res = a_u;
         endcase
-        if (is_w) new_word = {32'h0, new_word[31:0]};
+        if (is_w) res = {{(AW-32){1'b0}}, res[31:0]};
     end
 
-    // Return value: original loaded word at width (LSU sext at width
-    // gives rd). For SC, bank overrides this with 0/1.
-    assign ret_word = is_w ? {32'h0, old_word[31:0]} : old_word;
+    // Zero-extend the AW-sized results back to the 64-bit port.
+    // For SC the bank overrides ret_word with 0/1.
+    assign new_word = 64'(res);
+    assign ret_word = 64'(is_w ? {{(AW-32){1'b0}}, a[31:0]} : a);
 
 endmodule
diff --git a/hw/rtl/cache/VX_amo_unit.sv b/hw/rtl/cache/VX_amo_unit.sv
index 92c42df8a5..73d3199ac7 100644
--- a/hw/rtl/cache/VX_amo_unit.sv
+++ b/hw/rtl/cache/VX_amo_unit.sv
@@ -32,14 +32,15 @@
 // hart's LR), which guarantees LR/SC forward progress under contention.
 module VX_amo_unit import VX_gpu_pkg::*; #(
     parameter NUM_RES_ENTRIES = 4,
-    parameter LINE_ADDR_BITS  = 32
+    parameter LINE_ADDR_BITS  = 32,
+    parameter DATA_WIDTH      = 64  // ALU operand width (cache word, capped at 64)
 ) (
     input  wire                          clk,
     input  wire                          reset,
 
     // Combinational compute kernel.
     input  amo_op_e                      compute_op,
-    input  wire                          compute_amo_unsigned,
+    input  wire                          compute_unsigned,
     input  wire [1:0]                    compute_width,
     input  wire [63:0]                   compute_old,
     input  wire [63:0]                   compute_rhs,
@@ -56,9 +57,11 @@ module VX_amo_unit import VX_gpu_pkg::*; #(
 );
 
     // Pure ALU (no state, no clock).
-    VX_amo_alu alu (
+    VX_amo_alu #(
+        .DATA_WIDTH (DATA_WIDTH)
+    ) alu (
         .op       (compute_op),
-        .amo_unsigned (compute_amo_unsigned),
+        .is_unsigned (compute_unsigned),
         .width    (compute_width),
         .old_word (compute_old),
         .rhs      (compute_rhs),
diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv
index 69ad8d6619..759237bdcf 100644
--- a/hw/rtl/cache/VX_cache.sv
+++ b/hw/rtl/cache/VX_cache.sv
@@ -119,6 +119,7 @@ module VX_cache import VX_gpu_pkg::*; #(
 `ifdef PERF_ENABLE
     wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
     wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
+    wire [NUM_BANKS-1:0] perf_evictions_per_bank;
     wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
 `endif
 
@@ -401,6 +402,7 @@ module VX_cache import VX_gpu_pkg::*; #(
         `ifdef PERF_ENABLE
             .perf_read_miss     (perf_read_miss_per_bank[bank_id]),
             .perf_write_miss    (perf_write_miss_per_bank[bank_id]),
+            .perf_evictions     (perf_evictions_per_bank[bank_id]),
             .perf_mshr_stall    (perf_mshr_stall_per_bank[bank_id]),
         `endif
 
@@ -612,6 +614,7 @@ module VX_cache import VX_gpu_pkg::*; #(
     wire [`CLOG2(NUM_REQS+1)-1:0]  perf_crsp_stall_per_cycle;
     wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
     wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
+    wire [`CLOG2(NUM_BANKS+1)-1:0] perf_evictions_per_cycle;
     wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
     wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
 
@@ -619,6 +622,7 @@ module VX_cache import VX_gpu_pkg::*; #(
     `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
     `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
     `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
+    `POP_COUNT(perf_evictions_per_cycle, perf_evictions_per_bank);
     `POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
     `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
     `POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
@@ -627,6 +631,7 @@ module VX_cache import VX_gpu_pkg::*; #(
     reg [PERF_CTR_BITS-1:0] perf_core_writes;
     reg [PERF_CTR_BITS-1:0] perf_read_misses;
     reg [PERF_CTR_BITS-1:0] perf_write_misses;
+    reg [PERF_CTR_BITS-1:0] perf_evictions;
     reg [PERF_CTR_BITS-1:0] perf_mshr_stalls;
     reg [PERF_CTR_BITS-1:0] perf_mem_stalls;
     reg [PERF_CTR_BITS-1:0] perf_crsp_stalls;
@@ -637,6 +642,7 @@ module VX_cache import VX_gpu_pkg::*; #(
             perf_core_writes  <= '0;
             perf_read_misses  <= '0;
             perf_write_misses <= '0;
+            perf_evictions    <= '0;
             perf_mshr_stalls  <= '0;
             perf_mem_stalls   <= '0;
             perf_crsp_stalls  <= '0;
@@ -645,6 +651,7 @@ module VX_cache import VX_gpu_pkg::*; #(
             perf_core_writes  <= perf_core_writes  + PERF_CTR_BITS'(perf_core_writes_per_cycle);
             perf_read_misses  <= perf_read_misses  + PERF_CTR_BITS'(perf_read_miss_per_cycle);
             perf_write_misses <= perf_write_misses + PERF_CTR_BITS'(perf_write_miss_per_cycle);
+            perf_evictions    <= perf_evictions    + PERF_CTR_BITS'(perf_evictions_per_cycle);
             perf_mshr_stalls  <= perf_mshr_stalls  + PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
             perf_mem_stalls   <= perf_mem_stalls   + PERF_CTR_BITS'(perf_mem_stall_per_cycle);
             perf_crsp_stalls  <= perf_crsp_stalls  + PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
@@ -655,6 +662,7 @@ module VX_cache import VX_gpu_pkg::*; #(
     assign cache_perf.writes       = perf_core_writes;
     assign cache_perf.read_misses  = perf_read_misses;
     assign cache_perf.write_misses = perf_write_misses;
+    assign cache_perf.evictions    = perf_evictions;
     assign cache_perf.bank_stalls  = perf_collisions;
     assign cache_perf.mshr_stalls  = perf_mshr_stalls;
     assign cache_perf.mem_stalls   = perf_mem_stalls;
diff --git a/hw/rtl/cache/VX_cache_amo.sv b/hw/rtl/cache/VX_cache_amo.sv
new file mode 100644
index 0000000000..d45d4de051
--- /dev/null
+++ b/hw/rtl/cache/VX_cache_amo.sv
@@ -0,0 +1,481 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_cache_define.vh"
+
+// Per-bank AMO engine, instantiated by the bank only when atomics are
+// enabled. A bank plays exactly one of two roles:
+//
+//   IS_LLC=1 (commit): perform the read-modify-write on the line word
+//     resident at S1, maintain the per-hart reservation table (via
+//     VX_amo_unit), and inject the result back through the bank pipeline
+//     as a single-outstanding synthetic writeback.
+//
+//   IS_LLC=0 (passthrough): forward the AMO downstream (the LLC does the
+//     RMW), latch the returned result word, and replay it back to the
+//     requester. Also enforces same-hart program order at the bank input.
+//
+// The two roles are mutually exclusive, so each ties off the other's
+// outputs and the synthesizer keeps only the selected datapath.
+module VX_cache_amo import VX_gpu_pkg::*; #(
+    parameter IS_LLC          = 0,
+    parameter NUM_RES_ENTRIES = 4,
+    parameter LINE_ADDR_BITS  = 32,
+    parameter WORD_WIDTH      = 32,
+    parameter WORD_SIZE       = 4,
+    parameter WORD_SEL_WIDTH  = 1,
+    parameter TAG_WIDTH       = 1,
+    parameter REQ_SEL_WIDTH   = 1,
+    parameter ATTR_WIDTH      = 1,
+    parameter MSHR_SIZE       = 1,
+    parameter MSHR_ADDR_WIDTH = 1,
+    parameter WORDS_PER_LINE  = 1
+) (
+    input  wire                          clk,
+    input  wire                          reset,
+    input  wire                          pipe_stall,
+
+    // pipeline view
+    input  amo_req_t                     amo_st0,
+    input  wire                          valid_st0,
+    input  wire                          is_creq_st0,
+    input  wire                          is_hit_st0,
+    input  wire                          is_replay_st0,
+    input  amo_req_t                     amo_st1,
+    input  wire                          valid_st1,
+    input  wire                          is_creq_st1,
+    input  wire                          is_hit_st1,
+    input  wire                          is_replay_st1,
+    input  wire                          do_write_st1,
+    input  wire [WORD_WIDTH-1:0]         read_word_st1,
+    input  wire [WORD_SIZE-1:0]          byteen_st1,
+    input  wire [WORD_WIDTH-1:0]         write_word_st1,
+    input  wire [WORD_SEL_WIDTH-1:0]     word_idx_st0,
+    input  wire [WORD_SEL_WIDTH-1:0]     word_idx_st1,
+    input  wire [LINE_ADDR_BITS-1:0]     addr_st0,
+    input  wire [LINE_ADDR_BITS-1:0]     addr_st1,
+    input  wire [TAG_WIDTH-1:0]          tag_st1,
+    input  wire [REQ_SEL_WIDTH-1:0]      req_idx_st1,
+    input  wire [ATTR_WIDTH-1:0]         attr_st1,
+
+    // commit handshake: the bank grants the synthetic writeback this cycle
+    input  wire                          wb_fire,
+
+    // mshr / memory fill (passthrough)
+    input  wire                          mshr_allocate_st0,
+    input  wire [MSHR_ADDR_WIDTH-1:0]    mshr_alloc_id_st0,
+    input  wire [MSHR_ADDR_WIDTH-1:0]    mshr_id_st1,
+    input  wire                          mem_rsp_fire,
+    input  wire [MSHR_ADDR_WIDTH-1:0]    mem_rsp_id,
+    input  wire [WORDS_PER_LINE*WORD_WIDTH-1:0] mem_rsp_data,
+    input  wire                          is_fill_sel,
+
+    // input arbitration (passthrough age-ordering)
+    input  wire                          core_req_valid,
+    input  wire                          core_req_is_amo,
+    input  wire                          core_req_rw,
+    input  wire [LINE_ADDR_BITS-1:0]     core_req_addr,
+    input  wire                          rw_st0,
+    input  wire                          mshr_probe_pending_ld,
+    input  wire                          mshr_probe_pending_amo,
+
+    // commit outputs (tied off when IS_LLC=0)
+    output wire                          amo_hit_st1,    // AMO commits locally at S1
+    output wire                          commit_busy,    // commit in flight
+    output wire                          chain_stall,    // pace same-line chained AMO
+    output wire                          wb_pending,     // writeback request live
+    output wire [WORD_WIDTH-1:0]         rsp_data,       // response word on amo_hit_st1
+    output wire [LINE_ADDR_BITS-1:0]     wb_addr,
+    output wire [WORD_SEL_WIDTH-1:0]     wb_word_idx,
+    output wire [WORD_SIZE-1:0]          wb_byteen,
+    output wire [WORD_WIDTH-1:0]         wb_data,
+    output wire [TAG_WIDTH-1:0]          wb_tag,
+    output wire [REQ_SEL_WIDTH-1:0]      wb_idx,
+    output wire [ATTR_WIDTH-1:0]         wb_attr,
+
+    // passthrough outputs (tied off when IS_LLC=1)
+    output wire                          is_amo_fwd_st0,    // AMO first pass (S0)
+    output wire                          is_amo_fwd_st1,    // AMO first pass (S1)
+    output wire                          is_amo_replay_st1, // result replay
+    output wire                          is_passthru_fill_sel,
+    output wire [WORD_WIDTH-1:0]         amo_ptw_word_st1,
+    output wire                          req_input_defer
+);
+    if (IS_LLC != 0) begin : g_commit
+        // ----------------------------------------------------------------
+        // LLC commit: RMW on the resident line + synthetic writeback
+        // ----------------------------------------------------------------
+        localparam BIT_OFF_BITS = `CLOG2(WORD_WIDTH);
+        localparam AMO_OLD_BITS = (WORD_WIDTH < 64) ? WORD_WIDTH : 64;
+
+        // Writeback queue (depth 2): a completed AMO pushes its result here
+        // instead of overwriting a still-draining different-line writeback. The
+        // head (slot 0) drains through the bank's synthetic-write path; pushes
+        // never clobber a pending entry, so different-line AMOs pipeline without
+        // stalling any replay (coalescer-safe) or the pipe (deadlock-free).
+        localparam WBQ_SIZE = 2;
+        localparam WBQ_CNTW = `CLOG2(WBQ_SIZE+1);
+        localparam WBQ_IDXW = `CLOG2(WBQ_SIZE);
+        reg [WBQ_CNTW-1:0]           wbq_count;
+        reg [LINE_ADDR_BITS-1:0]     wbq_addr   [WBQ_SIZE];
+        reg [WORD_SEL_WIDTH-1:0]     wbq_wsel   [WBQ_SIZE];
+        reg [WORD_SIZE-1:0]          wbq_byteen [WBQ_SIZE];
+        reg [WORD_WIDTH-1:0]         wbq_data   [WBQ_SIZE];
+        reg [TAG_WIDTH-1:0]          wbq_tag    [WBQ_SIZE];
+        reg [REQ_SEL_WIDTH-1:0]      wbq_idx    [WBQ_SIZE];
+        reg [ATTR_WIDTH-1:0]         wbq_attr   [WBQ_SIZE];
+
+        // Head aliases (slot 0 = oldest = the entry currently draining).
+        wire                         wb_pending_r  = (wbq_count != '0);
+        wire [LINE_ADDR_BITS-1:0]    wb_addr_r     = wbq_addr[0];
+        wire [WORD_SEL_WIDTH-1:0]    wb_word_idx_r = wbq_wsel[0];
+        wire [WORD_SIZE-1:0]         wb_byteen_r   = wbq_byteen[0];
+        wire [WORD_WIDTH-1:0]        wb_data_r     = wbq_data[0];
+        wire [TAG_WIDTH-1:0]         wb_tag_r      = wbq_tag[0];
+        wire [REQ_SEL_WIDTH-1:0]     wb_idx_r      = wbq_idx[0];
+        wire [ATTR_WIDTH-1:0]        wb_attr_r     = wbq_attr[0];
+
+        // BRAM-settle window: a fired writeback takes a couple cycles to land
+        // in cache_data; commit_busy stays high across it so the next AMO reads
+        // the committed line. post_wb_{addr,data} hold the just-drained entry.
+        reg [1:0]                    post_wb_age;
+        reg [LINE_ADDR_BITS-1:0]     post_wb_addr;
+        reg [WORD_WIDTH-1:0]         post_wb_data;
+        wire                         post_wb_valid = (post_wb_age != 2'd0);
+
+        // Compute stage: S1 latches the aligned operands, the RMW ALU + the
+        // re-align shift run the next cycle, off the S1 critical path. AMO
+        // commits are serialized by commit_busy (the bank holds off core
+        // requests and replays), so the stage holds at most one operation and
+        // each AMO reads the freshly written line (no operand forwarding).
+        reg                          cmp_valid;
+        reg [63:0]                   cmp_old, cmp_rhs;
+        amo_op_e                     cmp_op;
+        reg [1:0]                    cmp_width;
+        reg                          cmp_unsigned;
+        reg [BIT_OFF_BITS-1:0]       cmp_bit_off;
+        reg [LINE_ADDR_BITS-1:0]     cmp_addr;
+        reg [WORD_SIZE-1:0]          cmp_byteen;
+        reg [WORD_SEL_WIDTH-1:0]     cmp_wsel;
+        reg [TAG_WIDTH-1:0]          cmp_tag;
+        reg [REQ_SEL_WIDTH-1:0]      cmp_idx;
+        reg [ATTR_WIDTH-1:0]         cmp_attr;
+
+        // Byte-offset alignment: shift the target down to bit 0 for compute,
+        // and shift results back for response/writeback.
+        wire [`UP(`CLOG2(WORD_SIZE))-1:0] byte_off_st1;
+        VX_priority_encoder #(
+            .N (WORD_SIZE)
+        ) byte_off_enc (
+            .data_in    (byteen_st1),
+            .index_out  (byte_off_st1),
+            `UNUSED_PIN (valid_out),
+            `UNUSED_PIN (onehot_out)
+        );
+        wire [BIT_OFF_BITS-1:0] bit_off_st1 = BIT_OFF_BITS'({byte_off_st1, 3'b0});
+
+        // Forward an in-flight (or just-fired) writeback on the same line back
+        // into the operand: chained same-line AMOs are paced one cycle apart
+        // (chain_stall) so the prior result already sits in wb_data_r while
+        // read_word_st1 may still be stale.
+        // Forward the newest in-flight value for this line: scan the queue
+        // newest-first, then the just-drained (settling) entry, else the array.
+        wire fwd_q1 = (wbq_count > 1) && (wbq_addr[1] == addr_st1);
+        wire fwd_q0 = (wbq_count > 0) && (wbq_addr[0] == addr_st1);
+        wire fwd_pw = post_wb_valid && (post_wb_addr == addr_st1);
+        wire [WORD_WIDTH-1:0] line_word_st1 = fwd_q1 ? wbq_data[1]
+                                            : fwd_q0 ? wbq_data[0]
+                                            : fwd_pw ? post_wb_data
+                                            : read_word_st1;
+        wire [WORD_WIDTH-1:0] line_word_shifted_st1 = line_word_st1 >> bit_off_st1;
+        wire [WORD_WIDTH-1:0] rhs_word_shifted_st1  = write_word_st1 >> bit_off_st1;
+
+        // width from byteen popcount (.W -> 4 bytes, .D -> 8); operands top at .D.
+        wire [1:0] width_st1 = ($countones(byteen_st1) == 8) ? 2'd3 : 2'd2;
+        wire [63:0] rhs_st1 = (width_st1 == 2'd2)
+                            ? 64'({32'h0, rhs_word_shifted_st1[31:0]})
+                            : 64'(rhs_word_shifted_st1[AMO_OLD_BITS-1:0]);
+        wire [63:0] old_st1 = (width_st1 == 2'd2)
+                            ? 64'({32'h0, line_word_shifted_st1[31:0]})
+                            : 64'(line_word_shifted_st1[AMO_OLD_BITS-1:0]);
+        if (WORD_WIDTH > 64) begin : g_upper_unused
+            `UNUSED_VAR (line_word_shifted_st1[WORD_WIDTH-1:64])
+            `UNUSED_VAR (rhs_word_shifted_st1[WORD_WIDTH-1:64])
+        end
+
+        wire        res_check;
+
+        // commit conditions (from the original AMO at S1; amo_st1.hart_id is
+        // valid there, not on the compute/writeback cycle).
+        wire amo_hit_w = amo_st1.amo_valid && is_hit_st1 && valid_st1 && is_creq_st1;
+        wire sc_fail_st1 = (amo_st1.amo_op == AMO_OP_SC) && ~res_check;
+        wire do_store_st1 = amo_hit_w && (amo_st1.amo_op != AMO_OP_LR) && ~sc_fail_st1;
+        wire do_store_st0 = amo_st0.amo_valid && valid_st0 && is_creq_st0 && is_hit_st0
+                         && (amo_st0.amo_op != AMO_OP_LR);
+
+        wire res_reserve    = amo_hit_w && (amo_st1.amo_op == AMO_OP_LR);
+        wire res_clear      = amo_hit_w && (amo_st1.amo_op == AMO_OP_SC);
+        // any committed write to the line breaks other harts' reservations;
+        // AMOs ride the load path (rw=0) so do_write_st1 is plain stores only.
+        wire res_invalidate = do_store_st1 || do_write_st1;
+
+        // RMW ALU runs on the registered compute-stage operands (off the S1
+        // path); the reservation table is driven from S1 so the SC outcome is
+        // ready for the response. ret_word is unused — the response old value
+        // comes straight from S1 (no ALU).
+        wire [63:0] new_word;
+        wire [63:0] ret_word_unused;
+        VX_amo_unit #(
+            .NUM_RES_ENTRIES (NUM_RES_ENTRIES),
+            .LINE_ADDR_BITS  (LINE_ADDR_BITS),
+            .DATA_WIDTH      (AMO_OLD_BITS)  // 32-bit word cache -> 32-bit RMW datapath
+        ) amo_unit (
+            .clk           (clk),
+            .reset         (reset),
+            .compute_op    (cmp_op),
+            .compute_unsigned (cmp_unsigned),
+            .compute_width (cmp_width),
+            .compute_old   (cmp_old),
+            .compute_rhs   (cmp_rhs),
+            .compute_new_word (new_word),
+            .compute_ret_word (ret_word_unused),
+            .res_reserve   (res_reserve),
+            .res_clear     (res_clear),
+            .res_invalidate(res_invalidate),
+            .res_hart_id   (amo_st1.hart_id),
+            .res_line_addr (addr_st1),
+            .res_check     (res_check)
+        );
+        `UNUSED_VAR (ret_word_unused)
+
+        // place the computed word at its byte offset within the cache word
+        wire [WORD_WIDTH-1:0] wb_data_w = WORD_WIDTH'(new_word) << cmp_bit_off;
+
+        // Compute finished this cycle (result ready to enqueue): the compute
+        // stage is occupied and not being reloaded by a fresh latch.
+        wire wb_push = cmp_valid && ~(do_store_st1 && ~pipe_stall);
+        // A same-line result coalesces into its existing entry (only the latest
+        // value must reach the array; earlier ones are forwarded), so a same-line
+        // burst stays at a single entry. A new-line result enqueues at the tail.
+        // The head cannot be coalesced into the cycle it drains.
+        wire wb_coal0    = (wbq_count > 0) && (wbq_addr[0] == cmp_addr) && ~wb_fire;
+        wire wb_coal1    = (wbq_count > 1) && (wbq_addr[1] == cmp_addr);
+        wire wb_coalesce = wb_coal1 || wb_coal0;
+        wire [WBQ_IDXW-1:0] wb_coal_idx = wb_coal1 ? WBQ_IDXW'(1) : WBQ_IDXW'(0);
+        // New entry lands at the post-pop tail; a coalesce slot shifts down on a pop.
+        wire [WBQ_IDXW-1:0] wb_new_idx  = WBQ_IDXW'(wb_fire ? (wbq_count - WBQ_CNTW'(1)) : wbq_count);
+        wire [WBQ_IDXW-1:0] wb_slot     = wb_coalesce ? WBQ_IDXW'(wb_fire ? (wb_coal_idx - WBQ_IDXW'(1)) : wb_coal_idx)
+                                                      : wb_new_idx;
+
+        always @(posedge clk) begin
+            if (reset) begin
+                cmp_valid   <= 1'b0;
+                wbq_count   <= '0;
+                post_wb_age <= 2'd0;
+            end else begin
+                if (wb_fire) begin
+                    post_wb_age  <= 2'd2;
+                    post_wb_addr <= wbq_addr[0];
+                    post_wb_data <= wbq_data[0];
+                end else if (post_wb_valid) begin
+                    post_wb_age <= post_wb_age - 2'd1;
+                end
+
+                // Compute stage (single): latch a new AMO, else retire the result.
+                if (do_store_st1 && ~pipe_stall) begin
+                    cmp_valid    <= 1'b1;
+                    cmp_old      <= old_st1;
+                    cmp_rhs      <= rhs_st1;
+                    cmp_op       <= amo_st1.amo_op;
+                    cmp_width    <= width_st1;
+                    cmp_unsigned <= amo_st1.amo_unsigned;
+                    cmp_bit_off  <= bit_off_st1;
+                    cmp_addr     <= addr_st1;
+                    cmp_byteen   <= byteen_st1;
+                    cmp_wsel     <= word_idx_st1;
+                    cmp_tag      <= tag_st1;
+                    cmp_idx      <= req_idx_st1;
+                    cmp_attr     <= attr_st1;
+                end else if (cmp_valid) begin
+                    cmp_valid <= 1'b0;
+                end
+
+                // Writeback queue: a drain (wb_fire) shifts the head out; a
+                // completed compute (wb_push) enqueues at the tail. The push is
+                // written after the shift so it wins when both hit the same slot.
+                if (wb_fire) begin
+                    wbq_addr[0]   <= wbq_addr[1];
+                    wbq_wsel[0]   <= wbq_wsel[1];
+                    wbq_byteen[0] <= wbq_byteen[1];
+                    wbq_data[0]   <= wbq_data[1];
+                    wbq_tag[0]    <= wbq_tag[1];
+                    wbq_idx[0]    <= wbq_idx[1];
+                    wbq_attr[0]   <= wbq_attr[1];
+                end
+                if (wb_push) begin
+                    wbq_addr[wb_slot]   <= cmp_addr;
+                    wbq_wsel[wb_slot]   <= cmp_wsel;
+                    wbq_byteen[wb_slot] <= cmp_byteen;
+                    wbq_data[wb_slot]   <= wb_data_w;
+                    wbq_tag[wb_slot]    <= cmp_tag;
+                    wbq_idx[wb_slot]    <= cmp_idx;
+                    wbq_attr[wb_slot]   <= cmp_attr;
+                end
+                // Count grows only on a new (non-coalescing) enqueue; a coalesce
+                // updates in place. Pop removes the head.
+                if (wb_push && ~wb_coalesce && ~wb_fire)
+                    wbq_count <= wbq_count + WBQ_CNTW'(1);
+                else if (~(wb_push && ~wb_coalesce) && wb_fire)
+                    wbq_count <= wbq_count - WBQ_CNTW'(1);
+            end
+        end
+
+        // response (fired at S1): SC -> 0/1; other -> old value (LSU sexts).
+        // The old value is available at S1 directly, no ALU needed.
+        wire [63:0] rsp_word = (amo_st1.amo_op == AMO_OP_SC) ? {63'h0, sc_fail_st1} : old_st1;
+        if (WORD_WIDTH < 64) begin : g_rsp_upper_unused
+            `UNUSED_VAR (rsp_word[63:WORD_WIDTH])
+        end
+
+        assign amo_hit_st1 = amo_hit_w;
+        assign rsp_data    = WORD_WIDTH'(rsp_word) << bit_off_st1;
+        // Commit in flight: holds off new core-request admission from the S0
+        // prediction through the compute stage and the writeback. Replays are
+        // NOT blocked (the MSHR streams coalesced same-line AMOs back to back);
+        // those are paced instead by chain_stall.
+        assign commit_busy = do_store_st0 || do_store_st1 || cmp_valid || wb_pending_r;
+        // Pace any same-line request sitting behind an in-flight compute by one
+        // cycle, so the result lands in wb_data_r and forwards cleanly. Gated on
+        // cmp_valid (an AMO is computing), so it never fires for baseline traffic.
+        assign chain_stall = cmp_valid && valid_st1 && is_creq_st1 && (cmp_addr == addr_st1);
+
+        // Invariants: a store-bearing AMO is only ever accepted into a free
+        // compute stage (the queue absorbs different-line writebacks behind it),
+        // and the writeback queue must never overflow.
+        `RUNTIME_ASSERT (~(do_store_st1 && ~pipe_stall && cmp_valid),
+            ("%t: AMO compute-stage overwrite (addr=0x%0h)", $time, addr_st1))
+        `RUNTIME_ASSERT (~(wb_push && ~wb_coalesce && ~wb_fire && (wbq_count == WBQ_CNTW'(WBQ_SIZE))),
+            ("%t: AMO writeback queue overflow (addr=0x%0h)", $time, cmp_addr))
+        assign wb_pending  = wb_pending_r;
+        assign wb_addr     = wb_addr_r;
+        assign wb_word_idx = wb_word_idx_r;
+        assign wb_byteen   = wb_byteen_r;
+        assign wb_data     = wb_data_r;
+        assign wb_tag      = wb_tag_r;
+        assign wb_idx      = wb_idx_r;
+        assign wb_attr     = wb_attr_r;
+
+        // passthrough outputs unused in this role
+        assign is_amo_fwd_st0       = 1'b0;
+        assign is_amo_fwd_st1       = 1'b0;
+        assign is_amo_replay_st1    = 1'b0;
+        assign is_passthru_fill_sel = 1'b0;
+        assign amo_ptw_word_st1     = '0;
+        assign req_input_defer      = 1'b0;
+
+        `UNUSED_VAR (amo_st0) // only amo_valid/amo_op are consumed at S0
+        `UNUSED_VAR (is_replay_st0)
+        `UNUSED_VAR (is_replay_st1)
+        `UNUSED_VAR (word_idx_st0)
+        `UNUSED_VAR (addr_st0)
+        `UNUSED_VAR (mshr_allocate_st0)
+        `UNUSED_VAR (mshr_alloc_id_st0)
+        `UNUSED_VAR (mshr_id_st1)
+        `UNUSED_VAR (mem_rsp_fire)
+        `UNUSED_VAR (mem_rsp_id)
+        `UNUSED_VAR (mem_rsp_data)
+        `UNUSED_VAR (is_fill_sel)
+        `UNUSED_VAR (core_req_valid)
+        `UNUSED_VAR (core_req_is_amo)
+        `UNUSED_VAR (core_req_rw)
+        `UNUSED_VAR (core_req_addr)
+        `UNUSED_VAR (rw_st0)
+        `UNUSED_VAR (mshr_probe_pending_ld)
+        `UNUSED_VAR (mshr_probe_pending_amo)
+    end else begin : g_passthru
+        // ----------------------------------------------------------------
+        // Non-LLC passthrough: forward downstream, replay the result word
+        // ----------------------------------------------------------------
+        assign is_amo_fwd_st0    = amo_st0.amo_valid && valid_st0 && is_creq_st0 && ~is_replay_st0;
+        assign is_amo_fwd_st1    = amo_st1.amo_valid && valid_st1 && is_creq_st1 && ~is_replay_st1;
+        assign is_amo_replay_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && is_replay_st1;
+
+        reg [MSHR_SIZE-1:0]      ptw_flag;   // entry awaits a passthru fill
+        reg [WORD_SEL_WIDTH-1:0] ptw_wsel [MSHR_SIZE];
+        reg [WORD_WIDTH-1:0]     ptw_word [MSHR_SIZE];
+
+        wire [WORDS_PER_LINE-1:0][WORD_WIDTH-1:0] mem_rsp_words = mem_rsp_data;
+
+        assign is_passthru_fill_sel = is_fill_sel && ptw_flag[mem_rsp_id];
+        assign amo_ptw_word_st1     = ptw_word[mshr_id_st1];
+
+        always @(posedge clk) begin
+            if (reset) begin
+                ptw_flag <= '0;
+            end else begin
+                // mark the AMO's MSHR entry on allocation
+                if (is_amo_fwd_st0 && mshr_allocate_st0 && ~pipe_stall) begin
+                    ptw_flag[mshr_alloc_id_st0] <= 1'b1;
+                    ptw_wsel[mshr_alloc_id_st0] <= word_idx_st0;
+                end
+                // latch the result word on the passthru fill, clear the flag
+                if (mem_rsp_fire && ptw_flag[mem_rsp_id]) begin
+                    ptw_word[mem_rsp_id] <= mem_rsp_words[ptw_wsel[mem_rsp_id]];
+                    ptw_flag[mem_rsp_id] <= 1'b0;
+                end
+            end
+        end
+
+        // catch a same-line request mid-allocation at S0 (not yet visible to
+        // the MSHR probe in the window between admit and allocate).
+        wire alloc_same_line = mshr_allocate_st0 && ~pipe_stall && (addr_st0 == core_req_addr);
+        wire st0_ld_alloc    = alloc_same_line && ~amo_st0.amo_valid && ~rw_st0;
+        wire st0_amo_alloc   = alloc_same_line &&  amo_st0.amo_valid;
+
+        wire amo_input_defer  = core_req_valid && core_req_is_amo
+                             && (mshr_probe_pending_ld || st0_ld_alloc);
+        wire load_input_defer = core_req_valid && ~core_req_is_amo && ~core_req_rw
+                             && (mshr_probe_pending_amo || st0_amo_alloc);
+        assign req_input_defer = amo_input_defer || load_input_defer;
+
+        // commit outputs unused in this role
+        assign amo_hit_st1 = 1'b0;
+        assign commit_busy = 1'b0;
+        assign chain_stall = 1'b0;
+        assign wb_pending  = 1'b0;
+        assign rsp_data    = '0;
+        assign wb_addr     = '0;
+        assign wb_word_idx = '0;
+        assign wb_byteen   = '0;
+        assign wb_data     = '0;
+        assign wb_tag      = '0;
+        assign wb_idx      = '0;
+        assign wb_attr     = '0;
+
+        `UNUSED_VAR (amo_st0) // only amo_valid gates the passthru path
+        `UNUSED_VAR (amo_st1)
+        `UNUSED_VAR (is_hit_st0)
+        `UNUSED_VAR (is_hit_st1)
+        `UNUSED_VAR (do_write_st1)
+        `UNUSED_VAR (read_word_st1)
+        `UNUSED_VAR (byteen_st1)
+        `UNUSED_VAR (write_word_st1)
+        `UNUSED_VAR (word_idx_st1)
+        `UNUSED_VAR (addr_st1)
+        `UNUSED_VAR (tag_st1)
+        `UNUSED_VAR (req_idx_st1)
+        `UNUSED_VAR (attr_st1)
+        `UNUSED_VAR (wb_fire)
+    end
+
+endmodule
diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv
index 1a24470c38..8a6f75b327 100644
--- a/hw/rtl/cache/VX_cache_bank.sv
+++ b/hw/rtl/cache/VX_cache_bank.sv
@@ -62,12 +62,10 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     // Memory request output buffer (TO_OUT_BUF_* encoding)
     parameter MEM_OUT_BUF       = 0,
 
-    // AMO: this bank belongs to the last-level cache. Reservation
-    // table + AMO commit logic synthesize only when 1.
+    // This bank is the last-level cache (AMOs commit locally here).
     parameter IS_LLC            = 0,
 
-    // AMO: this bank supports atomic ops. Default 0 — AMO is generated
-    // away. Pulled in from the cache wrapper hierarchy.
+    // This bank supports atomic ops (AMO logic synthesizes only when 1).
     parameter AMO_ENABLE        = 0,
 
     parameter MSHR_ADDR_WIDTH   = `LOG2UP(MSHR_SIZE),
@@ -81,6 +79,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
 `ifdef PERF_ENABLE
     output wire perf_read_miss,
     output wire perf_write_miss,
+    output wire perf_evictions,
     output wire perf_mshr_stall,
 `endif
 
@@ -93,7 +92,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     input wire [`CS_WORD_WIDTH-1:0]     core_req_data,  // data to be written
     input wire [TAG_WIDTH-1:0]          core_req_tag,   // identifier of the request (request id)
     input wire [REQ_SEL_WIDTH-1:0]      core_req_idx,   // index of the request in the core request array
-    input wire [`UP(MEM_ATTR_WIDTH)-1:0]   core_req_attr,
+    input wire [`UP(MEM_ATTR_WIDTH)-1:0] core_req_attr,
     output wire                         core_req_ready,
 
     // Core Response
@@ -110,7 +109,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     output wire [LINE_SIZE-1:0]         mem_req_byteen,
     output wire [`CS_LINE_WIDTH-1:0]    mem_req_data,
     output wire [MEM_TAG_WIDTH-1:0]     mem_req_tag,
-    output wire [`UP(MEM_ATTR_WIDTH)-1:0]  mem_req_attr,
+    output wire [`UP(MEM_ATTR_WIDTH)-1:0] mem_req_attr,
     input  wire                         mem_req_ready,
 
     // Memory response
@@ -131,8 +130,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     // straight through, so it is unused locally.
     `UNUSED_PARAM (MRSQ_SIZE)
 
-    // Extract the AMO sideband from the attr field at the fixed offset.
-    // amo_req_t is always defined; AMO_ENABLE gates the actual AMO logic.
+    // AMO sideband, extracted from the attr field (gated by AMO_ENABLE).
     amo_req_t core_req_amo;
     assign core_req_amo = AMO_ENABLE ?
         amo_req_t'(core_req_attr[MEM_ATTR_AMO_OFFS +: AMO_REQ_BITS])
@@ -190,15 +188,29 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     wire                            is_hit_st0, is_hit_st1;
     wire [`UP(MEM_ATTR_WIDTH)-1:0] attr_sel, attr_st0, attr_st1;
     amo_req_t                       amo_sel, amo_st0, amo_st1;
-    // Forward decls — assigned in the IS_LLC block / S1 commit logic.
-    // Needed earlier (in wb_fire gating) to suppress the synthetic
-    // writeback fire on cycles where another AMO is committing at S1
-    // (chain or fresh-latch). Without this gate, wb_fire latches a
-    // stale amo_wb_data_r into pipe_reg0 while the chain branch updates
-    // it, losing the chained AMO's effect.
-    wire                            amo_hit_st1;
-    wire                            sc_fail_st1;
-    wire                            amo_do_store_st1;
+
+    // AMO interconnect (driven by the VX_cache_amo engine, tied off when the
+    // bank carries no AMO logic). Declared here because the input arbitration
+    // and sel mux consume them ahead of the instantiation.
+    wire                            amo_hit_st1;       // AMO commits locally at S1 (LLC)
+    wire                            amo_commit_busy;   // LLC commit in flight
+    wire                            amo_chain_stall;   // pace same-line chained AMO
+    wire                            amo_wb_pending;    // synthetic writeback request live
+    wire [`CS_WORD_WIDTH-1:0]       amo_rsp_data;      // LLC AMO response word
+    wire [`CS_LINE_ADDR_WIDTH-1:0]  amo_wb_addr;
+    wire [WORD_SEL_WIDTH-1:0]       amo_wb_word_idx;
+    wire [WORD_SIZE-1:0]            amo_wb_byteen;
+    wire [`CS_WORD_WIDTH-1:0]       amo_wb_data;
+    wire [TAG_WIDTH-1:0]            amo_wb_tag;
+    wire [REQ_SEL_WIDTH-1:0]        amo_wb_idx;
+    wire [`UP(MEM_ATTR_WIDTH)-1:0]  amo_wb_attr;
+    wire                            is_amo_fwd_st0;    // non-LLC AMO first pass (S0)
+    wire                            is_amo_fwd_st1;    // non-LLC AMO first pass (S1)
+    wire                            is_amo_replay_st1; // non-LLC AMO result replay
+    wire                            is_passthru_fill_sel;
+    wire [`CS_WORD_WIDTH-1:0]       amo_ptw_word_st1;
+    wire                            req_input_defer;   // non-LLC age-ordering hold
+
     wire                            mshr_pending_raw_st0;
     wire                            mshr_pending_st0, mshr_pending_st1;
     wire [MSHR_ADDR_WIDTH-1:0]      mshr_previd_st0, mshr_previd_st1;
@@ -235,7 +247,10 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .bank_empty  (no_pending_req)
     );
 
-    wire pipe_stall = crsp_queue_stall;
+    // amo_chain_stall paces a same-line AMO behind an in-flight commit by one
+    // cycle so the prior result reaches the writeback register; it is 0 for all
+    // non-AMO traffic, so the baseline pipe is unaffected.
+    wire pipe_stall = crsp_queue_stall || amo_chain_stall;
 
     // inputs arbitration:
     // mshr replay has highest priority to maximize utilization since there is no miss.
@@ -251,43 +266,12 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     wire flush_enable = flush_grant && flush_valid;
 
     wire creq_grant  = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
-    // creq fires from real core_req OR from a pending AMO writeback
-    // (the synthetic write produced after an AMO commit at S1). The
-    // two paths are mutually exclusive: when amo_wb_pending=1 the sel
-    // mux feeds wb data, so a simultaneous real core_req cannot make
-    // it into pipe_reg0 — gate accordingly to avoid letting a stuck
-    // upstream core_req_valid pull the wb data into the pipe twice.
-    // When AMO_ENABLE=0 the writeback machinery is tied off so amo_wb_path=0.
-    //
-    // Age-ordering (non-LLC AMO configs only). Two symmetric holds at the
-    // input, both releasing when the in-flight entry retires:
-    //   * an incoming AMO waits while a load fill is pending for its line,
-    //     so its local invalidate lands on the installed (else stale) line;
-    //   * an incoming plain load waits while an AMO passthrough is pending
-    //     for its line, so the load observes that AMO (same-hart same-
-    //     address program order).
-    // Also catch a same-line request that was just admitted and is
-    // allocating its MSHR entry at S0 this cycle — it is not yet visible
-    // to the MSHR probe (a 1-cycle window between admit and allocate).
-    wire amo_cfg = (AMO_ENABLE != 0) && (IS_LLC == 0);
-    wire st0_alloc_same_line = mshr_allocate_st0 && ~pipe_stall && (addr_st0 == core_req_addr);
-    wire st0_ld_alloc  = st0_alloc_same_line && ~amo_st0.amo_valid && ~rw_st0;
-    wire st0_amo_alloc = st0_alloc_same_line &&  amo_st0.amo_valid;
-    wire amo_input_defer  = amo_cfg && core_req_valid && core_req_amo.amo_valid
-                         && (mshr_probe_pending_ld  || st0_ld_alloc);
-    wire load_input_defer = amo_cfg && core_req_valid && ~core_req_amo.amo_valid && ~core_req_rw
-                         && (mshr_probe_pending_amo || st0_amo_alloc);
-    wire req_input_defer  = amo_input_defer || load_input_defer;
-    // The LLC AMO-commit writeback is single-outstanding. An AMO that will
-    // commit occupies the S0->S1->writeback path exclusively; admitting a
-    // second AMO (to another line) during that window lets it reach S1 while
-    // the writeback is busy, dropping its store. Block new admissions across
-    // the whole window: a committing AMO at S0 (predicted from its hit, non-LR),
-    // at S1 (do_store), or its writeback in flight. All terms are 0 at non-LLC
-    // banks (amo_hit/do_store gated to IS_LLC), so this only gates the LLC.
-    wire amo_do_store_st0 = (IS_LLC != 0) && amo_st0.amo_valid && valid_st0
-                         && is_creq_st0 && is_hit_st0 && (amo_st0.amo_op != AMO_OP_LR);
-    wire amo_commit_busy = amo_wb_pending || amo_do_store_st1 || amo_do_store_st0;
+    // creq fires from a real core_req or from a pending LLC AMO writeback
+    // (the synthetic write injected after a commit); the two are mutually
+    // exclusive. amo_commit_busy holds off new admits while a single-
+    // outstanding LLC commit is in flight; req_input_defer enforces non-LLC
+    // age-ordering. Both, plus amo_wb_pending/amo_hit_st1, are driven by the
+    // AMO engine below and tie to 0 when the bank carries no AMO logic.
     wire amo_creq_path = core_req_valid && ~amo_commit_busy && ~req_input_defer;
     wire amo_wb_path   = amo_wb_pending && ~amo_hit_st1;
     wire creq_enable   = creq_grant && (amo_creq_path || amo_wb_path);
@@ -316,13 +300,8 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     wire replay_fire   = replay_valid && replay_ready;
     wire mem_rsp_fire  = mem_rsp_valid && mem_rsp_ready;
     wire flush_fire    = flush_valid && flush_ready;
-    // wb fire: held off when another AMO is committing at S1 this
-    // cycle (either chain or different-line). Without this gate the
-    // wb would latch the stale pre-chain amo_wb_data_r into pipe_reg0
-    // while the chain branch updates the register, losing the chained
-    // AMO. core_req_fire uses the SAME gate so the wb either fires or
-    // stays out of the pipe — never gets captured while wb is
-    // suppressed (which would inject ghost wb's).
+    // amo_wb_path already excludes the cycle a fresh AMO commits at S1
+    // (amo_hit_st1), so the writeback never races the chain update.
     wire amo_wb_fire   = amo_wb_path && creq_grant
                       && ~mreq_queue_alm_full && ~mshr_alm_full && ~pipe_stall;
     wire core_req_fire = (amo_creq_path || amo_wb_path) && creq_grant
@@ -350,70 +329,29 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         assign flush_tag = '0;
     end
 
-    // ============================================================
-    // AMO writeback state machine
-    // ============================================================
-    // When AMO commits at S1 with do_store, we need to write the
-    // computed new_word back into the cache line. cache_data writes
-    // fire at st0 (write addr/data arrive at st0, commit on next clk
-    // edge), but new_word isn't ready until st1. We close the timing
-    // gap by injecting a synthetic core-req-like write at sel that
-    // flows through st0/st1 normally. The bank stalls accepting new
-    // core_req during the writeback (1-2 cycles) so wb commits before
-    // the next AMO can race.
-    reg                              amo_wb_pending;
-    reg [`CS_LINE_ADDR_WIDTH-1:0]    amo_wb_addr_r;
-    reg [WORD_SEL_WIDTH-1:0]         amo_wb_word_idx_r;
-    reg [WORD_SIZE-1:0]              amo_wb_byteen_r;
-    reg [`CS_WORD_WIDTH-1:0]         amo_wb_data_r;
-    reg [TAG_WIDTH-1:0]              amo_wb_tag_r;
-    reg [REQ_SEL_WIDTH-1:0]          amo_wb_idx_r;
-    reg [`UP(MEM_ATTR_WIDTH)-1:0]   amo_wb_attr_r;
-    // Post-wb forwarding: cache_data writes at S0 of the wb, so an AMO
-    // whose S0-read fired before the wb's S0 still has pre-wb data in
-    // read_data_st1. Keep amo_wb_data_r/addr_r forwardable for 2 more
-    // cycles after amo_wb_fire so those in-flight AMOs see the fresh
-    // value, and re-arm pending on chain so a follow-up wb commits the
-    // chained value to BRAM.
-    reg [1:0]                        amo_post_wb_age;
-    wire                             amo_post_wb_valid = (amo_post_wb_age != 2'd0);
-    `ifndef SC_FAIL_ST1_DECLARED
-    `define SC_FAIL_ST1_DECLARED
-    `endif
-
+    // Input arbitration mux. The AMO writeback fields tie to 0 when no LLC
+    // commit engine is present, so the wb arms prune away for non-AMO banks.
     assign valid_sel   = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
-    // The amo_wb_* paths collapse to 0 when AMO_ENABLE=0 (the writeback
-    // FSM never fires), so the same mux works whether or not AMOs are
-    // enabled — the synthesizer prunes the dead arms.
     assign rw_sel      = replay_valid ? replay_rw
                        : (amo_wb_pending ? 1'b1 : core_req_rw);
     assign byteen_sel  = replay_valid ? replay_byteen
-                       : (amo_wb_pending ? amo_wb_byteen_r : core_req_byteen);
+                       : (amo_wb_pending ? amo_wb_byteen : core_req_byteen);
     assign addr_sel    = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
                             (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr :
-                                (amo_wb_pending ? amo_wb_addr_r : core_req_addr)));
+                                (amo_wb_pending ? amo_wb_addr : core_req_addr)));
     assign word_idx_sel= replay_valid ? replay_wsel
-                       : (amo_wb_pending ? amo_wb_word_idx_r : core_req_wsel);
+                       : (amo_wb_pending ? amo_wb_word_idx : core_req_wsel);
     assign req_idx_sel = replay_valid ? replay_idx
-                       : (amo_wb_pending ? amo_wb_idx_r : core_req_idx);
+                       : (amo_wb_pending ? amo_wb_idx : core_req_idx);
     assign tag_sel     = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) :
                             (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s :
-                                (amo_wb_pending ? amo_wb_tag_r : core_req_tag)));
-    assign attr_sel   = amo_wb_pending ? amo_wb_attr_r
+                                (amo_wb_pending ? amo_wb_tag : core_req_tag)));
+    assign attr_sel   = amo_wb_pending ? amo_wb_attr
                        : (core_req_valid ? core_req_attr : '0);
-    // AMO sideband at sel:
-    //   * writeback synthetic creq: amo.valid=0 (plain write, no
-    //     double-commit at S1).
-    //   * replay: amo carried through MSHR — allocate-time amo_st0 is
-    //     restored on dequeue, so the AMO re-enters the pipe with the
-    //     same op/width/rhs/hart_id and commits on the now-warm line.
-    //   * real core_req: amo from the LSU sideband.
-    //
-    // Priority must match the sel mux's source priority (replay > wb >
-    // core_req): replay can fire during a pending wb (chained AMO
-    // replays from MSHR after a fill), so replay_amo must NOT be
-    // clobbered to zero by amo_wb_pending. A pending wb only zeros
-    // amo_sel when it actually wins arbitration (no replay/core_req).
+    // AMO sideband priority must match the sel mux (replay > wb > core_req):
+    // a replay can fire during a pending wb (chained AMO replays from MSHR
+    // after a fill), so it must not be cleared by amo_wb_pending. The
+    // synthetic writeback carries amo.valid=0 so it never re-commits at S1.
     assign amo_sel = replay_valid  ? replay_amo
                    : (amo_wb_pending ? amo_req_t'('0)
                    : (core_req_valid ? core_req_amo : amo_req_t'('0)));
@@ -423,7 +361,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
             if (i < `CS_WORD_WIDTH) begin : g_lo
                 assign data_sel[i] = replay_valid ? replay_data[i] :
                                       (mem_rsp_valid ? mem_rsp_data[i] :
-                                       (amo_wb_pending ? amo_wb_data_r[i] : core_req_data[i]));
+                                       (amo_wb_pending ? amo_wb_data[i] : core_req_data[i]));
             end else begin : g_hi
                 assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
             end
@@ -432,9 +370,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         assign data_sel = mem_rsp_data;
         `UNUSED_VAR (core_req_data)
         `UNUSED_VAR (replay_data)
-        // icache (WRITE_ENABLE=0) doesn't read amo_wb_data_r since the
-        // wb data only flows through the writable g_data_sel branch.
-        `UNUSED_VAR (amo_wb_data_r)
+        `UNUSED_VAR (amo_wb_data) // read-only banks have no writeback data
     end
 
     if (UUID_WIDTH != 0) begin : g_req_uuid_sel
@@ -450,19 +386,14 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     wire is_replay_sel = replay_enable;
 
     VX_pipe_register #(
-        .DATAW  (1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH
-                + AMO_REQ_BITS),
+        .DATAW  (1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + AMO_REQ_BITS),
         .RESETW (1)
     ) pipe_reg0 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~pipe_stall),
-        .data_in  ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, is_passthru_fill_sel, attr_sel, flush_way,     addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id
-                  , amo_sel
-                  }),
-        .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_passthru_fill_st0, attr_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0
-                  , amo_st0
-                  })
+        .data_in  ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, is_passthru_fill_sel, attr_sel, flush_way,     addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id,     amo_sel}),
+        .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_passthru_fill_st0, attr_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0, amo_st0})
     );
 
     if (UUID_WIDTH != 0) begin : g_req_uuid_st0
@@ -567,19 +498,14 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0;
 
     VX_pipe_register #(
-        .DATAW  (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_WORD_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1
-                + AMO_REQ_BITS),
+        .DATAW  (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_ATTR_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_WORD_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + AMO_REQ_BITS),
         .RESETW (1)
     ) pipe_reg1 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~pipe_stall),
-        .data_in  ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, attr_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, write_word_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0
-                  , amo_st0
-                  }),
-        .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, attr_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, write_word_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1
-                  , amo_st1
-                  })
+        .data_in  ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, attr_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, write_word_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0, amo_st0}),
+        .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, attr_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, write_word_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1, amo_st1})
     );
 
     if (UUID_WIDTH != 0) begin : g_req_uuid_st1
@@ -674,8 +600,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         .MSHR_SIZE   (MSHR_SIZE),
         .WRITEBACK   (WRITEBACK),
         .AMO_ENABLE  ((AMO_ENABLE != 0) && (IS_LLC == 0)),
-        .DATA_WIDTH  (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH
-                    + AMO_REQ_BITS)
+        .DATA_WIDTH  (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH + AMO_REQ_BITS)
     ) cache_mshr (
         .clk            (clk),
         .reset          (reset),
@@ -691,16 +616,14 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
 
         // probe: pending entries for the incoming request's line, by type.
         .probe_addr     (core_req_addr),
-        .probe_pending_ld  (mshr_probe_pending_ld),
+        .probe_pending_ld (mshr_probe_pending_ld),
         .probe_pending_amo (mshr_probe_pending_amo),
 
         // dequeue
         .dequeue_valid  (replay_valid),
         .dequeue_addr   (replay_addr),
         .dequeue_rw     (replay_rw),
-        .dequeue_data   ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx
-                         , replay_amo
-                         }),
+        .dequeue_data   ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx, replay_amo}),
         .dequeue_id     (replay_id),
         .dequeue_ready  (replay_ready),
 
@@ -712,9 +635,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         // round-trip). At the LLC, same-line AMOs coalesce and serialize
         // their commits on the single filled line.
         .allocate_is_amo((AMO_ENABLE && !IS_LLC) ? amo_st0.amo_valid : 1'b0),
-        .allocate_data  ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0
-                         , amo_st0
-                         }),
+        .allocate_data  ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0, amo_st0}),
         .allocate_id    (mshr_alloc_id_st0),
         .allocate_pending(mshr_pending_raw_st0),
         .allocate_previd(mshr_previd_st0),
@@ -729,190 +650,102 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     );
 
     // ============================================================
-    // Non-LLC AMO passthrough
-    // ============================================================
-    // An AMO at a non-LLC bank is forwarded downstream (the LLC performs
-    // the RMW) and its result word is routed back to the requester — it is
-    // never committed locally. We reuse the miss->fill->replay path with
-    // three twists, all gated to non-LLC AMO entries:
-    //   1) the AMO never coalesces (MSHR amo_table),
-    //   2) its fill does NOT install a line (the response is a result
-    //      word, not a cacheable line), and
-    //   3) the replay emits the latched result word as the core response.
-    wire is_amo_fwd_st0;     // AMO first pass at a non-LLC bank (st0)
-    wire is_amo_fwd_st1;     // AMO first pass at a non-LLC bank (st1)
-    wire is_amo_replay_st1;  // replay of a forwarded AMO (carries result)
-    wire is_passthru_fill_sel;   // incoming fill targets a passthru entry
-    wire [`CS_WORD_WIDTH-1:0] amo_ptw_word_st1; // latched result word @ st1
-
-    if ((AMO_ENABLE != 0) && (IS_LLC == 0)) begin : g_amo_ptw
-        assign is_amo_fwd_st0 = amo_st0.amo_valid && valid_st0 && is_creq_st0 && ~is_replay_st0;
-        assign is_amo_fwd_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && ~is_replay_st1;
-        assign is_amo_replay_st1 = amo_st1.amo_valid && valid_st1 && is_creq_st1 && is_replay_st1;
-
-        reg [MSHR_SIZE-1:0]      amo_ptw_flag;   // entry awaits a passthru fill
-        reg [WORD_SEL_WIDTH-1:0] amo_ptw_wsel [MSHR_SIZE];
-        reg [`CS_WORD_WIDTH-1:0] amo_ptw_word [MSHR_SIZE];
-
-        wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] mem_rsp_words = mem_rsp_data;
-
-        assign is_passthru_fill_sel = is_fill_sel && amo_ptw_flag[mem_rsp_id];
-        assign amo_ptw_word_st1 = amo_ptw_word[mshr_id_st1];
-
-        always @(posedge clk) begin
-            if (reset) begin
-                amo_ptw_flag <= '0;
-            end else begin
-                // mark the AMO's MSHR entry on allocation
-                if (is_amo_fwd_st0 && mshr_allocate_st0 && ~pipe_stall) begin
-                    amo_ptw_flag[mshr_alloc_id_st0] <= 1'b1;
-                    amo_ptw_wsel[mshr_alloc_id_st0] <= word_idx_st0;
-                end
-                // latch the result word on the passthru fill, clear the flag
-                if (mem_rsp_fire && amo_ptw_flag[mem_rsp_id]) begin
-                    amo_ptw_word[mem_rsp_id] <= mem_rsp_words[amo_ptw_wsel[mem_rsp_id]];
-                    amo_ptw_flag[mem_rsp_id] <= 1'b0;
-                end
-            end
-        end
-    end else begin : g_no_amo_ptw
-        assign is_amo_fwd_st0 = 1'b0;
-        assign is_amo_fwd_st1 = 1'b0;
-        assign is_amo_replay_st1 = 1'b0;
-        assign is_passthru_fill_sel = 1'b0;
-        assign amo_ptw_word_st1 = '0;
-    end
-
-    // Force the AMO requester non-pending so it never coalesces onto a
-    // prior (plain-read) entry for the same line — each atomic must take
-    // its own downstream round-trip.
-    assign mshr_pending_st0 = mshr_pending_raw_st0 && ~is_amo_fwd_st0;
-
-    // Passthru replay is treated as a hit (its line was never installed):
-    // fires the core response, allocates no mreq, releases the MSHR entry.
-    wire eff_hit_st1 = is_hit_st1 || is_amo_replay_st1;
-
-    // ============================================================
-    // AMO commit path
+    // AMO engine
     // ============================================================
-    //
-    // VX_amo_unit instantiated only at the LLC bank. The compute
-    // kernel is combinational (read line word at S1 → ret_word /
-    // new_word). `res_check` from the reservation table drives the SC
-    // outcome. Compute outputs feed the response data mux below.
-    //
-    // Response data mux: when amo.valid && hit at S1, override
-    // crsp_queue_data with amo_alu.ret_word (LR/AMO* old value at
-    // byte offset) or with 0/1 for SC.
-    wire [63:0] amo_compute_new;
-    wire [63:0] amo_compute_ret;
-    wire        amo_res_check;
-    // Bit-offset of the AMO target within the cache word.
-    // Set inside the IS_LLC generate; tied to 0 for non-LLC banks.
-    wire [`CLOG2(`CS_WORD_WIDTH)-1:0] amo_bit_off_st1;
-    // Forward in-flight wb back into compute_old when a new AMO targets
-    // the same line. Driven inside the IS_LLC generate.
-    wire        amo_fwd_active_st1;
-
-    if (IS_LLC) begin : g_amo_unit
-        // Byte-offset alignment: the AMO target may sit at a non-zero
-        // byte offset within the cache word. byteen_st1 has the targeted
-        // bytes set; the lowest set bit gives byte_off. We shift the
-        // cache word right by byte_off*8 to expose the operand at bit 0,
-        // and shift compute results back to that offset for the response
-        // and writeback.
-        localparam BYTE_OFF_BITS = `CLOG2(WORD_SIZE);
-        localparam BIT_OFF_BITS  = `CLOG2(`CS_WORD_WIDTH);
-        localparam AMO_OLD_BITS  = (`CS_WORD_WIDTH < 64) ? `CS_WORD_WIDTH : 64;
-
-        wire [`UP(BYTE_OFF_BITS)-1:0] amo_byte_off_st1;
-        VX_priority_encoder #(
-            .N (WORD_SIZE)
-        ) amo_byte_off_enc (
-            .data_in    (byteen_st1),
-            .index_out  (amo_byte_off_st1),
-            `UNUSED_PIN (valid_out),
-            `UNUSED_PIN (onehot_out)
-        );
-        assign amo_bit_off_st1 = BIT_OFF_BITS'({amo_byte_off_st1, 3'b0});
-
-        // Forward amo_wb_data_r when an in-flight writeback (or a
-        // recently-fired one) targets the same line. cache_data isn't
-        // updated until the synthetic wb fires at S0; an AMO whose S0
-        // read pre-dates the wb's S0 still sees pre-wb data in
-        // read_data_st1. amo_post_wb_valid extends forwarding for 2
-        // cycles past amo_wb_fire to cover that window.
-        assign amo_fwd_active_st1 = (amo_wb_pending || amo_post_wb_valid)
-                                 && (amo_wb_addr_r == addr_st1);
-        wire [`CS_WORD_WIDTH-1:0] amo_line_word_raw_st1 = read_data_st1[word_idx_st1];
-        wire [`CS_WORD_WIDTH-1:0] amo_line_word_st1 =
-            amo_fwd_active_st1 ? amo_wb_data_r : amo_line_word_raw_st1;
-        wire [`CS_WORD_WIDTH-1:0] amo_line_word_shifted_st1 =
-            amo_line_word_st1 >> amo_bit_off_st1;
-        // Width / rhs are no longer carried in amo_req_t (slim form). Derive
-        // width from byteen popcount (.W → 4 bytes set, .D → 8) and rhs by
-        // shifting the request data word by the AMO bit-offset like the
-        // old line word. AMO operands top out at .D (64 bits); upper bits
-        // of the shifted cache word are unused.
-        wire [1:0] amo_width_st1 = ($countones(byteen_st1) == 8) ? 2'd3 : 2'd2;
-        wire [`CS_WORD_WIDTH-1:0] amo_rhs_word_shifted_st1 = write_word_st1 >> amo_bit_off_st1;
-        wire [63:0] amo_rhs_st1 = (amo_width_st1 == 2'd2)
-                                ? 64'({32'h0, amo_rhs_word_shifted_st1[31:0]})
-                                : 64'(amo_rhs_word_shifted_st1[AMO_OLD_BITS-1:0]);
-        wire [63:0] amo_compute_old = (amo_width_st1 == 2'd2)
-                                    ? 64'({32'h0, amo_line_word_shifted_st1[31:0]})
-                                    : 64'(amo_line_word_shifted_st1[AMO_OLD_BITS-1:0]);
-        if (`CS_WORD_WIDTH > 64) begin : g_amo_upper_unused
-            `UNUSED_VAR (amo_line_word_shifted_st1[`CS_WORD_WIDTH-1:64])
-            `UNUSED_VAR (amo_rhs_word_shifted_st1[`CS_WORD_WIDTH-1:64])
-        end
+    wire [`CS_WORD_WIDTH-1:0] read_word_st1 = read_data_st1[word_idx_st1];
 
-        // Drive reservation activity from S1 commit conditions.
-        // All three fire from the original AMO at S1 (amo_st1.hart_id
-        // is valid only there, not on the writeback cycle).
-        wire amo_res_reserve_w    = amo_hit_st1
-                                  && (amo_st1.amo_op == AMO_OP_LR);
-        wire amo_res_clear_w      = amo_hit_st1
-                                  && (amo_st1.amo_op == AMO_OP_SC);
-        // Break other harts' reservations on every committed write to the
-        // line, not just AMO stores: a plain store from another hart must
-        // fail a racing SC. AMOs ride the load path (rw=0), so do_write_st1
-        // captures plain stores only; amo_st1.hart_id is the requester's id
-        // (the LSU sets it for every lane) so the writer is excluded.
-        wire amo_res_invalidate_w = amo_do_store_st1 || do_write_st1;
-
-        VX_amo_unit #(
+    if (AMO_ENABLE) begin : g_amo
+        VX_cache_amo #(
+            .IS_LLC          (IS_LLC),
             .NUM_RES_ENTRIES (`VX_CFG_AMO_RS_SIZE),
-            .LINE_ADDR_BITS  (`CS_LINE_ADDR_WIDTH)
-        ) amo_unit (
-            .clk           (clk),
-            .reset         (reset),
-            .compute_op           (amo_st1.amo_op),
-            .compute_amo_unsigned (amo_st1.amo_unsigned),
-            .compute_width        (amo_width_st1),
-            .compute_old          (amo_compute_old),
-            .compute_rhs          (amo_rhs_st1),
-            .compute_new_word (amo_compute_new),
-            .compute_ret_word (amo_compute_ret),
-            .res_reserve   (amo_res_reserve_w),
-            .res_clear     (amo_res_clear_w),
-            .res_invalidate(amo_res_invalidate_w),
-            .res_hart_id   (amo_st1.hart_id),
-            .res_line_addr (addr_st1),
-            .res_check     (amo_res_check)
+            .LINE_ADDR_BITS  (`CS_LINE_ADDR_WIDTH),
+            .WORD_WIDTH      (`CS_WORD_WIDTH),
+            .WORD_SIZE       (WORD_SIZE),
+            .WORD_SEL_WIDTH  (WORD_SEL_WIDTH),
+            .TAG_WIDTH       (TAG_WIDTH),
+            .REQ_SEL_WIDTH   (REQ_SEL_WIDTH),
+            .ATTR_WIDTH      (`UP(MEM_ATTR_WIDTH)),
+            .MSHR_SIZE       (MSHR_SIZE),
+            .MSHR_ADDR_WIDTH (MSHR_ADDR_WIDTH),
+            .WORDS_PER_LINE  (`CS_WORDS_PER_LINE)
+        ) amo (
+            .clk             (clk),
+            .reset           (reset),
+            .pipe_stall      (pipe_stall),
+            .amo_st0         (amo_st0),
+            .valid_st0       (valid_st0),
+            .is_creq_st0     (is_creq_st0),
+            .is_hit_st0      (is_hit_st0),
+            .is_replay_st0   (is_replay_st0),
+            .amo_st1         (amo_st1),
+            .valid_st1       (valid_st1),
+            .is_creq_st1     (is_creq_st1),
+            .is_hit_st1      (is_hit_st1),
+            .is_replay_st1   (is_replay_st1),
+            .do_write_st1    (do_write_st1),
+            .read_word_st1   (read_word_st1),
+            .byteen_st1      (byteen_st1),
+            .write_word_st1  (write_word_st1),
+            .word_idx_st0    (word_idx_st0),
+            .word_idx_st1    (word_idx_st1),
+            .addr_st0        (addr_st0),
+            .addr_st1        (addr_st1),
+            .tag_st1         (tag_st1),
+            .req_idx_st1     (req_idx_st1),
+            .attr_st1        (attr_st1),
+            .wb_fire         (amo_wb_fire),
+            .mshr_allocate_st0 (mshr_allocate_st0),
+            .mshr_alloc_id_st0 (mshr_alloc_id_st0),
+            .mshr_id_st1     (mshr_id_st1),
+            .mem_rsp_fire    (mem_rsp_fire),
+            .mem_rsp_id      (mem_rsp_id),
+            .mem_rsp_data    (mem_rsp_data),
+            .is_fill_sel     (is_fill_sel),
+            .core_req_valid  (core_req_valid),
+            .core_req_is_amo (core_req_amo.amo_valid),
+            .core_req_rw     (core_req_rw),
+            .core_req_addr   (core_req_addr),
+            .rw_st0          (rw_st0),
+            .mshr_probe_pending_ld (mshr_probe_pending_ld),
+            .mshr_probe_pending_amo (mshr_probe_pending_amo),
+            .amo_hit_st1     (amo_hit_st1),
+            .commit_busy     (amo_commit_busy),
+            .chain_stall     (amo_chain_stall),
+            .wb_pending      (amo_wb_pending),
+            .rsp_data        (amo_rsp_data),
+            .wb_addr         (amo_wb_addr),
+            .wb_word_idx     (amo_wb_word_idx),
+            .wb_byteen       (amo_wb_byteen),
+            .wb_data         (amo_wb_data),
+            .wb_tag          (amo_wb_tag),
+            .wb_idx          (amo_wb_idx),
+            .wb_attr         (amo_wb_attr),
+            .is_amo_fwd_st0  (is_amo_fwd_st0),
+            .is_amo_fwd_st1  (is_amo_fwd_st1),
+            .is_amo_replay_st1 (is_amo_replay_st1),
+            .is_passthru_fill_sel (is_passthru_fill_sel),
+            .amo_ptw_word_st1 (amo_ptw_word_st1),
+            .req_input_defer (req_input_defer)
         );
-    end else begin : g_no_amo_unit
-        // Non-LLC banks: AMO commit machinery is not synthesized.
-        // Sink the conditionally-unused inputs so lint stays clean.
-        assign amo_compute_new = '0;
-        assign amo_compute_ret = '0;
-        assign amo_res_check   = 1'b0;
-        assign amo_bit_off_st1 = '0;
-        assign amo_fwd_active_st1 = 1'b0;
+    end else begin : g_no_amo
+        assign {amo_hit_st1, amo_commit_busy, amo_wb_pending, amo_chain_stall} = '0;
+        assign {amo_rsp_data, amo_wb_addr, amo_wb_word_idx, amo_wb_byteen} = '0;
+        assign {amo_wb_data, amo_wb_tag, amo_wb_idx, amo_wb_attr} = '0;
+        assign {is_amo_fwd_st0, is_amo_fwd_st1, is_amo_replay_st1} = '0;
+        assign {is_passthru_fill_sel, amo_ptw_word_st1, req_input_defer} = '0;
         `UNUSED_VAR (amo_st1)
-        `UNUSED_VAR (addr_st1)
+        `UNUSED_VAR (amo_wb_fire)
+        `UNUSED_VAR (mshr_probe_pending_ld)
+        `UNUSED_VAR (mshr_probe_pending_amo)
     end
-    `UNUSED_VAR (amo_compute_new)
+
+    // Force the AMO requester non-pending so it never coalesces onto a prior
+    // entry for the same line — each atomic takes its own downstream trip.
+    assign mshr_pending_st0 = mshr_pending_raw_st0 && ~is_amo_fwd_st0;
+
+    // Passthru replay counts as a hit (its line was never installed): fires
+    // the core response, allocates no mreq, releases the MSHR entry.
+    wire eff_hit_st1 = is_hit_st1 || is_amo_replay_st1;
 
     // schedule core response
 
@@ -921,90 +754,20 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
     wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
     wire [TAG_WIDTH-1:0] crsp_queue_tag;
 
-    // Fire response for AMO requests on hit at S1. AMO bits are carried
-    // through the MSHR so a replay re-enters with valid amo fields and
-    // commits on the freshly filled line.
-    // Local AMO commit happens only at the LLC. Non-LLC banks forward the
-    // AMO downstream (passthrough) and never commit/writeback it locally.
-    assign amo_hit_st1 = amo_st1.amo_valid && is_hit_st1 && valid_st1 && is_creq_st1 && (IS_LLC != 0);
-    assign sc_fail_st1 = (amo_st1.amo_op == AMO_OP_SC) && ~amo_res_check;
-
-    // do_store: AMO commit other than LR or sc_fail. Triggers the
-    // writeback state machine to inject a synthetic write at sel.
-    assign amo_do_store_st1 = amo_hit_st1
-                           && (amo_st1.amo_op != AMO_OP_LR)
-                           && ~sc_fail_st1;
-
-    // Writeback data: byteen is the AMO request's own byteen,
-    // already at the correct offset within the cache word.
-    wire [WORD_SIZE-1:0] amo_wb_byteen_w = byteen_st1;
-    // Place new_word at the byte-offset slot within the cache word.
-    wire [`CS_WORD_WIDTH-1:0] amo_wb_data_w =
-        `CS_WORD_WIDTH'(amo_compute_new) << amo_bit_off_st1;
-
-    always @(posedge clk) begin
-        if (reset) begin
-            amo_wb_pending  <= 1'b0;
-            amo_post_wb_age <= 2'd0;
-        end else begin
-            // amo_post_wb_age counts down from 2 after amo_wb_fire,
-            // keeping amo_fwd_active alive across the BRAM settle window.
-            if (amo_wb_fire) begin
-                amo_post_wb_age <= 2'd2;
-            end else if (amo_post_wb_valid) begin
-                amo_post_wb_age <= amo_post_wb_age - 2'd1;
-            end
-
-            if (amo_do_store_st1 && ~pipe_stall && ~amo_wb_pending && ~amo_fwd_active_st1) begin
-                // Fresh wb: no in-flight or recently-fired wb on this line.
-                amo_wb_pending    <= 1'b1;
-                amo_wb_addr_r     <= addr_st1;
-                amo_wb_word_idx_r <= word_idx_st1;
-                amo_wb_byteen_r   <= amo_wb_byteen_w;
-                amo_wb_data_r     <= amo_wb_data_w;
-                amo_wb_tag_r      <= tag_st1;
-                amo_wb_idx_r      <= req_idx_st1;
-                amo_wb_attr_r    <= attr_st1;
-            end else if (amo_do_store_st1 && ~pipe_stall && amo_fwd_active_st1) begin
-                // Chain into existing wb on the same line. amo_compute_new
-                // was computed against the forwarded value, so overwriting
-                // folds this AMO's effect into the next wb. Re-arm pending
-                // so the chained value commits to BRAM.
-                amo_wb_data_r  <= amo_wb_data_w;
-                amo_wb_pending <= 1'b1;
-            end else if (amo_wb_fire) begin
-                amo_wb_pending <= 1'b0;
-            end
-        end
-    end
-
     // crsp_queue fires for reads and AMO commits at S1 on hit, but not
     // for the synthetic writeback write (rw=1). A non-LLC AMO's first
     // pass forwards downstream and must NOT respond locally; its result
     // returns later via the passthru replay (eff_hit covers that replay).
-    assign crsp_queue_valid = do_read_st1 && eff_hit_st1 && ~is_amo_fwd_st1;
+    // Suppress the response while a same-line AMO is chain-stalled at S1, so a
+    // read held for the extra pacing cycle enqueues its response exactly once
+    // (it fires when the op advances). amo_chain_stall is 0 for non-AMO traffic.
+    assign crsp_queue_valid = do_read_st1 && eff_hit_st1 && ~is_amo_fwd_st1 && ~amo_chain_stall;
     assign crsp_queue_idx   = req_idx_st1;
-    // Response data mux:
-    //   * AMO-SC at hit: 0 (success) or 1 (fail).
-    //   * AMO-other at hit: compute_ret_word (old value). LSU sexts to XLEN.
-    //   * Plain load: read_data_st1[word_idx_st1].
-    // When AMO_ENABLE=0, amo_hit_st1 is tied off and the AMO arms collapse.
-    // amo_rsp_word is 64-bit wide for .D AMOs; upper bits truncated by cast.
-    wire [63:0] amo_rsp_word  = (amo_st1.amo_op == AMO_OP_SC)
-                              ? {63'h0, sc_fail_st1}
-                              : amo_compute_ret;
-    if (`CS_WORD_WIDTH < 64) begin : g_amo_rsp_upper_unused
-        `UNUSED_VAR (amo_rsp_word[63:`CS_WORD_WIDTH])
-    end
-    // Place the AMO response at the byte-offset slot the LSU extracts.
-    // Without this shift the response only lands correctly for line-aligned addresses.
-    wire [`CS_WORD_WIDTH-1:0] amo_rsp_aligned =
-        `CS_WORD_WIDTH'(amo_rsp_word) << amo_bit_off_st1;
-    // A passthru-AMO replay returns the result word latched from the
-    // downstream response (no installed line to read).
+    // Response data: passthru replay returns the latched downstream result,
+    // an LLC AMO commit returns its formatted result word, else plain load.
     assign crsp_queue_data  = is_amo_replay_st1 ? amo_ptw_word_st1
-                            : (amo_hit_st1 ? amo_rsp_aligned
-                            : read_data_st1[word_idx_st1]);
+                            : (amo_hit_st1 ? amo_rsp_data
+                            : read_word_st1);
     assign crsp_queue_tag   = tag_st1;
 
     VX_elastic_buffer #(
@@ -1088,8 +851,7 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
         end
     end else begin : g_mreq_queue_ro
         // issue a fill request on a read miss
-        assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
-                              && ~pipe_stall;
+        assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) && ~pipe_stall;
         assign mreq_queue_addr = addr_st1;
         assign mreq_queue_rw = 0;
         assign mreq_queue_data = '0;
@@ -1138,10 +900,10 @@ module VX_cache_bank import VX_gpu_pkg::*; #(
 `ifdef PERF_ENABLE
     assign perf_read_miss  = do_read_st1 && ~is_hit_st1;
     assign perf_write_miss = do_write_st1 && ~is_hit_st1;
+    assign perf_evictions  = do_writeback_st1; // dirty-line writeback eviction
     assign perf_mshr_stall = mshr_alm_full;
 `endif
 
-
 `ifdef DBG_TRACE_CACHE
     wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
     wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh
index 6988f4b7f7..42772ca02b 100644
--- a/hw/rtl/cache/VX_cache_define.vh
+++ b/hw/rtl/cache/VX_cache_define.vh
@@ -65,6 +65,7 @@
     `PERF_COUNTER_ADD (dst, src, writes, PERF_CTR_BITS, count, (count > 1)) \
     `PERF_COUNTER_ADD (dst, src, read_misses, PERF_CTR_BITS, count, (count > 1)) \
     `PERF_COUNTER_ADD (dst, src, write_misses, PERF_CTR_BITS, count, (count > 1)) \
+    `PERF_COUNTER_ADD (dst, src, evictions, PERF_CTR_BITS, count, (count > 1)) \
     `PERF_COUNTER_ADD (dst, src, bank_stalls, PERF_CTR_BITS, count, (count > 1)) \
     `PERF_COUNTER_ADD (dst, src, mshr_stalls, PERF_CTR_BITS, count, (count > 1)) \
     `PERF_COUNTER_ADD (dst, src, mem_stalls, PERF_CTR_BITS, count, (count > 1)) \
diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv
index 3f4fdcb061..f5511cc2fd 100644
--- a/hw/rtl/cache/VX_cache_wrap.sv
+++ b/hw/rtl/cache/VX_cache_wrap.sv
@@ -262,6 +262,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
         assign cache_perf.writes       = perf_core_writes;
         assign cache_perf.read_misses  = '0;
         assign cache_perf.write_misses = '0;
+        assign cache_perf.evictions    = '0;
         assign cache_perf.bank_stalls  = '0;
         assign cache_perf.mshr_stalls  = '0;
         assign cache_perf.mem_stalls   = perf_mem_stalls;
diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv
index 4856437b00..696441f858 100644
--- a/hw/rtl/core/VX_csr_data.sv
+++ b/hw/rtl/core/VX_csr_data.sv
@@ -293,9 +293,7 @@ import VX_fpu_pkg::*;
                         // PERF: branches
                         `CSR_READ_64(`VX_CSR_MPM_BRANCHES, read_data_ro_w, pipeline_perf.sched.branches);
                         `CSR_READ_64(`VX_CSR_MPM_DIVERGENCE, read_data_ro_w, pipeline_perf.sched.divergence);
-                        // PERF: memory
-                        `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
-                        `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
+                        // PERF: memory (core-issued requests; DRAM traffic is in the MEM class)
                         `CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
                         `CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
                         `CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
@@ -304,57 +302,71 @@ import VX_fpu_pkg::*;
                         default:;
                         endcase
                     end
-                `ifdef VX_CFG_VM_ENABLE
-                    `VX_DCR_MPM_CLASS_VM: begin
+                    `VX_DCR_MPM_CLASS_ICACHE: begin
                         case (read_addr)
-                        // PERF: VM/MMU (icache + dcache MMU summed)
-                        `CSR_READ_64(`VX_CSR_MPM_TLB_READS,   read_data_ro_w, pipeline_perf.mmu.tlb_reads);
-                        `CSR_READ_64(`VX_CSR_MPM_TLB_HITS,    read_data_ro_w, pipeline_perf.mmu.tlb_hits);
-                        `CSR_READ_64(`VX_CSR_MPM_TLB_MISSES,  read_data_ro_w, pipeline_perf.mmu.tlb_misses);
-                        `CSR_READ_64(`VX_CSR_MPM_TLB_EVICTS,  read_data_ro_w, pipeline_perf.mmu.tlb_evictions);
-                        `CSR_READ_64(`VX_CSR_MPM_PTW_WALKS,   read_data_ro_w, pipeline_perf.mmu.ptw_walks);
-                        `CSR_READ_64(`VX_CSR_MPM_PTW_LATENCY, read_data_ro_w, pipeline_perf.mmu.ptw_latency);
+                        `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
+                        `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
+                        `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
                         default:;
                         endcase
                     end
-                `endif
-                    `VX_DCR_MPM_CLASS_MEM: begin
+                    `VX_DCR_MPM_CLASS_DCACHE: begin
                         case (read_addr)
-                        // PERF: icache
-                        `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
-                        `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
-                        `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
-                        // PERF: dcache
                         `CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
                         `CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
                         `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
                         `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
+                        `CSR_READ_64(`VX_CSR_MPM_DCACHE_EVICTS, read_data_ro_w, sysmem_perf.dcache.evictions);
                         `CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
                         `CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
-                        // PERF: lmem
-                        `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
-                        `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
-                        `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
-                        // PERF: l2cache
+                        default:;
+                        endcase
+                    end
+                    `VX_DCR_MPM_CLASS_L2CACHE: begin
+                        case (read_addr)
                         `CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
                         `CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
                         `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
                         `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
+                        `CSR_READ_64(`VX_CSR_MPM_L2CACHE_EVICTS, read_data_ro_w, sysmem_perf.l2cache.evictions);
                         `CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
                         `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
-                        // PERF: l3cache
+                        default:;
+                        endcase
+                    end
+                    `VX_DCR_MPM_CLASS_L3CACHE: begin
+                        case (read_addr)
                         `CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
                         `CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
                         `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
                         `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
+                        `CSR_READ_64(`VX_CSR_MPM_L3CACHE_EVICTS, read_data_ro_w, sysmem_perf.l3cache.evictions);
                         `CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
                         `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
-                        // PERF: memory
+                        default:;
+                        endcase
+                    end
+                    `VX_DCR_MPM_CLASS_MEM: begin
+                        case (read_addr)
+                        // PERF: off-chip memory
                         `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
                         `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
                         `CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
+                        // PERF: lmem
+                        `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
+                        `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
+                        `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
                         // PERF: coalescer
                         `CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
+                    `ifdef VX_CFG_VM_ENABLE
+                        // PERF: VM/MMU (icache + dcache MMU summed)
+                        `CSR_READ_64(`VX_CSR_MPM_TLB_READS,   read_data_ro_w, pipeline_perf.mmu.tlb_reads);
+                        `CSR_READ_64(`VX_CSR_MPM_TLB_HITS,    read_data_ro_w, pipeline_perf.mmu.tlb_hits);
+                        `CSR_READ_64(`VX_CSR_MPM_TLB_MISSES,  read_data_ro_w, pipeline_perf.mmu.tlb_misses);
+                        `CSR_READ_64(`VX_CSR_MPM_TLB_EVICTS,  read_data_ro_w, pipeline_perf.mmu.tlb_evictions);
+                        `CSR_READ_64(`VX_CSR_MPM_PTW_WALKS,   read_data_ro_w, pipeline_perf.mmu.ptw_walks);
+                        `CSR_READ_64(`VX_CSR_MPM_PTW_LATENCY, read_data_ro_w, pipeline_perf.mmu.ptw_latency);
+                    `endif
                         default:;
                         endcase
                     end
diff --git a/hw/scripts/saif_filter.py b/hw/scripts/saif_filter.py
new file mode 100755
index 0000000000..9b36f14a15
--- /dev/null
+++ b/hw/scripts/saif_filter.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+# Extract an instance subtree from a Verilator master SAIF and re-root it under a
+# synthesis top module, so the result can be annotated directly by the hw/syn flows
+# (Vivado read_saif, Synopsys read_saif, OpenSTA read_saif).
+#
+# The master SAIF produced by Verilator (--trace-saif) is rooted at the simulation
+# top (e.g. TOP/.../core/execute/tcu_unit/...) and contains the full design.  The
+# synthesis DUT is a standalone wrapper (e.g. VX_tcu_unit_top) whose top-level child
+# instance matches the extracted scope.  This tool slices the requested subtree
+# verbatim -- preserving every nested generate scope and net, regardless of tile size
+# or NUM_THREADS -- and wraps it under the named top module.
+#
+# Usage:
+#   saif_filter.py --instance <path> [--top <module>] [-o out.saif] master.saif
+#   saif_filter.py --list [--list-depth N] master.saif
+#
+#   --instance   slash-separated instance path to extract.  Matched as a suffix of the
+#                full hierarchy path, so "execute/tcu_unit" or just "tcu_unit" both work.
+#   --top        wrapper instance name placed above the extracted subtree (and written
+#                as the SAIF DESIGN).  Omit to emit the extracted instance as the root.
+#   --all        extract every match instead of only the first (names disambiguated).
+#   --list       print the instance hierarchy (for discovering paths) and exit.
+
+import argparse
+import re
+import sys
+
+INSTANCE_RE = re.compile(r'^(\s*)\(INSTANCE\s+(\S+)\s*$')
+HEADER_KEYS = ('DIVIDER', 'TIMESCALE', 'DURATION', 'DIRECTION', 'SAIFVERSION')
+
+
+def paren_delta(line):
+    """Net change in paren depth on a line, ignoring anything inside double quotes."""
+    delta = 0
+    in_str = False
+    for c in line:
+        if c == '"':
+            in_str = not in_str
+        elif in_str:
+            continue
+        elif c == '(':
+            delta += 1
+        elif c == ')':
+            delta -= 1
+    return delta
+
+
+def iter_instances(lines):
+    """Yield (start_idx, path) for every (INSTANCE ...) opener, tracking hierarchy.
+
+    Depth is the running paren balance *before* the line.  An instance opens one paren
+    on its own line, so its body lives one level deeper and closes when the balance
+    returns to the opener's level.
+    """
+    balance = 0
+    stack = []  # (name, level) for currently-open instances
+    for idx, line in enumerate(lines):
+        while stack and balance <= stack[-1][1]:
+            stack.pop()
+        m = INSTANCE_RE.match(line)
+        if m:
+            name = m.group(2)
+            path = [s[0] for s in stack] + [name]
+            yield idx, path
+            stack.append((name, balance))
+        balance += paren_delta(line)
+
+
+def capture_subtree(lines, start_idx):
+    """Return the list of lines forming the instance subtree opened at start_idx."""
+    depth = 0
+    out = []
+    for line in lines[start_idx:]:
+        out.append(line)
+        depth += paren_delta(line)
+        if depth == 0:
+            break
+    else:
+        raise ValueError("unterminated INSTANCE subtree (unbalanced parens)")
+    return out
+
+
+def dedent(block, pad):
+    """Strip the leading indentation of the root line from every line, then re-pad."""
+    root_indent = len(block[0]) - len(block[0].lstrip())
+    out = []
+    for line in block:
+        stripped = line[root_indent:] if line[:root_indent].isspace() else line.lstrip()
+        out.append(pad + stripped if stripped.strip() else stripped)
+    return out
+
+
+def grab_header(lines):
+    """Collect header fields to carry over into the emitted SAIF."""
+    hdr = {}
+    for line in lines:
+        s = line.strip()
+        if s.startswith('(INSTANCE'):
+            break
+        for key in HEADER_KEYS:
+            if s.startswith('(' + key):
+                hdr[key] = s
+    return hdr
+
+
+def emit(out, header, top, subtrees):
+    w = out.write
+    w('(SAIFILE\n')
+    w(header.get('SAIFVERSION', '(SAIFVERSION "2.0")') + '\n')
+    w(header.get('DIRECTION', '(DIRECTION "backward")') + '\n')
+    if top:
+        w('(DESIGN "%s")\n' % top)
+    w('(VENDOR "Verilator")\n')
+    w('(PROGRAM_NAME "Verilator")\n')
+    w(header.get('DIVIDER', '(DIVIDER / )') + '\n')
+    w(header.get('TIMESCALE', '(TIMESCALE 1ps)') + '\n')
+    w(header.get('DURATION', '(DURATION 0)') + '\n')
+    if top:
+        w(' (INSTANCE %s\n' % top)
+        pad = '  '
+    else:
+        pad = ' '
+    for block in subtrees:
+        for line in dedent(block, pad):
+            w(line if line.endswith('\n') else line + '\n')
+    if top:
+        w(' )\n')
+    w(')\n')
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('saif', help='master SAIF produced by Verilator --trace-saif')
+    ap.add_argument('--instance', help='slash-separated instance path to extract (suffix match)')
+    ap.add_argument('--top', help='wrapper top-module/instance name for the extracted subtree')
+    ap.add_argument('--all', action='store_true', help='extract every match, not just the first')
+    ap.add_argument('-o', '--output', help='output SAIF path (default: stdout)')
+    ap.add_argument('--list', action='store_true', help='print the instance hierarchy and exit')
+    ap.add_argument('--list-depth', type=int, default=6, help='max depth for --list (default 6)')
+    args = ap.parse_args()
+
+    with open(args.saif) as f:
+        lines = f.readlines()
+
+    if args.list:
+        for idx, path in iter_instances(lines):
+            if len(path) <= args.list_depth:
+                print('  ' * (len(path) - 1) + path[-1])
+        return 0
+
+    if not args.instance:
+        ap.error('--instance is required unless --list is given')
+
+    target = [p for p in args.instance.strip('/').split('/') if p]
+    n = len(target)
+    matches = [idx for idx, path in iter_instances(lines) if path[-n:] == target]
+    if not matches:
+        sys.stderr.write('ERROR: instance path not found: %s\n' % args.instance)
+        return 1
+    if not args.all:
+        matches = matches[:1]
+
+    subtrees = [capture_subtree(lines, idx) for idx in matches]
+    header = grab_header(lines)
+
+    out = open(args.output, 'w') if args.output else sys.stdout
+    try:
+        emit(out, header, args.top, subtrees)
+    finally:
+        if args.output:
+            out.close()
+
+    fedp = sum(1 for b in subtrees for ln in b if INSTANCE_RE.match(ln) and INSTANCE_RE.match(ln).group(2) == 'fedp')
+    sys.stderr.write('saif_filter: extracted %d instance(s) under "%s"; %d fedp leaf instance(s)\n'
+                     % (len(subtrees), args.top or target[-1], fedp))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/hw/syn/synopsys/Makefile b/hw/syn/synopsys/Makefile
index f5cc84b343..b3337681f5 100644
--- a/hw/syn/synopsys/Makefile
+++ b/hw/syn/synopsys/Makefile
@@ -24,6 +24,13 @@ ASAP7_LIB_ROOT  := $(ASAP7_ROOT)/asap7sc7p5t_28/LIB/NLDM
 ASAP7_SRAM_LIB  := $(ASAP7_ROOT)/ASAP7_SRAM_0p0/generated/LIB/srambank_128x4x32_6t122.lib
 ASAP7_SRAM_DB   := $(ASAP7_ROOT)/ASAP7_SRAM_0p0/generated/LIB/srambank_128x4x32_6t122.db
 ASAP7_NAME      := "ASAP7 (7nm)"
+# physical libs for layout/images (fc_shell): 4x-scaled LEF + .tf are 4000 DBU/um
+ASAP7_TF        := $(ASAP7_ROOT)/asap7_snps/icc/asap07_icc.tf
+ASAP7_LEFS      := $(wildcard $(ASAP7_ROOT)/asap7sc7p5t_28/LEF/scaled/asap7sc7p5t_28_*_4x_220121a.lef)
+ASAP7_TLU       := $(ASAP7_ROOT)/asap7_snps/starrc/asap07.tluplus
+ASAP7_DBS       := $(wildcard $(ASAP7_ROOT)/asap7sc7p5t_28/LIB/NLDM/*_TT_nldm.db)
+ASAP7_PSRAM_DBS := $(wildcard $(ASAP7_ROOT)/ASAP7_SRAM_0p0/generated/LIB/*.db)
+ASAP7_SCALE     := 4000
 
 SAED14_ROOT     := /mnt/nas0/eda.libs/saed14/EDK_03_2025
 SAED14_LIB_ROOT := $(SAED14_ROOT)/SAED14nm_EDK_STD_SLVT/liberty/nldm
@@ -46,6 +53,15 @@ ifeq ($(LIB_NAME),)
     $(error Invalid LIB_TYPE=$(LIB_TYPE).)
 endif
 
+# physical PDK selection for the layout/images flow (empty for libs not yet wired)
+PDK_TF       := $($(LIB_TYPE)_TF)
+PDK_LEFS     := $($(LIB_TYPE)_LEFS)
+PDK_TLU      := $($(LIB_TYPE)_TLU)
+PDK_DBS      := $($(LIB_TYPE)_DBS)
+PDK_SRAM_DBS := $($(LIB_TYPE)_PSRAM_DBS)
+PDK_SCALE    := $(or $($(LIB_TYPE)_SCALE),10000)
+IMG_UTIL     ?= 0.60
+
 $(info [Make] Using library $(LIB_NAME))
 
 DEFAULT_LIB := $(LIB_DIR)/NanGate_15nm_OCL.db
@@ -176,7 +192,7 @@ CFLAGS += -DVX_CFG_XLEN=$(XLEN) -DVX_CFG_XLEN_$(XLEN)
 CFLAGS += $(CONFIGS)
 CFLAGS += $(RTL_INCLUDE)
 
-.PHONY: clean gen-sources synthesis-nosram synthesis-estsram synthesis
+.PHONY: clean gen-sources synthesis-nosram synthesis-estsram synthesis images power-grid gds-render
 
 all: synthesis
 
@@ -234,5 +250,50 @@ synthesis-estsram: $(BUILD_DIR)/sources.txt
 	WALL_IGNORE=$(WALL_IGNORE) \
 	dc_shell -f $(SRC_DIR)/project.tcl
 
+# Layout images (placement, hierarchy, cell-density + power-density heat maps).
+# Requires a completed synthesis (out/$(TOP).mapped.v). Uses fc_shell under Xvfb since
+# icc2_shell is not installed. SAIF_FILE is optional -> power falls back to vectorless.
+images:
+	cd $(BUILD_DIR); \
+	TOP=$(TOP_LEVEL_ENTITY) \
+	NETLIST=out/$(TOP_LEVEL_ENTITY).mapped.v \
+	SDC=out/$(TOP_LEVEL_ENTITY).post_compile.sdc \
+	OUT_DIR=images \
+	SAIF_FILE="$(SAIF_FILE)" \
+	SAIF_INST="$(SAIF_INST)" \
+	PDK_TF="$(PDK_TF)" \
+	PDK_LEFS="$(PDK_LEFS)" \
+	PDK_TLU="$(PDK_TLU)" \
+	PDK_DBS="$(PDK_DBS)" \
+	PDK_SRAM_DBS="$(PDK_SRAM_DBS)" \
+	PDK_SCALE="$(PDK_SCALE)" \
+	UTIL="$(IMG_UTIL)" \
+	GDS="$(GDS)" \
+	GDS_PLACE_OPT="$(GDS_PLACE_OPT)" \
+	xvfb-run -a fc_shell -f $(SRC_DIR)/run_icc2.tcl > images.log 2>&1
+
+# Spatial grid power-density map (paper/EDA-tool style): dump per-cell power from the
+# placed block (images/<TOP>.dlib, so run 'make images' first), then grid + render.
+# SAIF_FILE drives activity (else vectorless). Tune grid with POWER_GRID_N.
+POWER_GRID_N ?= 64
+power-grid:
+	cd $(BUILD_DIR); \
+	TOP=$(TOP_LEVEL_ENTITY) \
+	OUT_DIR=images \
+	SAIF_FILE="$(SAIF_FILE)" \
+	SAIF_INST="$(SAIF_INST)" \
+	PDK_DBS="$(PDK_DBS)" \
+	CSV=images/cellpower.csv \
+	xvfb-run -a fc_shell -f $(SRC_DIR)/dump_power.tcl > power_grid.log 2>&1
+	cd $(BUILD_DIR); \
+	python3 $(SRC_DIR)/powergrid.py images/cellpower.csv images/power_grid.png $(POWER_GRID_N) "$(TOP_LEVEL_ENTITY)" >> power_grid.log 2>&1
+
+# Render a streamed GDS to layout images (full-chip overview + zoom) and a complete,
+# self-contained GDS (ASAP7 cell polygons merged in). Needs the GDS from 'make images GDS=1'.
+GDS_ZOOM_UM ?= 30
+gds-render:
+	cd $(BUILD_DIR); \
+	python3 $(SRC_DIR)/gds_render.py images/$(TOP_LEVEL_ENTITY).gds gdsimg $(GDS_ZOOM_UM) > gds_render.log 2>&1
+
 clean:
 	$(RMDIR) $(BUILD_DIR)
diff --git a/hw/syn/synopsys/dump_power.tcl b/hw/syn/synopsys/dump_power.tcl
new file mode 100644
index 0000000000..0984ed391c
--- /dev/null
+++ b/hw/syn/synopsys/dump_power.tcl
@@ -0,0 +1,36 @@
+## dump_power.tcl -- reopen a placed block, annotate power (SAIF or vectorless),
+## and dump per-cell (x, y, total_power, area) to CSV for a spatial power-density grid.
+## Pairs with powergrid.py. Run: xvfb-run -a fc_shell -f dump_power.tcl
+## env: OUT_DIR (dir with <TOP>.dlib), SAIF_FILE, SAIF_INST, CSV, PDK_DBS
+proc ev {n d} { if {[info exists ::env($n)] && [string trim $::env($n)] ne ""} { return $::env($n) }; return $d }
+set OUT_DIR [ev OUT_DIR images]
+set TOP     [ev TOP ""]
+set SAIF    [ev SAIF_FILE ""]
+set SI      [ev SAIF_INST ""]
+set CSV     [ev CSV $OUT_DIR/cellpower.csv]
+set DBS     [ev PDK_DBS ""]
+if {$DBS eq ""} { set DBS [glob -nocomplain /mnt/nas0/eda.libs/asap7/asap7sc7p5t_28/LIB/NLDM/*_TT_nldm.db] }
+set_app_var link_library [concat "*" $DBS]
+if {$TOP ne "" && [file isdirectory $OUT_DIR/${TOP}.dlib]} {
+  open_lib $OUT_DIR/${TOP}.dlib
+} else {
+  open_lib [lindex [glob $OUT_DIR/*.dlib] 0]
+}
+open_block [lindex [get_object_name [get_blocks]] 0]
+link_block
+if {$SAIF ne "" && [file exists $SAIF]} { if {[catch {read_saif $SAIF -strip_path $SI} e]} { puts "SAIFERR $e" } }
+catch { report_power > $OUT_DIR/power.rpt }
+set bb ""
+foreach a {boundary_bbox bbox} { if {$bb eq ""} { catch { set bb [get_attribute [current_block] $a] } } }
+set fh [open $CSV w]
+puts $fh "#bbox [lindex [lindex $bb 0] 0] [lindex [lindex $bb 0] 1] [lindex [lindex $bb 1] 0] [lindex [lindex $bb 1] 1]"
+puts $fh "x,y,power,area"
+set cnt 0
+foreach_in_collection c [get_cells -hierarchical -filter "is_hierarchical==false"] {
+  set o [get_attribute $c origin]; set p [get_attribute $c total_power]; set ar [get_attribute $c area]
+  if {$o eq "" || $p eq ""} continue
+  puts $fh "[lindex $o 0],[lindex $o 1],$p,$ar"; incr cnt
+}
+close $fh
+puts "DUMP DONE: $cnt cells -> $CSV"
+exit
diff --git a/hw/syn/synopsys/gds_render.py b/hw/syn/synopsys/gds_render.py
new file mode 100644
index 0000000000..5b562d7ff5
--- /dev/null
+++ b/hw/syn/synopsys/gds_render.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""gds_render.py - complete + render an FC placement GDS.
+
+Fusion Compiler streams the design GDS with std-cell PLACEMENTS only (SREFs by
+name); the ASAP7 std-cell polygons are not embedded (LEF-only physical lib). This
+script merges the ASAP7 cell GDS (drawn at the same 4x scale as the 4x LEF used
+for placement, so coordinates align) to produce a complete, self-contained GDS,
+then rasterizes a full-chip overview and a zoomed crop with pycairo.
+
+Usage: gds_render.py <design.gds> <out_dir> [zoom_um]
+Outputs in <out_dir>: <top>.complete.gds, gds_full.png, gds_zoom.png
+"""
+import sys, os, math
+import gdstk, cairo
+
+DESIGN = sys.argv[1]
+OUT    = sys.argv[2]
+ZOOM_UM = float(sys.argv[3]) if len(sys.argv) > 3 else 30.0   # window size (4x um)
+
+GDSDIR = os.environ.get("ASAP7_GDS_DIR", "/mnt/nas0/eda.libs/asap7/asap7sc7p5t_28/GDS")
+CELLGDS = ["asap7sc7p5t_28_R_220121a.gds", "asap7sc7p5t_28_L_220121a.gds",
+           "asap7sc7p5t_28_SL_220121a.gds", "asap7sc7p5t_28_SRAM_220121a.gds"]
+# ASAP7 ships GDS at 1x but the 4x-scaled LEF was used for placement, so cells
+# must be magnified to match the 4x placement coordinates.
+MAG = float(os.environ.get("GDS_CELL_MAG", "4"))
+
+os.makedirs(OUT, exist_ok=True)
+
+# ---- load std-cell defs ----
+cellmap = {}
+for f in CELLGDS:
+    p = os.path.join(GDSDIR, f)
+    if not os.path.exists(p):
+        continue
+    for c in gdstk.read_gds(p).cells:
+        cellmap.setdefault(c.name, c)
+print("[gds_render] ASAP7 cell defs: %d" % len(cellmap))
+
+# ---- load design ----
+d = gdstk.read_gds(DESIGN)
+top = d.top_level()[0]
+refs = top.references
+print("[gds_render] top %s: %d placements" % (top.name, len(refs)))
+
+# ---- merge: add referenced cell defs -> complete GDS ----
+used = {}
+for r in refs:
+    n = r.cell if isinstance(r.cell, str) else r.cell.name
+    used[n] = used.get(n, 0) + 1
+resolved = [n for n in used if n in cellmap]
+missing  = [n for n in used if n not in cellmap]
+print("[gds_render] distinct cells %d  resolved %d  missing %d" % (len(used), len(resolved), len(missing)))
+if missing:
+    print("[gds_render] missing:", sorted(missing)[:20])
+# scale each cell's geometry by MAG and add under the same name, so the design's
+# SREFs (magnification 1) resolve to correctly-sized cells -> consistent GDS.
+for n in resolved:
+    src = cellmap[n]
+    nc = gdstk.Cell(n)
+    nc.add(*[p.scale(MAG, center=(0, 0)) for p in src.get_polygons(depth=None)])
+    d.add(nc)
+complete = os.path.join(OUT, top.name + ".complete.gds")
+d.write_gds(complete)
+print("[gds_render] wrote %s (%d bytes)" % (complete, os.path.getsize(complete)))
+
+# ---- layer color map (by GDS layer number) ----
+LAYER_RGB = {}
+_palette = [(0.20,0.45,0.95),(0.95,0.30,0.25),(0.20,0.75,0.35),(0.95,0.80,0.20),
+            (0.30,0.80,0.85),(0.85,0.35,0.85),(0.95,0.55,0.20),(0.70,0.70,0.75),
+            (0.55,0.85,0.45),(0.45,0.55,0.90),(0.90,0.50,0.55),(0.60,0.40,0.85)]
+def layer_color(layer):
+    if layer not in LAYER_RGB:
+        LAYER_RGB[layer] = _palette[len(LAYER_RGB) % len(_palette)]
+    return LAYER_RGB[layer]
+
+def vt_color(name):
+    if name.endswith("_SRAM"): return (0.95,0.55,0.20)
+    if name.endswith("_SL"):   return (0.95,0.30,0.25)
+    if name.endswith("_L"):    return (0.20,0.75,0.35)
+    return (0.20,0.45,0.95)  # _R
+
+(x0,y0),(x1,y1) = top.bounding_box()
+W = x1-x0; H = y1-y0
+
+# ---- full-chip overview: cell footprints colored by Vt flavor ----
+def render_full(png, px=2000):
+    sc = px/max(W,H)
+    wpx, hpx = int(W*sc)+2, int(H*sc)+2
+    surf = cairo.ImageSurface(cairo.FORMAT_ARGB32, wpx, hpx)
+    cr = cairo.Context(surf)
+    cr.set_source_rgb(0.06,0.06,0.08); cr.paint()
+    # cache cell footprint sizes
+    size = {}
+    for n in resolved:
+        (a,b),(c,e) = cellmap[n].bounding_box()
+        size[n] = ((c-a)*MAG, (e-b)*MAG)
+    for r in refs:
+        n = r.cell if isinstance(r.cell,str) else r.cell.name
+        if n not in size: continue
+        w,h = size[n]
+        rot = (r.rotation or 0.0)
+        if abs(math.sin(rot)) > 0.5: w,h = h,w   # 90/270 swap
+        ox,oy = r.origin
+        X = (ox-x0)*sc; Y = hpx-(oy-y0)*sc - h*sc   # flip Y
+        cr.set_source_rgb(*vt_color(n))
+        cr.rectangle(X, Y, max(w*sc,0.4), max(h*sc,0.4)); cr.fill()
+    surf.write_to_png(png)
+    print("[gds_render] wrote %s (%dx%d)" % (png, wpx, hpx))
+
+# ---- zoom: real polygons in a central window, colored by layer ----
+def render_zoom(png, win_um, px=1600):
+    cx,cy = (x0+x1)/2.0, (y0+y1)/2.0
+    wx0,wy0,wx1,wy1 = cx-win_um/2, cy-win_um/2, cx+win_um/2, cy+win_um/2
+    sub = gdstk.Cell("__zoom__")
+    cnt = 0
+    for r in refs:
+        ox,oy = r.origin
+        if wx0-5 <= ox <= wx1+5 and wy0-5 <= oy <= wy1+5:
+            n = r.cell if isinstance(r.cell,str) else r.cell.name
+            if n in cellmap:
+                sub.add(gdstk.Reference(cellmap[n], origin=r.origin,
+                        rotation=r.rotation or 0, x_reflection=bool(r.x_reflection),
+                        magnification=MAG))
+                cnt += 1
+    sub.flatten()
+    polys = sub.get_polygons(depth=None)
+    # top-level routing (paths/wires live in the top cell, already in 4x space)
+    route = []
+    try:
+        for p in top.get_polygons(depth=0, include_paths=True):
+            (px0,py0),(px1,py1) = p.bounding_box()
+            if px1 >= wx0 and px0 <= wx1 and py1 >= wy0 and py0 <= wy1:
+                route.append(p)
+    except Exception as e:
+        print("[gds_render] top-level routing: %s" % e)
+    polys = polys + route
+    print("[gds_render] zoom window %.1fum: %d cells, %d cell-polys + %d routing-polys" % (win_um, cnt, len(polys)-len(route), len(route)))
+    sc = px/win_um
+    surf = cairo.ImageSurface(cairo.FORMAT_ARGB32, px, px)
+    cr = cairo.Context(surf)
+    cr.set_source_rgb(0.04,0.04,0.05); cr.paint()
+    # draw lower layers first (sort by layer)
+    polys.sort(key=lambda p: p.layer)
+    for p in polys:
+        pts = p.points
+        if len(pts) < 3: continue
+        cr.set_source_rgba(*layer_color(p.layer), 0.78)
+        cr.move_to((pts[0][0]-wx0)*sc, px-(pts[0][1]-wy0)*sc)
+        for (ux,uy) in pts[1:]:
+            cr.line_to((ux-wx0)*sc, px-(uy-wy0)*sc)
+        cr.close_path(); cr.fill()
+    surf.write_to_png(png)
+    print("[gds_render] wrote %s (%dx%d)" % (png, px, px))
+
+render_full(os.path.join(OUT, "gds_full.png"))
+render_zoom(os.path.join(OUT, "gds_zoom.png"), ZOOM_UM)
+print("[gds_render] DONE")
diff --git a/hw/syn/synopsys/powergrid.py b/hw/syn/synopsys/powergrid.py
new file mode 100644
index 0000000000..63f8f7edab
--- /dev/null
+++ b/hw/syn/synopsys/powergrid.py
@@ -0,0 +1,24 @@
+import sys, numpy as np, matplotlib; matplotlib.use('Agg'); import matplotlib.pyplot as plt
+csv, out = sys.argv[1], sys.argv[2]
+N = int(sys.argv[3]) if len(sys.argv) > 3 else 64
+title = sys.argv[4] if len(sys.argv) > 4 else ""
+bb=None; xs=[]; ys=[]; ps=[]
+for ln in open(csv):
+    if ln.startswith("#bbox"): bb=[float(v) for v in ln.split()[1:5]]; continue
+    if ln.startswith("x,") or not ln.strip(): continue
+    x,y,p,a = ln.split(","); xs.append(float(x)); ys.append(float(y)); ps.append(float(p))
+xs=np.array(xs); ys=np.array(ys); ps=np.array(ps)
+x0,y0,x1,y1 = bb if bb else [xs.min(),ys.min(),xs.max(),ys.max()]
+grid=np.zeros((N,N))
+ix=np.clip(((xs-x0)/(x1-x0)*N).astype(int),0,N-1)
+iy=np.clip(((ys-y0)/(y1-y0)*N).astype(int),0,N-1)
+np.add.at(grid,(iy,ix),ps)
+tile_area=((x1-x0)/N)*((y1-y0)/N)
+dens=grid/tile_area                       # power per unit area
+fig,ax=plt.subplots(figsize=(7,6))
+vmax=np.percentile(dens[dens>0],99) if (dens>0).any() else dens.max()
+im=ax.imshow(dens,vmin=0,vmax=vmax,origin='lower',extent=[x0,x1,y0,y1],cmap='jet',interpolation='bilinear',aspect='equal')
+cb=fig.colorbar(im,ax=ax,fraction=0.046,pad=0.04); cb.set_label('power density (pW/um^2, 4x); clipped @ 99th pct')
+ax.set_xlabel('x (um, 4x)'); ax.set_ylabel('y (um, 4x)')
+ax.set_title(title+f"  ({N}x{N} grid, sum P per tile)")
+fig.tight_layout(); fig.savefig(out,dpi=130); print("wrote",out,"grid sum=%.3g max_tile=%.3g"%(grid.sum(),dens.max()))
diff --git a/hw/syn/synopsys/project.tcl b/hw/syn/synopsys/project.tcl
index ae3d8cb930..01cf154f84 100755
--- a/hw/syn/synopsys/project.tcl
+++ b/hw/syn/synopsys/project.tcl
@@ -896,6 +896,7 @@ if {[sizeof_collection $unmapped_cells] > 0} {
 # ---------------- reports ----------------
 report_qor                                      > [file join $RPT_DIR "qor.rpt"]
 report_area -hier -nosplit                      > [file join $RPT_DIR "area.rpt"]
+report_timing -delay_type max -path_type summary -max_paths 50 > [file join $RPT_DIR "timing_summary.rpt"]
 report_timing -delay_type max -path_type full_clock -max_paths 50 -nets -transition_time -capacitance > [file join $RPT_DIR "timing_max.rpt"]
 report_timing -delay_type min -path_type full_clock -max_paths 50 -nets -transition_time -capacitance > [file join $RPT_DIR "timing_min.rpt"]
 report_clock -skew                              > [file join $RPT_DIR "clock_skew.rpt"]
diff --git a/hw/syn/synopsys/run_icc2.tcl b/hw/syn/synopsys/run_icc2.tcl
new file mode 100644
index 0000000000..fc2f4f6571
--- /dev/null
+++ b/hw/syn/synopsys/run_icc2.tcl
@@ -0,0 +1,287 @@
+## run_icc2.tcl
+## Fusion Compiler placement + layout images for a synthesized netlist.
+## Produces: placement, hierarchy-colored, cell-density heat map, and power-density
+## heat map PNGs. Power uses SAIF activity if provided, else vectorless.
+##
+## NOTE: icc2_shell is not installed on this host; fc_shell shares the same ICC2 P&R
+## engine / NDM database / physical libraries. Run headless via Xvfb:
+##     xvfb-run -a fc_shell -f run_icc2.tcl
+##
+## Driven entirely by environment variables (see the Makefile 'images' target):
+##   TOP            top module name                              (required)
+##   NETLIST        gate-level netlist                            (default out/$TOP.mapped.v)
+##   SDC            constraints                                   (default out/$TOP.post_compile.sdc)
+##   OUT_DIR        output directory for the .dlib + PNGs         (default images)
+##   SAIF_FILE      switching activity (optional -> vectorless)
+##   SAIF_INST      instance scope of SAIF in the netlist         (optional)
+##   PDK_TF         technology file (.tf)                         (required)
+##   PDK_LEFS       cell/tech LEF files (space separated)         (required)
+##   PDK_DBS        timing/power .db libs (space separated)       (required for power)
+##   PDK_SRAM_DBS   SRAM .db libs (space separated)               (optional)
+##   PDK_TLU        TLUPlus parasitic file                        (optional)
+##   PDK_SCALE      internal length precision / scale factor      (default 10000)
+##   UTIL           core utilization for the floorplan           (default 0.60)
+
+proc ev {name def} {
+  if {[info exists ::env($name)] && [string trim $::env($name)] ne ""} { return $::env($name) }
+  return $def
+}
+proc note {m} { puts "\[run_icc2\] $m" }
+
+set TOP        [ev TOP ""]
+if {$TOP eq ""} { error "run_icc2: TOP not set" }
+set NETLIST    [ev NETLIST  "out/${TOP}.mapped.v"]
+set SDC        [ev SDC      "out/${TOP}.post_compile.sdc"]
+set OUT_DIR    [ev OUT_DIR  "images"]
+set SAIF_FILE  [ev SAIF_FILE ""]
+set SAIF_INST  [ev SAIF_INST ""]
+set TF         [ev PDK_TF   ""]
+set LEFS       [ev PDK_LEFS ""]
+set DBS        [ev PDK_DBS  ""]
+set SRAM_DBS   [ev PDK_SRAM_DBS ""]
+set TLU        [ev PDK_TLU  ""]
+set SCALE      [ev PDK_SCALE 10000]
+set UTIL       [ev UTIL     0.60]
+set GDS_MODE   [ev GDS      0]
+## GDS_PLACE_OPT=1 (default) uses timing-driven place_opt+CTS; =0 uses create_placement only
+## (a safe fallback — place_opt can segfault optimizing large DesignWare datapath cells).
+set GDS_PLACE_OPT [ev GDS_PLACE_OPT 1]
+set route_done 0
+
+if {![file exists $NETLIST]} { error "run_icc2: netlist not found: $NETLIST (run synthesis first)" }
+if {$TF eq "" || $LEFS eq ""} { error "run_icc2: PDK_TF / PDK_LEFS not set (unsupported LIB_TYPE for layout?)" }
+file mkdir $OUT_DIR
+set_host_options -max_cores 8
+
+## ---------------- 1. Library (timing + physical) ----------------
+## The .db are 1x native and LEF/.tf may be Nx scaled; forcing -scale_factor avoids the
+## LIB-055 precision clash. NDM cell libs are cached under ./CLIBs after the first run.
+if {$DBS ne ""} { set_app_var link_library [concat "*" $DBS $SRAM_DBS] }
+set DLIB $OUT_DIR/${TOP}.dlib
+file delete -force $DLIB
+catch { set_app_options -name lib.setting.use_tech_scale_factor -value true }
+create_lib -technology $TF -ref_libs $LEFS -scale_factor $SCALE $DLIB
+
+## ---------------- 2. Netlist ----------------
+read_verilog -top $TOP $NETLIST
+link_block
+
+## ---------------- 3. Parasitics + constraints ----------------
+if {$TLU ne "" && [file exists $TLU]} {
+  set tlu_use $TLU
+  ## ASAP7 ships asap07.tluplus gzip-compressed; decompress to OUT_DIR if needed.
+  if {![catch {exec file -L $TLU} ft] && [string match -nocase "*gzip*" $ft]} {
+    set tlu_use $OUT_DIR/parasitic.tluplus
+    if {[catch {exec sh -c "gunzip -c [list $TLU] > [list $tlu_use]"} ge]} { note "WARN gunzip tlu: $ge"; set tlu_use $TLU }
+  }
+  if {[catch {read_parasitic_tech -tlup $tlu_use -name nominal} e]} { note "WARN parasitic: $e" } \
+  else { catch { set_parasitic_parameters -late_spec nominal -early_spec nominal } }
+}
+if {[file exists $SDC]} { if {[catch {read_sdc $SDC} e]} { note "WARN sdc: $e" } }
+
+## ---------------- 4. Floorplan + placement (+ optional route for GDS) ----------------
+initialize_floorplan -core_utilization $UTIL -core_offset 5
+if {[catch {connect_pg_net -automatic} e]} { note "WARN pg: $e" }
+if {[catch {place_pins -self} e]}          { note "WARN pins: $e" }
+
+if {$GDS_MODE == 1 && $DBS ne ""} {
+  ## ---- full place-and-route to produce a routed GDS ----
+  if {$GDS_PLACE_OPT == 1} {
+    note "GDS mode: place_opt -> CTS -> route"
+    if {[catch {place_opt} e]} { note "WARN place_opt: $e"; catch {create_placement}; catch {legalize_placement} }
+    if {[catch {clock_opt} e]} { note "WARN clock_opt: $e" }
+  } else {
+    note "GDS mode (safe): create_placement -> route (skip place_opt/CTS)"
+    if {[catch {create_placement} e]}   { note "WARN place: $e" }
+    if {[catch {legalize_placement} e]} { note "WARN legalize: $e" }
+  }
+  if {[catch {route_auto} e]} { note "WARN route_auto: $e" } else { set route_done 1; note "routing complete" }
+} else {
+  if {[catch {create_placement} e]}   { note "WARN place: $e" }
+  if {[catch {legalize_placement} e]} { note "WARN legalize: $e" }
+}
+catch { redirect $OUT_DIR/placement.rpt {report_placement} }
+catch { save_block }
+
+## ---------------- 5. Power (SAIF if given, else vectorless) ----------------
+set have_power 0
+if {$DBS ne ""} {
+  if {$SAIF_FILE ne "" && [file exists $SAIF_FILE]} {
+    note "power: SAIF activity from $SAIF_FILE"
+    if {$SAIF_INST ne ""} {
+      catch { read_saif $SAIF_FILE -strip_path $SAIF_INST }
+    } else {
+      catch { read_saif $SAIF_FILE }
+    }
+  } else {
+    note "power: no SAIF -> vectorless estimation"
+  }
+  if {[catch {redirect $OUT_DIR/power.rpt {report_power}} e]} { note "WARN power: $e" } else { set have_power 1 }
+} else {
+  note "power: PDK_DBS not set -> skipping power map"
+}
+
+## ---------------- 5b. GDS ----------------
+if {$GDS_MODE == 1} {
+  if {[catch {write_gds $OUT_DIR/${TOP}.gds} e]} {
+    note "WARN write_gds: $e"
+  } else {
+    note "wrote GDS ([expr {$route_done ? {routed} : {placed-only}}]): $OUT_DIR/${TOP}.gds"
+  }
+}
+
+## ---------------- 6. Layout images ----------------
+## FC's native GUI map modes (powerDensityMap/cellDensityMap/Hierarchy) do not compute
+## in a headless/batch session (no interactive event loop). We instead render each figure
+## by coloring leaf cells per bin with gui_change_highlight, which works in batch.
+gui_start
+set WIN [lindex [gui_get_window_ids] 0]
+if {$WIN eq ""} { set WIN BlockWindow.1 }
+set VIEW Layout.1
+gui_set_active_window -window $WIN
+catch { gui_zoom -window $WIN -full }
+
+set HEAT    {blue cyan green yellow orange red}            ;# low -> high
+set PALETTE {red green blue yellow cyan magenta orange white salmon}
+set LEAF [get_cells -hierarchical -filter "is_hierarchical==false"]
+note "leaf cells: [sizeof_collection $LEAF]"
+
+proc snap {name} {
+  global OUT_DIR WIN VIEW
+  catch { gui_zoom -window $WIN -full }
+  if {[catch {gui_write_window_image -window $VIEW -format png -file $OUT_DIR/${name}.png}]} {
+    catch { gui_write_window_image -window $WIN -format png -file $OUT_DIR/${name}.png }
+  }
+}
+proc clear_hl {} { global LEAF; catch { gui_change_highlight -remove -all_colors -collection $LEAF } }
+
+## plain placement (no coloring)
+clear_hl
+snap placement
+
+## A/C-style attribute heat map: QUANTILE bins (≈equal cells per color) by $attr.
+## Quantile (rank-based) edges instead of linear, so a few high-power outliers don't
+## collapse 99% of cells into the lowest bin — the spatial distribution stays legible.
+proc heat_attr {attr name} {
+  global LEAF HEAT OUT_DIR
+  set s     [sort_collection $LEAF $attr]
+  set names [get_object_name $s]
+  set n     [llength $names]
+  if {$n == 0} { note "$name: no leaf cells (skip)"; return 0 }
+  set vmax [get_attribute [index_collection $s [expr {$n-1}]] $attr]
+  if {$vmax eq "" || $vmax <= 0} { note "$name: no positive '$attr' (skip)"; return 0 }
+  set nb [llength $HEAT]
+  set lf [open $OUT_DIR/${name}_legend.txt w]
+  puts $lf "# ${name}.png  -  '$attr' per leaf cell, QUANTILE bins (~equal cells/color), low(blue) -> high(red)"
+  puts $lf "# color    ${attr}_range                       cells"
+  clear_hl
+  for {set i 0} {$i < $nb} {incr i} {
+    set lo [expr {$n*$i/$nb}]; set hi [expr {$n*($i+1)/$nb}]
+    if {$hi <= $lo} continue
+    set sub   [get_cells [lrange $names $lo [expr {$hi-1}]]]
+    set loval [get_attribute [index_collection $s $lo] $attr]
+    set hival [get_attribute [index_collection $s [expr {$hi-1}]] $attr]
+    set col   [lindex $HEAT $i]
+    note [format "  %s %-6s \[%.4g, %.4g]: %d cells" $name $col $loval $hival [expr {$hi-$lo}]]
+    puts $lf [format "%-8s \[%.4g, %.4g\]   %d" $col $loval $hival [expr {$hi-$lo}]]
+    catch { gui_change_highlight -add -color $col -collection $sub }
+  }
+  close $lf
+  snap $name
+  clear_hl
+  return 1
+}
+
+## B: color leaf cells by their hierarchical block (first 2 path levels; flat glue grouped)
+proc color_hier {name} {
+  global LEAF PALETTE OUT_DIR
+  array unset grp
+  foreach_in_collection c $LEAF {
+    set f [get_object_name $c]
+    set parts [split $f "/"]
+    if {[llength $parts] <= 1} { set g "_top_glue_" } \
+    elseif {[llength $parts] == 2} { set g [lindex $parts 0] } \
+    else { set g "[lindex $parts 0]/[lindex $parts 1]" }
+    lappend grp($g) $f
+  }
+  set lf [open $OUT_DIR/${name}_legend.txt w]
+  puts $lf "# ${name}.png  -  color -> hierarchical block (leaf cells)."
+  puts $lf "# Colors assigned by ALPHABETICAL block name from palette (cyclic; positional, not semantic):"
+  puts $lf "#   $PALETTE"
+  puts $lf "# color    block                                cells"
+  clear_hl
+  set i 0
+  foreach g [lsort [array names grp]] {
+    set col [lindex $PALETTE [expr {$i % [llength $PALETTE]}]]
+    note "  hier $col <- $g ([llength $grp($g)] cells)"
+    puts $lf [format "%-8s %-36s %d" $col $g [llength $grp($g)]]
+    catch { gui_change_highlight -add -color $col -collection [get_cells $grp($g)] }
+    incr i
+  }
+  close $lf
+  snap $name
+  clear_hl
+}
+
+## C: placement cell-density heat map. Grid the core by cell origin, color cells by the
+## area-occupancy of their grid bin.
+proc density_map {name {N 40}} {
+  global LEAF HEAT OUT_DIR
+  set bb ""
+  foreach a {boundary_bbox bbox} { if {$bb eq ""} { catch { set bb [get_attribute [current_block] $a] } } }
+  if {$bb eq ""} { note "$name: no block bbox (skip)"; return 0 }
+  set x0 [lindex [lindex $bb 0] 0]; set y0 [lindex [lindex $bb 0] 1]
+  set x1 [lindex [lindex $bb 1] 0]; set y1 [lindex [lindex $bb 1] 1]
+  set dx [expr {($x1-$x0)/double($N)}]; set dy [expr {($y1-$y0)/double($N)}]
+  if {$dx <= 0 || $dy <= 0} { note "$name: bad bbox (skip)"; return 0 }
+  array unset barea; array unset bcells
+  foreach_in_collection c $LEAF {
+    set o [get_attribute $c origin]; set a [get_attribute $c area]
+    if {$o eq "" || $a eq ""} continue
+    set bx [expr {int(([lindex $o 0]-$x0)/$dx)}]; set by [expr {int(([lindex $o 1]-$y0)/$dy)}]
+    if {$bx < 0} {set bx 0}; if {$bx >= $N} {set bx [expr {$N-1}]}
+    if {$by < 0} {set by 0}; if {$by >= $N} {set by [expr {$N-1}]}
+    set k "$bx.$by"
+    set barea($k) [expr {([info exists barea($k)] ? $barea($k) : 0)+$a}]
+    lappend bcells($k) [get_object_name $c]
+  }
+  set binA [expr {$dx*$dy}]; set dmax 0
+  foreach k [array names barea] { set d [expr {$barea($k)/$binA}]; if {$d > $dmax} {set dmax $d} }
+  if {$dmax <= 0} { note "$name: zero density (skip)"; return 0 }
+  set n [llength $HEAT]; array unset lvl
+  foreach k [array names barea] {
+    set li [expr {int(($barea($k)/$binA)/$dmax*$n)}]; if {$li >= $n} {set li [expr {$n-1}]}
+    foreach cn $bcells($k) { lappend lvl($li) $cn }
+  }
+  set lf [open $OUT_DIR/${name}_legend.txt w]
+  puts $lf "# ${name}.png  -  cell-area occupancy per ${N}x${N} grid bin (fraction of bin area)"
+  puts $lf "# gradient low(blue) -> high(red); dmax = [format %.3f $dmax]"
+  puts $lf "# color    occupancy_range"
+  clear_hl
+  for {set i 0} {$i < $n} {incr i} {
+    puts $lf [format "%-8s \[%.3f, %.3f)" [lindex $HEAT $i] [expr {$dmax*$i/$n}] [expr {$dmax*($i+1)/$n}]]
+    if {[info exists lvl($i)]} { catch { gui_change_highlight -add -color [lindex $HEAT $i] -collection [get_cells $lvl($i)] } }
+  }
+  close $lf
+  note "  density max = [format %.3f $dmax] (grid ${N}x${N})"
+  snap $name
+  clear_hl
+  return 1
+}
+
+## B: hierarchy-colored floorplan
+color_hier hierarchy
+
+## C: cell-density heat map
+density_map cell_density 40
+
+## A: power heat map (vectorless or SAIF); fall back to leakage if no dynamic
+if {$have_power} {
+  if {![heat_attr total_power power_density]} { heat_attr leakage_power power_density }
+} else {
+  note "skipping power_density image (no power data)"
+}
+
+gui_stop
+note "DONE. Images in $OUT_DIR/"
+exit
diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile
index 37653446cd..b3cda2c0f0 100644
--- a/hw/syn/xilinx/xrt/Makefile
+++ b/hw/syn/xilinx/xrt/Makefile
@@ -158,9 +158,15 @@ ifneq (,$(filter -DVX_CFG_EXT_OM_ENABLE, $(XCONFIGS)))
 	RTL_INCLUDE += -I$(RTL_DIR)/om
 endif
 
+# Kernel clock target frequency (MHz). The U55C platform default is 300;
+# lowered to 250 for timing-closure margin. Override on the command line
+# (e.g. KERNEL_FREQ=300) to retarget.
+KERNEL_FREQ ?= 250
+
 # Kernel compiler global settings
 VPP_FLAGS += --link --target $(TARGET) --platform $(PLATFORM) --save-temps --no_ip_cache
 VPP_FLAGS += --vivado.synth.jobs $(JOBS) --vivado.impl.jobs $(JOBS)
+VPP_FLAGS += --kernel_frequency $(KERNEL_FREQ)
 
 # register compilation hooks
 VPP_FLAGS += --xp "vivado_prop:run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${SRC_DIR}/pre_opt_hook.tcl"
diff --git a/hw/unittest/cache/VX_cache_top.sv b/hw/unittest/cache/VX_cache_top.sv
index e4854187d2..dc51ae59c7 100644
--- a/hw/unittest/cache/VX_cache_top.sv
+++ b/hw/unittest/cache/VX_cache_top.sv
@@ -57,6 +57,9 @@ module VX_cache_top import VX_gpu_pkg::*; #(
     // Core response output buffer
     parameter CORE_OUT_BUF          = 3,
 
+    // Enable AMO support (tracks the A extension by default)
+    parameter AMO_ENABLE            = `VX_CFG_EXT_A_ENABLED,
+
     // Memory request output buffer
     parameter MEM_OUT_BUF           = 3,
 
@@ -167,6 +170,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
         .WRITE_ENABLE   (WRITE_ENABLE),
         .WRITEBACK      (WRITEBACK),
         .DIRTY_BYTES    (DIRTY_BYTES),
+        .AMO_ENABLE     (AMO_ENABLE),
         .CORE_OUT_BUF   (CORE_OUT_BUF),
         .MEM_OUT_BUF    (MEM_OUT_BUF)
     ) cache (
diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp
index bf825e30dc..0996a2734c 100644
--- a/sim/opaesim/opae_sim.cpp
+++ b/sim/opaesim/opae_sim.cpp
@@ -38,6 +38,8 @@
 #include <vortex_opae.h>
 
 #include <future>
+#include <thread>
+#include <atomic>
 #include <list>
 #include <queue>
 #include <unordered_map>
@@ -174,8 +176,19 @@ class opae_sim::Impl {
     // launch execution thread
     future_ = std::async(std::launch::async, [&]{
       while (!stop_) {
-        std::lock_guard<std::mutex> guard(mutex_);
-        this->tick();
+        // Give host-side MMIO/mem calls absolute priority: while any host op
+        // is pending, fully back off (don't even contend for mutex_). Without
+        // this the free-running ticker re-acquires the lock in a tight loop and
+        // starves the host thread, hanging the run (it stalled at device init).
+        if (host_waiters_.load(std::memory_order_acquire) != 0) {
+          std::this_thread::yield();
+          continue;
+        }
+        {
+          std::lock_guard<std::mutex> guard(mutex_);
+          this->tick();
+        }
+        std::this_thread::yield();
       }
     });
 
@@ -221,7 +234,7 @@ class opae_sim::Impl {
   }
 
   void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
 
     // simulate CPU-GPU latency
     for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) {
@@ -248,7 +261,7 @@ class opae_sim::Impl {
   }
 
   void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
 
     // simulate CPU-GPU latency
     for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) {
@@ -266,8 +279,8 @@ class opae_sim::Impl {
   }
 
   void copy(uint64_t dest, uint64_t src, uint64_t size) {
-    
-    std::lock_guard<std::mutex> guard(mutex_);
+
+    HostLock guard(*this);
 
     ram_->copy(dest, src, size);
   }
@@ -546,6 +559,24 @@ class opae_sim::Impl {
   std::list<cci_wr_req_t> cci_writes_;
 
   std::mutex mutex_;
+  // Count of host threads waiting on / holding mutex_. The free-running sim
+  // ticker backs off whenever this is non-zero so host MMIO/mem ops are never
+  // starved by the background ticker.
+  std::atomic<int> host_waiters_{0};
+
+  // RAII guard for every host-side entry point: registers intent (so the
+  // ticker yields) before acquiring mutex_, and clears it after releasing.
+  struct HostLock {
+    opae_sim::Impl& impl_;
+    explicit HostLock(opae_sim::Impl& impl) : impl_(impl) {
+      impl_.host_waiters_.fetch_add(1, std::memory_order_acquire);
+      impl_.mutex_.lock();
+    }
+    ~HostLock() {
+      impl_.mutex_.unlock();
+      impl_.host_waiters_.fetch_sub(1, std::memory_order_release);
+    }
+  };
 
   std::queue<mem_req_t*> dram_queue_;
 
diff --git a/sim/simx/csr_unit.cpp b/sim/simx/csr_unit.cpp
index f49813c844..b39f679842 100644
--- a/sim/simx/csr_unit.cpp
+++ b/sim/simx/csr_unit.cpp
@@ -173,8 +173,6 @@ Word CsrUnit::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) {
       #ifdef VX_CFG_EXT_TCU_ENABLE
         CSR_READ_64(VX_CSR_MPM_INSTR_TCU, core_perf.tcu_instrs);
       #endif
-        CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads);
-        CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes);
         CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches);
         CSR_READ_64(VX_CSR_MPM_IFETCH_LT, core_perf.ifetch_latency);
         CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads);
@@ -182,52 +180,64 @@ Word CsrUnit::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) {
         CSR_READ_64(VX_CSR_MPM_LOAD_LT, core_perf.load_latency);
         }
       } break;
-      case VX_DCR_MPM_CLASS_MEM: {
-        auto cluster_perf = core_->socket()->cluster()->perf_stats();
+      case VX_DCR_MPM_CLASS_ICACHE: {
         auto socket_perf = core_->socket()->perf_stats();
-        auto lmem_perf = core_->local_mem()->perf_stats();
-
-        uint64_t coalescer_misses = 0;
-        for (uint i = 0; i < VX_CFG_NUM_LSU_BLOCKS; ++i) {
-          coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
-        }
-
         switch (addr) {
         CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
         CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);
         CSR_READ_64(VX_CSR_MPM_ICACHE_MSHR_ST, socket_perf.icache.mshr_stalls);
-
+        }
+      } break;
+      case VX_DCR_MPM_CLASS_DCACHE: {
+        auto socket_perf = core_->socket()->perf_stats();
+        switch (addr) {
         CSR_READ_64(VX_CSR_MPM_DCACHE_READS, socket_perf.dcache.reads);
         CSR_READ_64(VX_CSR_MPM_DCACHE_WRITES, socket_perf.dcache.writes);
         CSR_READ_64(VX_CSR_MPM_DCACHE_MISS_R, socket_perf.dcache.read_misses);
         CSR_READ_64(VX_CSR_MPM_DCACHE_MISS_W, socket_perf.dcache.write_misses);
+        CSR_READ_64(VX_CSR_MPM_DCACHE_EVICTS, socket_perf.dcache.evictions);
         CSR_READ_64(VX_CSR_MPM_DCACHE_BANK_ST, socket_perf.dcache.bank_stalls);
         CSR_READ_64(VX_CSR_MPM_DCACHE_MSHR_ST, socket_perf.dcache.mshr_stalls);
-
+        }
+      } break;
+      case VX_DCR_MPM_CLASS_L2CACHE: {
+        auto cluster_perf = core_->socket()->cluster()->perf_stats();
+        switch (addr) {
         CSR_READ_64(VX_CSR_MPM_L2CACHE_READS, cluster_perf.l2cache.reads);
         CSR_READ_64(VX_CSR_MPM_L2CACHE_WRITES, cluster_perf.l2cache.writes);
         CSR_READ_64(VX_CSR_MPM_L2CACHE_MISS_R, cluster_perf.l2cache.read_misses);
         CSR_READ_64(VX_CSR_MPM_L2CACHE_MISS_W, cluster_perf.l2cache.write_misses);
+        CSR_READ_64(VX_CSR_MPM_L2CACHE_EVICTS, cluster_perf.l2cache.evictions);
         CSR_READ_64(VX_CSR_MPM_L2CACHE_BANK_ST, cluster_perf.l2cache.bank_stalls);
         CSR_READ_64(VX_CSR_MPM_L2CACHE_MSHR_ST, cluster_perf.l2cache.mshr_stalls);
-
+        }
+      } break;
+      case VX_DCR_MPM_CLASS_L3CACHE: {
+        switch (addr) {
         CSR_READ_64(VX_CSR_MPM_L3CACHE_READS, proc_perf.l3cache.reads);
         CSR_READ_64(VX_CSR_MPM_L3CACHE_WRITES, proc_perf.l3cache.writes);
         CSR_READ_64(VX_CSR_MPM_L3CACHE_MISS_R, proc_perf.l3cache.read_misses);
         CSR_READ_64(VX_CSR_MPM_L3CACHE_MISS_W, proc_perf.l3cache.write_misses);
+        CSR_READ_64(VX_CSR_MPM_L3CACHE_EVICTS, proc_perf.l3cache.evictions);
         CSR_READ_64(VX_CSR_MPM_L3CACHE_BANK_ST, proc_perf.l3cache.bank_stalls);
         CSR_READ_64(VX_CSR_MPM_L3CACHE_MSHR_ST, proc_perf.l3cache.mshr_stalls);
-
+        }
+      } break;
+      case VX_DCR_MPM_CLASS_MEM: {
+        auto lmem_perf = core_->local_mem()->perf_stats();
+        uint64_t coalescer_misses = 0;
+        for (uint i = 0; i < VX_CFG_NUM_LSU_BLOCKS; ++i) {
+          coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
+        }
+        switch (addr) {
         CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads);
         CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes);
         CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency);
         CSR_READ_64(VX_CSR_MPM_MEM_BANK_ST, proc_perf.memsim.bank_stalls);
-
-        CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses);
-
         CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads);
         CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes);
         CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
+        CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses);
         }
       } break;
     #ifdef VX_CFG_EXT_TCU_ENABLE
diff --git a/sim/simx/mem/cache.cpp b/sim/simx/mem/cache.cpp
index e4c98cf93b..112f08a505 100644
--- a/sim/simx/mem/cache.cpp
+++ b/sim/simx/mem/cache.cpp
@@ -112,11 +112,13 @@ struct line_t {
   uint32_t lru_ctr;
   bool valid;
   bool dirty;
+  uint64_t dirty_mask;                 // per-byte dirty bits (DIRTY_BYTES model)
   std::shared_ptr<mem_block_t> data;  // line bytes
 
   void reset() {
     valid = false;
     dirty = false;
+    dirty_mask = 0;
     lru_ctr = 0;
     data.reset();
   }
@@ -423,7 +425,14 @@ class MSHR {
   mshr_entry_t &replay(uint32_t id) {
     auto &root_entry = entries_.at(id);
     assert(root_entry.bank_req.type == bank_req_t::Core);
-    assert(ready_reqs_ == 0);
+    // A prior fill's replay batch may still be draining: a stalled fill
+    // (writeback egress backpressure) can sit in the pipe while a second
+    // fill is admitted behind it, so two fills' replays can be live at once.
+    // This is safe — the MSHR coalesces misses, so the two fills are for
+    // distinct lines and this replay marks only its own line's waiters; the
+    // accumulated ready_reqs_ stays correct and dequeue preserves per-line
+    // read-before-write ordering. Double-replaying the same fill is caught by
+    // the Core-type assert above.
     for (auto &entry : entries_) {
       if (entry.bank_req.type == bank_req_t::Core && entry.set_id == root_entry.set_id && entry.addr_tag == root_entry.addr_tag) {
         entry.bank_req.type = bank_req_t::Replay;
@@ -768,6 +777,7 @@ class CacheBank : public SimObject<CacheBank> {
       line_merge(hit_line, store_block, byteen);
       if (config_.write_back) {
         hit_line.dirty = true;
+        hit_line.dirty_mask |= byteen;
       } else {
         // Write-through: emit a write of the merged word downstream.
         MemReq w;
@@ -848,7 +858,7 @@ class CacheBank : public SimObject<CacheBank> {
         wb.hart_id    = bank_req.hart_id;
         wb.uuid   = bank_req.uuid;
         wb.data   = line.data;
-        wb.byteen = ~uint64_t(0) >> (64 - VX_CFG_MEM_BLOCK_SIZE);
+        wb.byteen = line.dirty_mask;
         this->mem_req_out.send(wb);
         DT(3, this->name() << " amo-probe-wb: " << wb);
         ++perf_stats_.evictions;
@@ -857,6 +867,7 @@ class CacheBank : public SimObject<CacheBank> {
         auto &line = set.lines.at(hit_id);
         line.valid = false;
         line.dirty = false;
+        line.dirty_mask = 0;
       }
 
       // Forward AMO downstream. Tag is rewritten so the response
@@ -911,7 +922,7 @@ class CacheBank : public SimObject<CacheBank> {
         wb.hart_id    = bank_req.hart_id;
         wb.uuid   = bank_req.uuid;
         wb.data   = victim_line.data;
-        wb.byteen = ~uint64_t(0) >> (64 - VX_CFG_MEM_BLOCK_SIZE);
+        wb.byteen = victim_line.dirty_mask;
         this->mem_req_out.send(wb);
         DT(3, this->name() << " writeback: " << wb);
         ++perf_stats_.evictions;
@@ -920,6 +931,7 @@ class CacheBank : public SimObject<CacheBank> {
       victim_line.tag     = addr_tag;
       victim_line.lru_ctr = 0;
       victim_line.dirty   = false;
+      victim_line.dirty_mask = 0;
       victim_line.data    = bank_req.data;
       mshr_.replay(bank_req.mshr_id);
       pipe_req_->pop();
@@ -994,8 +1006,10 @@ class CacheBank : public SimObject<CacheBank> {
           assert(bank_req.skip_core_rsp && "WT replay without pre-sent store");
         }
         line_merge(hit_line, bank_req.data, bank_req.byteen);
-        if (config_.write_back)
+        if (config_.write_back) {
           hit_line.dirty = true;
+          hit_line.dirty_mask |= bank_req.byteen;
+        }
 #if VX_CFG_EXT_A_ENABLED
         // Write-back write-miss replay reaching the LLC tag array:
         // break other harts' reservations. For the WT wt-merge replay,
@@ -1055,6 +1069,7 @@ class CacheBank : public SimObject<CacheBank> {
           line_merge(hit_line, bank_req.data, bank_req.byteen);
           if (config_.write_back) {
             hit_line.dirty = true;
+            hit_line.dirty_mask |= bank_req.byteen;
           } else {
             MemReq w;
             w.addr   = params_.mem_addr(bank_id_, set_id, addr_tag);
@@ -1198,11 +1213,12 @@ class CacheBank : public SimObject<CacheBank> {
           mem_req.addr = params_.mem_addr(bank_id_, flush_set_idx_, line.tag);
           mem_req.op   = MemOp::ST;
           mem_req.data = line.data;
-          mem_req.byteen = ~uint64_t(0) >> (64 - VX_CFG_MEM_BLOCK_SIZE);
+          mem_req.byteen = line.dirty_mask;
           this->mem_req_out.send(mem_req);
           DT(3, this->name() << " flush-wb: " << mem_req);
           ++perf_stats_.evictions;
           line.dirty = false;
+          line.dirty_mask = 0;
         }
         ++flush_way_idx_;
       }
diff --git a/sim/simx/mem/mmu_tlb.h b/sim/simx/mem/mmu_tlb.h
index ba9aca8a82..f751a7cdd6 100644
--- a/sim/simx/mem/mmu_tlb.h
+++ b/sim/simx/mem/mmu_tlb.h
@@ -17,7 +17,7 @@
 namespace vortex {
 
 // Per-core TLB. Small fully-associative CAM of {vpn → ppn} translations
-// with MRU-style eviction. Tracks MMU perf counters (VX_DCR_MPM_CLASS_VM).
+// with MRU-style eviction. Tracks MMU perf counters (VX_DCR_MPM_CLASS_MEM).
 class Tlb {
 public:
   explicit Tlb(uint32_t size = VX_CFG_TLB_SIZE);
diff --git a/sim/xrtsim/xrt_sim.cpp b/sim/xrtsim/xrt_sim.cpp
index 4b48214e9b..21b9bff275 100644
--- a/sim/xrtsim/xrt_sim.cpp
+++ b/sim/xrtsim/xrt_sim.cpp
@@ -36,6 +36,8 @@
 
 #include <VX_config.h>
 #include <future>
+#include <thread>
+#include <atomic>
 #include <list>
 #include <queue>
 #include <unordered_map>
@@ -262,8 +264,20 @@ class xrt_sim::Impl {
     // launch execution thread
     future_ = std::async(std::launch::async, [&]{
       while (!stop_) {
-        std::lock_guard<std::mutex> guard(mutex_);
-        this->tick();
+        // Give host-side MMIO/mem calls absolute priority: while any host op
+        // is pending, fully back off (don't even contend for mutex_). Without
+        // this the free-running ticker re-acquires the lock in a tight loop and
+        // starves the host thread — it could not complete cp_init/cp_submit and
+        // the sim ticked forever (dumping VCD unboundedly), hanging the run.
+        if (host_waiters_.load(std::memory_order_acquire) != 0) {
+          std::this_thread::yield();
+          continue;
+        }
+        {
+          std::lock_guard<std::mutex> guard(mutex_);
+          this->tick();
+        }
+        std::this_thread::yield();
       }
     });
 
@@ -283,7 +297,7 @@ class xrt_sim::Impl {
   }
 
   int mem_write(uint32_t bank_id, uint64_t addr, uint64_t size, const void* data) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
 
     if (bank_id >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS)
       return -1;
@@ -293,7 +307,7 @@ class xrt_sim::Impl {
   }
 
   int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
 
     if (bank_id >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS)
       return -1;
@@ -303,7 +317,7 @@ class xrt_sim::Impl {
   }
 
   int mem_copy(uint32_t bank_id_dest , uint32_t bank_id_src, uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
     if( bank_id_dest >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS || bank_id_src >= VX_CFG_PLATFORM_MEMORY_NUM_BANKS)
       return -1;
     uint64_t dest_base_addr = bank_id_dest * mem_bank_size_ + dest_addr;
@@ -315,29 +329,29 @@ class xrt_sim::Impl {
   // ----- Host memory (XRT host-only BOs; reached by m_axi_host) -----
 
   int host_mem_alloc(uint64_t size, uint64_t* addr) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
     return host_alloc_->allocate(size, addr);
   }
 
   int host_mem_free(uint64_t addr) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
     return host_alloc_->release(addr);
   }
 
   int host_mem_write(uint64_t addr, uint64_t size, const void* data) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
     host_ram_->write(data, addr, size);
     return 0;
   }
 
   int host_mem_read(uint64_t addr, uint64_t size, void* data) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
     host_ram_->read(data, addr, size);
     return 0;
   }
 
   int register_write(uint32_t offset, uint32_t value) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
 
     // write address
     device_->s_axi_ctrl_awvalid = 1;
@@ -369,7 +383,7 @@ class xrt_sim::Impl {
   }
 
   int register_read(uint32_t offset, uint32_t* value) {
-    std::lock_guard<std::mutex> guard(mutex_);
+    HostLock guard(*this);
     // read address
     device_->s_axi_ctrl_arvalid = 1;
     device_->s_axi_ctrl_araddr = offset;
@@ -798,6 +812,24 @@ class xrt_sim::Impl {
   bool stop_;
 
   std::mutex mutex_;
+  // Count of host threads waiting on / holding mutex_. The free-running sim
+  // ticker backs off whenever this is non-zero so host MMIO/mem ops are never
+  // starved by the background ticker.
+  std::atomic<int> host_waiters_{0};
+
+  // RAII guard for every host-side entry point: registers intent (so the
+  // ticker yields) before acquiring mutex_, and clears it after releasing.
+  struct HostLock {
+    Impl& impl_;
+    explicit HostLock(Impl& impl) : impl_(impl) {
+      impl_.host_waiters_.fetch_add(1, std::memory_order_acquire);
+      impl_.mutex_.lock();
+    }
+    ~HostLock() {
+      impl_.mutex_.unlock();
+      impl_.host_waiters_.fetch_sub(1, std::memory_order_release);
+    }
+  };
 
   std::list<mem_req_t*> pending_mem_reqs_[VX_CFG_PLATFORM_MEMORY_NUM_BANKS];
 
diff --git a/sw/runtime/common/legacy_perf.cpp b/sw/runtime/common/legacy_perf.cpp
index b3d49630c0..b002a6c318 100644
--- a/sw/runtime/common/legacy_perf.cpp
+++ b/sw/runtime/common/legacy_perf.cpp
@@ -141,15 +141,6 @@ struct CoreCounters {
   // memory
   uint64_t mem_reads = 0;
   uint64_t mem_writes = 0;
-
-  // VM (icache + dcache MMU summed in hardware) — fields always present;
-  // populated only when the device reports VM (VX_CAPS_VM_SUPPORT).
-  uint64_t tlb_reads = 0;
-  uint64_t tlb_hits = 0;
-  uint64_t tlb_misses = 0;
-  uint64_t tlb_evicts = 0;
-  uint64_t ptw_walks = 0;
-  uint64_t ptw_latency = 0;
 };
 
 struct CacheCounters {
@@ -288,17 +279,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) {
       CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LOAD_LT, core_id, &c.load_lt), { return err; });
       CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_STORES, core_id, &c.stores), { return err; });
 
-      if (vm_enabled) {
-        // VM/MMU lives in its own perf class (CLASS_VM), independent of CORE/MEM.
-        // Hardware sums icache + dcache MMU counters into one bank.
-        CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_READS,   core_id, &c.tlb_reads),   { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_HITS,    core_id, &c.tlb_hits),    { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_MISSES,  core_id, &c.tlb_misses),  { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_TLB_EVICTS,  core_id, &c.tlb_evicts),  { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_PTW_WALKS,   core_id, &c.ptw_walks),   { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, VX_DCR_MPM_CLASS_VM, VX_CSR_MPM_PTW_LATENCY, core_id, &c.ptw_latency), { return err; });
-      }
-
       if (num_cores > 1) {
         // Per-Core report
         const uint64_t cycles = c.cycles;
@@ -349,13 +329,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) {
         perf_print_core(stream, core_id, "memory: ifetch_lat=%.2f, load_lat=%.2f, loads=%" PRIu64 ", stores=%" PRIu64,
                         ifetch_avg_lt, load_avg_lt, c.loads, c.stores);
 
-        if (vm_enabled) {
-          const int tlb_hit_pct = calc_percent(c.tlb_hits, c.tlb_reads);
-          const double ptw_avg_lt = safe_div((double)c.ptw_latency, (double)c.ptw_walks);
-          perf_print_core(stream, core_id, "vm: tlb_reads=%" PRIu64 ", hit=%d%%, evicts=%" PRIu64 ", ptw_walks=%" PRIu64 ", ptw_avg_lat=%.2f",
-                          c.tlb_reads, tlb_hit_pct, c.tlb_evicts, c.ptw_walks, ptw_avg_lt);
-        }
-
         perf_print_core(stream, core_id, "instrs=%" PRIu64 ", cycles=%" PRIu64 ", IPC=%.3f", c.instrs, c.cycles, ipc);
       }
 
@@ -392,21 +365,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) {
       tot.loads += c.loads;
       tot.load_lt += c.load_lt;
       tot.stores += c.stores;
-
-      if (vm_enabled) {
-        tot.tlb_reads += c.tlb_reads;
-        tot.tlb_hits += c.tlb_hits;
-        tot.tlb_misses += c.tlb_misses;
-        tot.tlb_evicts += c.tlb_evicts;
-        tot.ptw_walks += c.ptw_walks;
-        tot.ptw_latency += c.ptw_latency;
-      }
     }
 
-    // Query global MPM counters
-    CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_READS, 0, &tot.mem_reads), { return err; });
-    CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_WRITES, 0, &tot.mem_writes), { return err; });
-
     // Core Summary
     uint64_t tot_cycles_wide = tot.cycles * issue_width;
 
@@ -451,138 +411,131 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE *stream) {
     // Memory report
     const double tot_ifetch_avg_lt = safe_div((double)tot.ifetch_lt, (double)tot.ifetches);
     const double tot_load_avg_lt = safe_div((double)tot.load_lt, (double)tot.loads);
-    uint64_t read_bytes = tot.mem_reads * CACHE_BLOCK_SIZE;
-    uint64_t write_bytes = tot.mem_writes * CACHE_BLOCK_SIZE;
-    perf_print(stream, "memory: ifetch_lat=%.2f, load_lat=%.2f, loads=%" PRIu64 ", stores=%" PRIu64 ", read_bytes=%" PRIu64 ", write_bytes=%" PRIu64,
-               tot_ifetch_avg_lt, tot_load_avg_lt, tot.loads, tot.stores, read_bytes, write_bytes);
-
-    if (vm_enabled) {
-      const int tot_tlb_hit_pct = calc_percent(tot.tlb_hits, tot.tlb_reads);
-      const double tot_ptw_avg_lt = safe_div((double)tot.ptw_latency, (double)tot.ptw_walks);
-      perf_print(stream, "vm: tlb_reads=%" PRIu64 ", hit=%d%%, evicts=%" PRIu64 ", ptw_walks=%" PRIu64 ", ptw_avg_lat=%.2f",
-                 tot.tlb_reads, tot_tlb_hit_pct, tot.tlb_evicts, tot.ptw_walks, tot_ptw_avg_lt);
-    }
+    perf_print(stream, "memory: ifetch_lat=%.2f, load_lat=%.2f, loads=%" PRIu64 ", stores=%" PRIu64,
+               tot_ifetch_avg_lt, tot_load_avg_lt, tot.loads, tot.stores);
   } break;
 
-  case VX_DCR_MPM_CLASS_MEM: {
-    const uint64_t num_sockets = (num_cores + socket_size - 1) / socket_size;
-    const uint64_t cores_per_cluster = (num_cores + num_clusters - 1) / num_clusters;
-
-    CacheCounters icache_tot, dcache_tot, l2_tot, l3_tot;
-    CacheCounters lmem_tot;
-    uint64_t coalescer_miss_tot = 0;
-
-    // Per-Core Local Memory & Coalescer (Print Core First)
-    for (uint32_t core_id = 0; core_id < num_cores; ++core_id) {
-      if (lmem_en) {
-        uint64_t r = 0, w = 0, bst = 0;
-        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_READS, core_id, &r), { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_WRITES, core_id, &w), { return err; });
-        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_BANK_ST, core_id, &bst), { return err; });
-
-        lmem_tot.reads += r;
-        lmem_tot.writes += w;
-        lmem_tot.bank_st += bst;
-        perf_print_core(stream, core_id, "lmem: reqs=%" PRIu64 ", bank_stalls=%" PRIu64 " (utility=%d%%)",
-                        r + w, bst, calc_utility(r + w, bst));
-      }
-
-      if (dcache_en) {
-        uint64_t cm = 0;
-        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_COALESCER_MISS, core_id, &cm), { return err; });
-        coalescer_miss_tot += cm;
-        perf_print_core(stream, core_id, "coalescer: misses=%" PRIu64, cm);
-      }
-    }
-
-    // Per-Socket L1
-    for (uint32_t s = 0; s < num_sockets; ++s) {
-      uint32_t rep_core = s * (uint32_t)socket_size;
-      if (rep_core >= num_cores)
-        continue;
-
-      if (icache_en) {
+  // Each cache level is now its own MPM class; --perf selects which to dump.
+  case VX_DCR_MPM_CLASS_ICACHE: {
+    if (icache_en) {
+      const uint64_t num_sockets = (num_cores + socket_size - 1) / socket_size;
+      for (uint32_t s = 0; s < num_sockets; ++s) {
+        uint32_t rep_core = s * (uint32_t)socket_size;
+        if (rep_core >= num_cores)
+          continue;
         uint64_t r = 0, m = 0, st = 0;
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_ICACHE_READS, rep_core, &r), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_ICACHE_MISS_R, rep_core, &m), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_ICACHE_MSHR_ST, rep_core, &st), { return err; });
-
-        icache_tot.reads += r;
-        icache_tot.miss_r += m;
-        icache_tot.mshr_st += st;
-
         perf_print_core(stream, rep_core, "icache: reads=%" PRIu64 ", miss=%" PRIu64 " (hit=%d%%), mshr_st=%" PRIu64 " (utility=%d%%)",
                         r, m, calc_ratio(m, r), st, calc_utility(m, st));
       }
-      if (dcache_en) {
-        uint64_t r = 0, w = 0, mr = 0, mw = 0, bst = 0, mst = 0;
+    } else {
+      perf_print(stream, "icache: disabled");
+    }
+  } break;
+
+  case VX_DCR_MPM_CLASS_DCACHE: {
+    if (dcache_en) {
+      const uint64_t num_sockets = (num_cores + socket_size - 1) / socket_size;
+      for (uint32_t s = 0; s < num_sockets; ++s) {
+        uint32_t rep_core = s * (uint32_t)socket_size;
+        if (rep_core >= num_cores)
+          continue;
+        uint64_t r = 0, w = 0, mr = 0, mw = 0, ev = 0, bst = 0, mst = 0;
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_READS, rep_core, &r), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_WRITES, rep_core, &w), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_MISS_R, rep_core, &mr), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_MISS_W, rep_core, &mw), { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_EVICTS, rep_core, &ev), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_BANK_ST, rep_core, &bst), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_DCACHE_MSHR_ST, rep_core, &mst), { return err; });
-
-        dcache_tot.reads += r;
-        dcache_tot.writes += w;
-        dcache_tot.miss_r += mr;
-        dcache_tot.miss_w += mw;
-        dcache_tot.bank_st += bst;
-        dcache_tot.mshr_st += mst;
-
-        perf_print_core(stream, rep_core, "dcache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), bank_st=%" PRIu64 " (utility=%d%%)",
-                        r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), bst, calc_utility(r + w, bst));
+        perf_print_core(stream, rep_core, "dcache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), evicts=%" PRIu64 ", bank_st=%" PRIu64 " (utility=%d%%)",
+                        r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), ev, bst, calc_utility(r + w, bst));
       }
+    } else {
+      perf_print(stream, "dcache: disabled");
     }
+  } break;
 
-    // Per-Cluster L2
+  case VX_DCR_MPM_CLASS_L2CACHE: {
     if (l2_en) {
+      const uint64_t cores_per_cluster = (num_cores + num_clusters - 1) / num_clusters;
       for (uint32_t c = 0; c < num_clusters; ++c) {
         uint32_t rep_core = c * (uint32_t)cores_per_cluster;
         if (rep_core >= num_cores)
           continue;
-        uint64_t r = 0, w = 0, mr = 0, mw = 0, bst = 0, mst = 0;
+        uint64_t r = 0, w = 0, mr = 0, mw = 0, ev = 0, bst = 0, mst = 0;
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_READS, rep_core, &r), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_WRITES, rep_core, &w), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_MISS_R, rep_core, &mr), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_MISS_W, rep_core, &mw), { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_EVICTS, rep_core, &ev), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_BANK_ST, rep_core, &bst), { return err; });
         CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L2CACHE_MSHR_ST, rep_core, &mst), { return err; });
-
-        l2_tot.reads += r;
-        l2_tot.writes += w;
-        l2_tot.miss_r += mr;
-        l2_tot.miss_w += mw;
-        l2_tot.bank_st += bst;
-        l2_tot.mshr_st += mst;
-
-        perf_print_core(stream, rep_core, "l2cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%)",
-                        r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w));
+        perf_print_core(stream, rep_core, "l2cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), evicts=%" PRIu64,
+                        r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), ev);
       }
+    } else {
+      perf_print(stream, "l2cache: disabled");
     }
+  } break;
 
-    // Global L3
+  case VX_DCR_MPM_CLASS_L3CACHE: {
     if (l3_en) {
-      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_READS, 0, &l3_tot.reads), { return err; });
-      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_WRITES, 0, &l3_tot.writes), { return err; });
-      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_R, 0, &l3_tot.miss_r), { return err; });
-      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_W, 0, &l3_tot.miss_w), { return err; });
-      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_BANK_ST, 0, &l3_tot.bank_st), { return err; });
-      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MSHR_ST, 0, &l3_tot.mshr_st), { return err; });
-
-      perf_print(stream, "l3cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%)",
-                 l3_tot.reads + l3_tot.writes, l3_tot.miss_r, calc_ratio(l3_tot.miss_r, l3_tot.reads),
-                 l3_tot.miss_w, calc_ratio(l3_tot.miss_w, l3_tot.writes));
+      uint64_t r = 0, w = 0, mr = 0, mw = 0, ev = 0, bst = 0, mst = 0;
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_READS, 0, &r), { return err; });
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_WRITES, 0, &w), { return err; });
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_R, 0, &mr), { return err; });
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MISS_W, 0, &mw), { return err; });
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_EVICTS, 0, &ev), { return err; });
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_BANK_ST, 0, &bst), { return err; });
+      CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_L3CACHE_MSHR_ST, 0, &mst), { return err; });
+      perf_print(stream, "l3cache: reqs=%" PRIu64 ", miss_r=%" PRIu64 " (hit=%d%%), miss_w=%" PRIu64 " (hit=%d%%), evicts=%" PRIu64,
+                 r + w, mr, calc_ratio(mr, r), mw, calc_ratio(mw, w), ev);
+    } else {
+      perf_print(stream, "l3cache: disabled");
     }
+  } break;
 
-    // Global DRAM
+  case VX_DCR_MPM_CLASS_MEM: {
+    // Per-core local memory + coalescer + VM/MMU.
+    for (uint32_t core_id = 0; core_id < num_cores; ++core_id) {
+      if (lmem_en) {
+        uint64_t r = 0, w = 0, bst = 0;
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_READS, core_id, &r), { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_WRITES, core_id, &w), { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_LMEM_BANK_ST, core_id, &bst), { return err; });
+        perf_print_core(stream, core_id, "lmem: reqs=%" PRIu64 ", bank_stalls=%" PRIu64 " (utility=%d%%)",
+                        r + w, bst, calc_utility(r + w, bst));
+      }
+      // The coalescer lives on the LSU->memory path; it exists when
+      // NUM_LSU_LANES > 1 && LSU_LINE_SIZE > XLENB, independent of the dcache.
+      // The counter is hard-zeroed in hardware when no coalescer is present.
+      {
+        uint64_t cm = 0;
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_COALESCER_MISS, core_id, &cm), { return err; });
+        perf_print_core(stream, core_id, "coalescer: misses=%" PRIu64, cm);
+      }
+      // VM/MMU (per-core; hardware sums icache + dcache MMU counters).
+      if (vm_enabled) {
+        uint64_t reads = 0, hits = 0, evicts = 0, walks = 0, lat = 0;
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_TLB_READS,   core_id, &reads),  { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_TLB_HITS,    core_id, &hits),   { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_TLB_EVICTS,  core_id, &evicts), { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_PTW_WALKS,   core_id, &walks),  { return err; });
+        CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_PTW_LATENCY, core_id, &lat),    { return err; });
+        perf_print_core(stream, core_id, "vm: tlb_reads=%" PRIu64 ", hit=%d%%, evicts=%" PRIu64 ", ptw_walks=%" PRIu64 ", ptw_avg_lat=%.2f",
+                        reads, calc_percent(hits, reads), evicts, walks, safe_div((double)lat, (double)walks));
+      }
+    }
+    // Global off-chip memory.
     {
       uint64_t r = 0, w = 0, lat = 0, bst = 0;
       CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_READS, 0, &r), { return err; });
       CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_WRITES, 0, &w), { return err; });
       CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_LT, 0, &lat), { return err; });
       CHECK_ERR(vx_mpm_query(hdevice, mpm_class, VX_CSR_MPM_MEM_BANK_ST, 0, &bst), { return err; });
-
       double avg_lat = safe_div((double)lat, (double)r);
       perf_print(stream, "memory: reqs=%" PRIu64 " (r=%" PRIu64 ", w=%" PRIu64 "), lat=%.1f cyc, bank_st=%" PRIu64 " (utility=%d%%)",
                  r + w, r, w, avg_lat, bst, calc_utility(r + w, bst));
diff --git a/tests/hip/Makefile b/tests/hip/Makefile
index 725b122ece..a6ee89b05e 100644
--- a/tests/hip/Makefile
+++ b/tests/hip/Makefile
@@ -2,10 +2,13 @@ ROOT_DIR := $(realpath ../..)
 include $(ROOT_DIR)/config.mk
 
 # --- master list ------------------------------------------------------
-TESTS := vecadd sgemm
+TESTS := vecadd sgemm histogram atomicreduce
 
 # --- common exclude list ---------------------------------------------
-EXCLUDE :=
+# histogram/atomicreduce use atomicAdd (RVA amo*.w); run them only with the
+# A extension (CONFIGS="-DVX_CFG_EXT_A_ENABLE"), so exclude them from the
+# default sweep which builds for the no-atomics config.
+EXCLUDE := histogram atomicreduce
 
 # --- per-backend exclude lists ---------------------------------------
 EXCLUDE_simx   :=
diff --git a/tests/hip/atomicreduce/Makefile b/tests/hip/atomicreduce/Makefile
new file mode 100644
index 0000000000..af9bb8686f
--- /dev/null
+++ b/tests/hip/atomicreduce/Makefile
@@ -0,0 +1,15 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := atomicreduce
+
+SRC_DIR := $(VORTEX_HOME)/tests/hip/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+OPTS ?= -n1024
+
+include ../common.mk
diff --git a/tests/hip/atomicreduce/common.h b/tests/hip/atomicreduce/common.h
new file mode 100644
index 0000000000..2a816d643b
--- /dev/null
+++ b/tests/hip/atomicreduce/common.h
@@ -0,0 +1,4 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#endif // COMMON_H
diff --git a/tests/hip/atomicreduce/main.cpp b/tests/hip/atomicreduce/main.cpp
new file mode 100644
index 0000000000..9d863b4ecb
--- /dev/null
+++ b/tests/hip/atomicreduce/main.cpp
@@ -0,0 +1,112 @@
+#include <hip/hip_runtime.h>
+
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <unistd.h>
+#include <vector>
+
+#include "common.h"
+
+#define KERNEL_NAME "atomicreduce"
+
+#define HIP_CHECK(_expr)                                                  \
+  do {                                                                    \
+    hipError_t _err = (_expr);                                            \
+    if (_err != hipSuccess) {                                             \
+      fprintf(stderr, "HIP Error: '%s' returned %d (%s)\n",               \
+              #_expr, (int)_err, hipGetErrorString(_err));                \
+      exit(-1);                                                           \
+    }                                                                     \
+  } while (0)
+
+// Every thread accumulates its element into a single global counter.
+// Maximum-contention atomicAdd, lowering to a hardware RVA amoadd.w on
+// Vortex (requires the A extension).
+__global__ void atomicreduce(const int* data, int* result, int N) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid < N) {
+    atomicAdd(&result[0], data[gid]);
+  }
+}
+
+static uint32_t size = 1024;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char** argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h")) != -1) {
+    switch (c) {
+    case 'n': size = atoi(optarg); break;
+    case 'h': show_usage(); exit(0);
+    default:  show_usage(); exit(-1);
+    }
+  }
+  printf("Workload size=%u\n", size);
+}
+
+int main(int argc, char** argv) {
+  parse_args(argc, argv);
+
+  std::vector<int> h_data(size);
+  int h_ref = 0;
+
+  srand(50);
+  for (uint32_t i = 0; i < size; ++i) {
+    h_data[i] = rand() % 1000;
+    h_ref += h_data[i];
+  }
+  int h_result = 0;
+
+  printf("Allocate device buffers\n");
+  int *d_data = nullptr, *d_result = nullptr;
+  HIP_CHECK(hipMalloc((void**)&d_data, size * sizeof(int)));
+  HIP_CHECK(hipMalloc((void**)&d_result, sizeof(int)));
+
+  printf("Upload source buffers\n");
+  HIP_CHECK(hipMemcpy(d_data, h_data.data(), size * sizeof(int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(d_result, &h_result, sizeof(int), hipMemcpyHostToDevice));
+
+  printf("Execute the kernel '%s'\n", KERNEL_NAME);
+  int dev_id = 0;
+  HIP_CHECK(hipGetDevice(&dev_id));
+  hipDeviceProp_t dev_props{};
+  HIP_CHECK(hipGetDeviceProperties(&dev_props, dev_id));
+  uint32_t block_size = 64;
+  if ((int)block_size > dev_props.maxThreadsPerBlock)
+    block_size = (uint32_t)dev_props.maxThreadsPerBlock;
+  const uint32_t grid_size = (size + block_size - 1) / block_size;
+  printf("block_size=%u (device max=%d)\n", block_size, dev_props.maxThreadsPerBlock);
+
+  auto t0 = std::chrono::high_resolution_clock::now();
+  atomicreduce<<<dim3(grid_size), dim3(block_size), 0, 0>>>(d_data, d_result, (int)size);
+  HIP_CHECK(hipDeviceSynchronize());
+  auto t1 = std::chrono::high_resolution_clock::now();
+  printf("Elapsed time: %lld ms\n",
+         (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count());
+
+  printf("Download destination buffer\n");
+  HIP_CHECK(hipMemcpy(&h_result, d_result, sizeof(int), hipMemcpyDeviceToHost));
+
+  printf("Verify result\n");
+  int errors = 0;
+  if (h_result != h_ref) {
+    printf("*** error: expected=%d, actual=%d\n", h_ref, h_result);
+    ++errors;
+  }
+
+  HIP_CHECK(hipFree(d_data));
+  HIP_CHECK(hipFree(d_result));
+
+  if (errors == 0) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+  return errors;
+}
diff --git a/tests/hip/histogram/Makefile b/tests/hip/histogram/Makefile
new file mode 100644
index 0000000000..4c6eb2c5dc
--- /dev/null
+++ b/tests/hip/histogram/Makefile
@@ -0,0 +1,15 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := histogram
+
+SRC_DIR := $(VORTEX_HOME)/tests/hip/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+OPTS ?= -n1024
+
+include ../common.mk
diff --git a/tests/hip/histogram/common.h b/tests/hip/histogram/common.h
new file mode 100644
index 0000000000..af56d37b33
--- /dev/null
+++ b/tests/hip/histogram/common.h
@@ -0,0 +1,8 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#ifndef NUM_BINS
+#define NUM_BINS 16
+#endif
+
+#endif // COMMON_H
diff --git a/tests/hip/histogram/main.cpp b/tests/hip/histogram/main.cpp
new file mode 100644
index 0000000000..3155a05760
--- /dev/null
+++ b/tests/hip/histogram/main.cpp
@@ -0,0 +1,115 @@
+#include <hip/hip_runtime.h>
+
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <unistd.h>
+#include <vector>
+
+#include "common.h"
+
+#define KERNEL_NAME "histogram"
+
+#define HIP_CHECK(_expr)                                                  \
+  do {                                                                    \
+    hipError_t _err = (_expr);                                            \
+    if (_err != hipSuccess) {                                             \
+      fprintf(stderr, "HIP Error: '%s' returned %d (%s)\n",               \
+              #_expr, (int)_err, hipGetErrorString(_err));                \
+      exit(-1);                                                           \
+    }                                                                     \
+  } while (0)
+
+// Each thread atomically increments the bin its element maps to.
+// atomicAdd on an int* lowers to a hardware RVA amoadd.w on Vortex
+// (requires the A extension).
+__global__ void histogram(const int* data, int* bins, int N) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid < N) {
+    atomicAdd(&bins[data[gid] % NUM_BINS], 1);
+  }
+}
+
+static uint32_t size = 1024;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char** argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h")) != -1) {
+    switch (c) {
+    case 'n': size = atoi(optarg); break;
+    case 'h': show_usage(); exit(0);
+    default:  show_usage(); exit(-1);
+    }
+  }
+  printf("Workload size=%u, bins=%d\n", size, NUM_BINS);
+}
+
+int main(int argc, char** argv) {
+  parse_args(argc, argv);
+
+  std::vector<int> h_data(size);
+  std::vector<int> h_bins(NUM_BINS, 0);
+  std::vector<int> h_ref(NUM_BINS, 0);
+
+  srand(50);
+  for (uint32_t i = 0; i < size; ++i) {
+    h_data[i] = rand() % 1000;
+    h_ref[h_data[i] % NUM_BINS] += 1;
+  }
+
+  printf("Allocate device buffers\n");
+  int *d_data = nullptr, *d_bins = nullptr;
+  HIP_CHECK(hipMalloc((void**)&d_data, size * sizeof(int)));
+  HIP_CHECK(hipMalloc((void**)&d_bins, NUM_BINS * sizeof(int)));
+
+  printf("Upload source buffers\n");
+  HIP_CHECK(hipMemcpy(d_data, h_data.data(), size * sizeof(int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemset(d_bins, 0, NUM_BINS * sizeof(int)));
+
+  printf("Execute the kernel '%s'\n", KERNEL_NAME);
+  int dev_id = 0;
+  HIP_CHECK(hipGetDevice(&dev_id));
+  hipDeviceProp_t dev_props{};
+  HIP_CHECK(hipGetDeviceProperties(&dev_props, dev_id));
+  uint32_t block_size = 64;
+  if ((int)block_size > dev_props.maxThreadsPerBlock)
+    block_size = (uint32_t)dev_props.maxThreadsPerBlock;
+  const uint32_t grid_size = (size + block_size - 1) / block_size;
+  printf("block_size=%u (device max=%d)\n", block_size, dev_props.maxThreadsPerBlock);
+
+  auto t0 = std::chrono::high_resolution_clock::now();
+  histogram<<<dim3(grid_size), dim3(block_size), 0, 0>>>(d_data, d_bins, (int)size);
+  HIP_CHECK(hipDeviceSynchronize());
+  auto t1 = std::chrono::high_resolution_clock::now();
+  printf("Elapsed time: %lld ms\n",
+         (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count());
+
+  printf("Download destination buffer\n");
+  HIP_CHECK(hipMemcpy(h_bins.data(), d_bins, NUM_BINS * sizeof(int), hipMemcpyDeviceToHost));
+
+  printf("Verify result\n");
+  int errors = 0;
+  for (int i = 0; i < NUM_BINS; ++i) {
+    if (h_bins[i] != h_ref[i]) {
+      if (errors < 100)
+        printf("*** error: bin[%d] expected=%d, actual=%d\n", i, h_ref[i], h_bins[i]);
+      ++errors;
+    }
+  }
+
+  HIP_CHECK(hipFree(d_data));
+  HIP_CHECK(hipFree(d_bins));
+
+  if (errors == 0) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+  return errors;
+}
diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index 2cc2ff1009..0c849432b1 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -5,15 +5,20 @@ include $(ROOT_DIR)/config.mk
 TESTS := \
     vecadd sgemm conv3 psort saxpy sfilter sgemm2 sgemm3 psum oclprintf \
     dotproduct transpose spmv stencil lbm nearn guassian kmeans         \
-    blackscholes bfs copybuf
+    blackscholes bfs copybuf histogram atomicreduce
 
 # --- common exclude list ---------------------------------------------
+# histogram/atomicreduce use atomic_add (RVA amo*.w); run them only with the
+# A extension (CONFIGS="-DVX_CFG_EXT_A_ENABLE"), so exclude them from the
+# default sweep which builds for the no-atomics config.
 EXCLUDE := \
     transpose \
     kmeans    \
     bfs       \
     lbm       \
-    copybuf
+    copybuf   \
+    histogram \
+    atomicreduce
 
 # --- per-backend exclude lists ---------------------------------------
 EXCLUDE_simx   :=
diff --git a/tests/opencl/atomicreduce/Makefile b/tests/opencl/atomicreduce/Makefile
new file mode 100644
index 0000000000..196382be22
--- /dev/null
+++ b/tests/opencl/atomicreduce/Makefile
@@ -0,0 +1,20 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := atomicreduce
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+kernel.cl: $(SRC_DIR)/kernel.cl
+	cp $< $@
+
+KERNEL_SRCS := kernel.cl common.h
+
+OPTS ?= -n1024
+
+include ../common.mk
diff --git a/tests/opencl/atomicreduce/common.h b/tests/opencl/atomicreduce/common.h
new file mode 100644
index 0000000000..2a816d643b
--- /dev/null
+++ b/tests/opencl/atomicreduce/common.h
@@ -0,0 +1,4 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#endif // COMMON_H
diff --git a/tests/opencl/atomicreduce/kernel.cl b/tests/opencl/atomicreduce/kernel.cl
new file mode 100644
index 0000000000..fd2f03e868
--- /dev/null
+++ b/tests/opencl/atomicreduce/kernel.cl
@@ -0,0 +1,10 @@
+#include "common.h"
+
+// Atomic reduction: every work-item accumulates its element into a single
+// global counter. Maximum-contention atomic_add, lowering to a hardware RVA
+// amoadd.w on Vortex (requires the A extension).
+__kernel void atomicreduce(__global const int* data,
+                           __global int* result) {
+  int gid = get_global_id(0);
+  atomic_add(&result[0], data[gid]);
+}
diff --git a/tests/opencl/atomicreduce/main.cc b/tests/opencl/atomicreduce/main.cc
new file mode 100644
index 0000000000..7ff0f74fab
--- /dev/null
+++ b/tests/opencl/atomicreduce/main.cc
@@ -0,0 +1,174 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <CL/opencl.h>
+#include <unistd.h>
+#include <string.h>
+#include <chrono>
+#include <vector>
+#include "common.h"
+
+#define KERNEL_NAME "atomicreduce"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+     cleanup();                                                        \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+       cleanup();                                                      \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+
+  fclose(fp);
+
+  return 0;
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem data_memobj = NULL;
+cl_mem result_memobj = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (data_memobj) clReleaseMemObject(data_memobj);
+  if (result_memobj) clReleaseMemObject(result_memobj);
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);
+  if (kernel_bin) free(kernel_bin);
+}
+
+uint32_t size = 1024;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+      show_usage();
+      exit(0);
+      break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+  printf("Workload size=%d\n", size);
+}
+
+int main (int argc, char **argv) {
+  parse_args(argc, argv);
+
+  cl_platform_id platform_id;
+  size_t kernel_size;
+
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
+
+  printf("Allocate device buffers\n");
+  size_t data_bytes = size * sizeof(int);
+  data_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, data_bytes, NULL, &_err));
+  result_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
+
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&result_memobj));
+
+  // Generate input values and the reference sum.
+  std::vector<int> h_data(size);
+  int h_ref = 0;
+  for (uint32_t i = 0; i < size; ++i) {
+    h_data[i] = rand() % 1000;
+    h_ref += h_data[i];
+  }
+  int h_result = 0;
+
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, data_memobj, CL_TRUE, 0, data_bytes, h_data.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, result_memobj, CL_TRUE, 0, sizeof(int), &h_result, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, result_memobj, CL_TRUE, 0, sizeof(int), &h_result, 0, NULL, NULL));
+
+  printf("Verify result\n");
+  int errors = 0;
+  if (h_result != h_ref) {
+    printf("*** error: expected=%d, actual=%d\n", h_ref, h_result);
+    ++errors;
+  }
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+
+  cleanup();
+
+  return errors;
+}
diff --git a/tests/opencl/histogram/Makefile b/tests/opencl/histogram/Makefile
new file mode 100644
index 0000000000..65b58c1f18
--- /dev/null
+++ b/tests/opencl/histogram/Makefile
@@ -0,0 +1,20 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := histogram
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+common.h: $(SRC_DIR)/common.h
+	cp $< $@
+
+kernel.cl: $(SRC_DIR)/kernel.cl
+	cp $< $@
+
+KERNEL_SRCS := kernel.cl common.h
+
+OPTS ?= -n1024
+
+include ../common.mk
diff --git a/tests/opencl/histogram/common.h b/tests/opencl/histogram/common.h
new file mode 100644
index 0000000000..af56d37b33
--- /dev/null
+++ b/tests/opencl/histogram/common.h
@@ -0,0 +1,8 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#ifndef NUM_BINS
+#define NUM_BINS 16
+#endif
+
+#endif // COMMON_H
diff --git a/tests/opencl/histogram/kernel.cl b/tests/opencl/histogram/kernel.cl
new file mode 100644
index 0000000000..9b1421d7df
--- /dev/null
+++ b/tests/opencl/histogram/kernel.cl
@@ -0,0 +1,11 @@
+#include "common.h"
+
+// Histogram: each work-item atomically increments the bin its element maps
+// to. atomic_add on a __global int lowers to a hardware RVA amoadd.w on
+// Vortex (requires the A extension).
+__kernel void histogram(__global const int* data,
+                        __global int* bins) {
+  int gid = get_global_id(0);
+  int bin = data[gid] % NUM_BINS;
+  atomic_add(&bins[bin], 1);
+}
diff --git a/tests/opencl/histogram/main.cc b/tests/opencl/histogram/main.cc
new file mode 100644
index 0000000000..d6b1d3b879
--- /dev/null
+++ b/tests/opencl/histogram/main.cc
@@ -0,0 +1,178 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <CL/opencl.h>
+#include <unistd.h>
+#include <string.h>
+#include <chrono>
+#include <vector>
+#include "common.h"
+
+#define KERNEL_NAME "histogram"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+     cleanup();                                                        \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+       cleanup();                                                      \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+
+  fclose(fp);
+
+  return 0;
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem data_memobj = NULL;
+cl_mem bins_memobj = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (data_memobj) clReleaseMemObject(data_memobj);
+  if (bins_memobj) clReleaseMemObject(bins_memobj);
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);
+  if (kernel_bin) free(kernel_bin);
+}
+
+uint32_t size = 1024;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+      show_usage();
+      exit(0);
+      break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+  printf("Workload size=%d, bins=%d\n", size, NUM_BINS);
+}
+
+int main (int argc, char **argv) {
+  parse_args(argc, argv);
+
+  cl_platform_id platform_id;
+  size_t kernel_size;
+
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
+
+  printf("Allocate device buffers\n");
+  size_t data_bytes = size * sizeof(int);
+  size_t bins_bytes = NUM_BINS * sizeof(int);
+  data_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, data_bytes, NULL, &_err));
+  bins_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_WRITE, bins_bytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
+
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bins_memobj));
+
+  // Generate input values and the reference histogram.
+  std::vector<int> h_data(size);
+  std::vector<int> h_bins(NUM_BINS, 0);
+  std::vector<int> h_ref(NUM_BINS, 0);
+  for (uint32_t i = 0; i < size; ++i) {
+    h_data[i] = rand() % 1000;
+    h_ref[h_data[i] % NUM_BINS] += 1;
+  }
+
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, data_memobj, CL_TRUE, 0, data_bytes, h_data.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, bins_memobj, CL_TRUE, 0, bins_bytes, h_bins.data(), 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, bins_memobj, CL_TRUE, 0, bins_bytes, h_bins.data(), 0, NULL, NULL));
+
+  printf("Verify result\n");
+  int errors = 0;
+  for (int i = 0; i < NUM_BINS; ++i) {
+    if (h_bins[i] != h_ref[i]) {
+      if (errors < 100)
+        printf("*** error: bin[%d] expected=%d, actual=%d\n", i, h_ref[i], h_bins[i]);
+      ++errors;
+    }
+  }
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+
+  cleanup();
+
+  return errors;
+}
diff --git a/tests/regression/amo/kernel.cpp b/tests/regression/amo/kernel.cpp
index 79cb750df6..a513261a66 100644
--- a/tests/regression/amo/kernel.cpp
+++ b/tests/regression/amo/kernel.cpp
@@ -9,11 +9,10 @@
 
 typedef void (*PFN_Kernel)(kernel_arg_t* __UNIFORM__ arg);
 
-// Globally-unique hart id matching the simulator's make_hart_id():
-// (cid * VX_CFG_NUM_WARPS + wid) * VX_CFG_NUM_THREADS + tid.
+// Globally-unique hart id (mhartid): (cid * NUM_WARPS + wid) * NUM_THREADS
+// + tid, matching the LLC reservation table's make_hart_id.
 static inline uint32_t hart_id() {
-  return (vx_core_id() * vx_num_warps() + vx_warp_id()) * vx_num_threads()
-       + vx_thread_id();
+  return (uint32_t)vx_hart_id();
 }
 
 // 1) AMOADD hammer.