Skip to content

[CI] Optimize unit tests including adding stochasticity#709

Draft
hughperkins wants to merge 31 commits into `main` from `hp/mark-slow-tests`
Draft

[CI] Optimize unit tests including adding stochasticity#709
hughperkins wants to merge 31 commits into `main` from `hp/mark-slow-tests`

[BugFix] pytest.ini: keep slow + sample marker descriptions on one line

d0ef1f0
Select commit
Loading
Failed to load commit list.
Sign in for the full log view
GitHub Actions / Coverage Report succeeded May 27, 2026 in 0s

Diff Coverage Report

See details below for per-line coverage annotations.

Details

Coverage Report (d0ef1f0df)

Metric Value
Diff coverage (changed lines only) 95%
Overall project coverage 73%

Total: 228 lines, 12 missing, 95% covered

🟢 tests/python/conftest.py (87%)
🟢    2  import hashlib
🟢    4  import random
     20  # ---------------------------------------------------------------------------
     21  # @pytest.mark.sample(...)  --  per-test stochastic parametrize subsampling
     22  # ---------------------------------------------------------------------------
     23  #
     24  # Some tests parametrize so widely (test_tile16_load_store, test_tile16_cholesky, ...) that running every case on every
     25  # CI run is wasteful: the parametrize axes are intentionally varied to cover corner cases, but most runs would get the
     26  # same signal from a small random subset. ``@pytest.mark.sample(n=...)`` or ``@pytest.mark.sample(fraction=...)`` opts a
     27  # *single* test into per-run random sub-selection. Over many runs, each parametrize case asymptotically gets covered
     28  # (Pr[hit after k runs] = 1 - (1 - keep/total)^k).
     29  #
     30  # Reproducibility hooks:
     31  #   - whole-suite: ``--sample-seed=<S>`` reproduces the exact same trimmed set (header prints the seed used).
     32  #   - single failing case: paste the failing nodeid into ``pytest <nodeid>`` -- the sampler's ``len(group) <= 1``
     33  #     short-circuit keeps it; no flags needed.
     34  #   - exhaustive run (release gate / coverage audit): ``--no-sample`` skips the sampler entirely.
     35  #
     36  # Per-test RNG keyed on ``(seed, nodeid_prefix)``: adding / renaming a @sample-marked test does NOT shift any other
     37  # test's sample. Routine refactors don't migrate failures.
     38  
     39  
def pytest_addoption(parser):
    """Register the two command-line switches that steer ``@pytest.mark.sample`` subsampling.

    ``--sample-seed`` (int, default ``None``) pins the sampling seed; ``--no-sample`` (flag, default ``False``)
    turns subsampling off altogether.
    """
    # Declared as data so each flag's keyword arguments sit next to its name; registration order is preserved.
    option_table = (
        (
            "--sample-seed",
            {
                "type": int,
                "default": None,
                "help": "Seed for @pytest.mark.sample subsampling. If absent, a fresh seed is picked and printed "
                "in the report header so a failing run can be reproduced via --sample-seed=<S>.",
            },
        ),
        (
            "--no-sample",
            {
                "action": "store_true",
                "default": False,
                "help": "Disable @pytest.mark.sample subsampling -- run every parametrize case of every marked test. "
                "Use for exhaustive CI release gates / coverage-debt audits.",
            },
        ),
    )
    for flag, settings in option_table:
        parser.addoption(flag, **settings)
     55  
     56  
@pytest.hookimpl(tryfirst=True)
def pytest_configure(config):
    """Register the ``sample`` marker and, when nobody supplied one, pick a sampling seed for this process.

    A seed is generated only if subsampling is enabled (no ``--no-sample``), no ``--sample-seed`` was given, and
    ``config`` has no ``workerinput`` attribute (i.e. this is a single-process run or the non-worker side of xdist).
    """
    # Registered in code (not only in pytest.ini) so that `--strict-markers` users who load this conftest in
    # isolation still see the marker as known.
    marker_doc = (
        "sample(fraction=None, n=None): per-test stochastic parametrize subsampling. Pass exactly one of "
        "`fraction` (0..1) or `n` (>= 1). Seed printed in report header; rerun the same sample with "
        "--sample-seed=<S>; rerun every case with --no-sample; rerun a single failing case by pasting its nodeid."
    )
    config.addinivalue_line("markers", marker_doc)

    # Seed propagation contract (as stated by the authors of this hook): the controller and every xdist worker must
    # agree on the seed, otherwise xdist's collection-consistency check aborts with "Different tests were collected
    # between gw0 and gwN". The seed is therefore expected to travel on argv as ``--sample-seed=N`` -- either injected
    # by ``tests/run_tests.py`` or passed explicitly. ``os.environ`` is deliberately left untouched here because
    # runtime env mutations are not guaranteed to reach xdist worker subprocesses.
    if config.getoption("--no-sample"):
        return
    if config.getoption("--sample-seed") is not None:
        return
    if hasattr(config, "workerinput"):
        # xdist worker: never invent a seed locally.
        return
    config.option.sample_seed = random.randrange(0, 2**31)
     80  
     81  
def pytest_report_header(config):
    """Build the sampling line shown in pytest's report header.

    Returns a fixed notice when ``--no-sample`` is active, ``None`` when no seed is available, and otherwise a line
    that echoes the seed together with the three ways to rerun (same sample, single failure, every case).
    """
    sampling_disabled = config.getoption("--no-sample")
    if sampling_disabled:
        return "sample: --no-sample (every @sample-marked test runs every parametrize case)"

    seed = config.getoption("--sample-seed")
    if seed is not None:
        rerun_hints = "; ".join(
            (
                f"reproduce the same sample: --sample-seed={seed}",
                "reproduce a single failure: paste its nodeid",
                "run every case: --no-sample",
            )
        )
        return f"sample-seed={seed}  ({rerun_hints})"
    return None
     92  
     93  
🟢   94  def _sample_keep_count(mark, group_size, group_key):
     95      """Resolve ``@pytest.mark.sample(fraction=..., n=...)`` for a group of ``group_size`` parametrize cases.
     96  
     97      Exactly one of ``fraction`` (0..1) or ``n`` (int >= 1) must be passed; ``UsageError`` otherwise. The result is
     98      clamped to ``[1, group_size]`` so every @sample-marked test runs at least one case per run (no silent zero-case
     99      runs even if e.g. ``fraction * group_size`` rounds to zero on a 1-case group).
    100      """
🟢  101      fraction = mark.kwargs.get("fraction")
🟢  102      n = mark.kwargs.get("n")
🟢  103      if (fraction is None) == (n is None):
🔴  104          raise pytest.UsageError(
    105              f"@pytest.mark.sample on {group_key!r}: pass exactly one of `fraction` or `n`, got "
    106              f"fraction={fraction!r}, n={n!r}"
    107          )
🟢  108      if fraction is not None:
🔴  109          return max(1, int(round(group_size * float(fraction))))
🟢  110      return max(1, min(int(n), group_size))
    111  
    112  
def pytest_collection_modifyitems(config, items):
    """Deselect a seeded random subset of the parametrize cases of every ``@pytest.mark.sample`` test.

    ``items`` is filtered in place and keeps pytest's original collection order; only the sampled-out cases are
    removed. Deselected items are reported through ``config.hook.pytest_deselected`` so they show up in pytest's
    summary instead of vanishing silently. Nothing happens when ``--no-sample`` is set or no seed is available.
    """
    if config.getoption("--no-sample"):
        return
    seed = config.getoption("--sample-seed")
    if seed is None:
        # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
        return

    # Group items by test function (strip the parametrize bracket suffix). Sampling per function is what guarantees
    # every marked test keeps at least one case -- uniform sampling across all items could drop a small test entirely.
    groups: dict[str, list] = {}
    for item in items:
        groups.setdefault(item.nodeid.split("[", 1)[0], []).append(item)

    # id()s of the items to drop. Group iteration order is irrelevant: every group gets its own RNG (below), and the
    # final order is taken from ``items`` itself rather than from the order in which groups are visited.
    dropped = set()
    for key, group in groups.items():
        mark = group[0].get_closest_marker("sample")
        if mark is None or len(group) <= 1:
            # No sample mark -> every case runs. A single-item group means the test has one case, or pytest narrowed
            # collection to one nodeid; either way it runs as-is. This is what makes "paste the failing nodeid" work
            # without --no-sample.
            continue
        # Defensive upper clamp: ``random.Random.sample`` raises ValueError when asked for more than the population.
        keep_n = min(_sample_keep_count(mark, len(group), key), len(group))
        # Per-test RNG keyed on (seed, key): changing the mark on one test does not shift any other test's sample.
        # The key is mixed through an explicit sha256 instead of ``random.Random((seed, key))`` because hash-based
        # seeding with arbitrary objects was deprecated in Python 3.9 and, since 3.11, ``seed()`` only accepts
        # None / int / float / str / bytes / bytearray (a tuple raises TypeError).
        mixed = int.from_bytes(hashlib.sha256(f"{seed}|{key}".encode()).digest()[:8], "big")
        rng = random.Random(mixed)
        # Sample from a nodeid-sorted copy so the pick depends only on (seed, set of nodeids), not on the in-memory
        # order of ``items``; xdist workers run this hook independently and must all reach the same subset.
        group_sorted = sorted(group, key=lambda it: it.nodeid)
        kept_nodeids = {it.nodeid for it in rng.sample(group_sorted, k=keep_n)}
        dropped.update(id(it) for it in group if it.nodeid not in kept_nodeids)

    if not dropped:
        # Nothing sampled out: leave ``items`` untouched, including its order.
        return

    deselected = [it for it in items if id(it) in dropped]
    # ``pytest_deselected`` is the supported way to report filtered-out items so pytest's summary counts them.
    config.hook.pytest_deselected(items=deselected)
    items[:] = [it for it in items if id(it) not in dropped]
    171  
    172  
🟢 tests/python/test_ad_gdar_diffmpm.py (100%)
      8  # Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay stays cheap; the slow-marked
      9  # entry keeps the original (N=30, n_grid=120, steps=32) workload that runs on --run-slow. The point of the test is
     10  # that the AD-validation checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which happens
     11  # on the first substep regardless of size.
🟢   12  @pytest.mark.parametrize(
     13      "particles_side,n_grid_size,num_steps",
     14      [
     15          (8, 32, 4),
     16          pytest.param(30, 120, 32, marks=pytest.mark.slow),
     17      ],
     18  )
🟢   20  def test_gdar_mpm(particles_side, n_grid_size, num_steps):
🟢   24      N = particles_side
🟢   26      n_grid = n_grid_size
🟢   35      max_steps = num_steps
🟢   36      steps = num_steps
🟢 tests/python/test_algorithms.py (100%)
🟢  323  _REDUCE_OPS = ["add", "min", "max"]
🟢  326  def _reduce_host(rng, op, dtype, N):
    327      """Generate the test input for a reduce of `op` on `dtype` x N values.
    328  
    329      ``add`` uses small uniform / bounded values so float sums stay representable; ``min`` and ``max`` use a wider
    330      range (-10..10 for floats, +-10000 for ints) since picking-an-element is bitwise-exact regardless of magnitude.
    331      """
🟢  332      if op == "add":
🟢  333          return _rand_reduce_host(rng, dtype, N)
🟢  335          return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
🟢  336      return _rand_reduce_host(rng, dtype, N, bound=10000)
🟢  339  def _check_reduce(op, dtype, N):
    340      """Run ``device_reduce_<op>(arr)`` and verify against ``numpy.<op>(arr)``.
    341  
    342      ``add`` accumulates so it needs (a) wider integer promotion + mod-wrap masking for u32/u64 and (b) per-N float
    343      tolerance. ``min`` / ``max`` pick one input element, so they're bitwise-exact for both ints and floats.
    344      """
🟢  348      host = _reduce_host(rng, op, dtype, N)
🟢  351      qd_fn = getattr(qd.algorithms, f"device_reduce_{op}")
🟢  352      qd_fn(inp, out=out)
🟢  355      if op == "add":
🟢  356          if _is_float(dtype):
🟢  357              expected = float(np.sum(host.astype(np.float64)))
🟢  358              rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
🟢  359              assert math.isclose(
    360                  got, expected, rel_tol=rtol, abs_tol=atol
    361              ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}"
    362          else:
    363              # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the
    364              # u32 / u64 mod-wrap case at large N.
🟢  365              mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None
🟢  366              ref = int(
    367                  np.sum(
    368                      host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64))
    369                  )
    370              )  # noqa: E501
🟢  371              got_int = int(got)
🟢  372              if mod is not None:
🟢  373                  ref &= mod - 1
🟢  374                  got_int &= mod - 1
🟢  375              assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}"
🟢  376          return
    377  
🟢  378      expected = host.min() if op == "min" else host.max()
🟢  382          assert int(got) == int(expected), f"{dtype} reduce_{op}(N={N}): got {got}, expected {expected}"
🟢  385  @pytest.mark.parametrize("op", _REDUCE_OPS)
🟢  389  def test_device_reduce(op, dtype, N):
    390      """``device_reduce_{add,min,max}`` match numpy across the full size sweep + dtype set.
    392      Unified across the three op variants. ``add`` accumulates so it needs overflow / precision-aware comparison;
    393      ``min`` / ``max`` pick one element of the input and are bitwise-exact.
    394      """
🟢  395      _check_reduce(op, dtype, N)
🟢  450  _SCAN_OPS = ["add", "min", "max"]
🟢  453  def _scan_host(rng, op, dtype, N):
    454      """Generate the test input for a scan of `op` on `dtype` x N values. Same rationale as ``_reduce_host``."""
🟢  455      if op == "add":
🟢  456          return _rand_reduce_host(rng, dtype, N, bound=100)
🟢  458          return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
🟢  459      return _rand_reduce_host(rng, dtype, N, bound=10000)
🟢  462  def _check_scan(op, dtype, N):
    463      """Run ``device_exclusive_scan_<op>(arr)`` and verify against ``numpy.<op>.accumulate``-shifted.
    464  
    465      Like the reduce family, ``add`` accumulates (overflow / precision care) while ``min`` / ``max`` are
    466      bitwise-exact in both float and int paths.
    467      """
🟢  472      host = _scan_host(rng, op, dtype, N)
🟢  475      qd_fn = getattr(qd.algorithms, f"device_exclusive_scan_{op}")
🟢  476      qd_fn(inp, out=out)
🟢  479      if op == "add":
🟢  480          if _is_float(dtype):
🟢  481              ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]])
🟢  482              rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
🟢  483              np.testing.assert_allclose(
    484                  got.astype(np.float64),
    485                  ref,
    486                  rtol=rtol,
    487                  atol=atol,
    488                  err_msg=f"{dtype} scan_add(N={N})",
    489              )
    490          else:
    491              # Promote to a width that survives the cumulative sum: u64 / i64 inputs use a Python int reference;
    492              # smaller ints can still use int64.
🟢  493              promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64
🟢  494              host_wide = host.astype(promote)
🟢  495              ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote)
🟢  496              mask = _scan_dtype_mask(dtype)
🟢  497              got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64)
🟢  498              if mask != -1:
🟢  499                  got_view = got_view & promote(mask)
🟢  500                  ref = ref & promote(mask)
🟢  501              np.testing.assert_array_equal(got_view, ref, err_msg=f"{dtype} scan_add(N={N})")
🟢  502          return
    503  
🟢  504      np_accum = np.minimum.accumulate if op == "min" else np.maximum.accumulate
🟢  505      identity_table = _MIN_IDENTITY if op == "min" else _MAX_IDENTITY
🟢  507          identity = float("inf") if op == "min" else float("-inf")
🟢  508          ref = np.concatenate([[identity], np_accum(host.astype(np.float64))[:-1]]).astype(np_dt)
🟢  509          np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_{op}(N={N})")
🟢  511          ref = np.concatenate([[np_dt(identity_table[dtype])], np_accum(host)[:-1]]).astype(np_dt)
🟢  512          np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_{op}(N={N})")
🟢  515  @pytest.mark.parametrize("op", _SCAN_OPS)
🟢  519  def test_device_exclusive_scan(op, dtype, N):
    520      """``device_exclusive_scan_{add,min,max}`` match ``numpy.{cumsum, minimum.accumulate, maximum.accumulate}``-shifted
    521      across the full size sweep + dtype set. Unified across the three op variants; same overflow vs bitwise-exact
    522      handling as the reduce family."""
🟢  523      _check_scan(op, dtype, N)
🟢 tests/python/test_eig.py (100%)
🟢  298  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  314  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  361  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  371  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  407  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  448  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  458  @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢  538  @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢  544  @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢 tests/python/test_linalg.py (100%)
🟢  157  @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢  163  @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢  192  @pytest.mark.parametrize(
    193      "rows,cols",
    194      [
    195          pytest.param(9, 12, marks=pytest.mark.slow),
    196          pytest.param(12, 3, marks=pytest.mark.slow),
    197          (2, 4),
    198      ],
    199  )
🟢  205  @pytest.mark.parametrize(
    206      "rows,cols",
    207      [
    208          pytest.param(9, 12, marks=pytest.mark.slow),
    209          pytest.param(12, 3, marks=pytest.mark.slow),
    210          (2, 4),
    211      ],
    212  )
🟢  218  def _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, dt):
    219      """3-way matmul chain: ``(rows_a × cols_a) · (cols_a × cols_b) · (cols_b × cols_c) → (rows_a × cols_c)``.
    221      Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the requested size. Quadrants
    222      imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces
    223      ``rows_a * cols_a * cols_b + rows_a * cols_b * cols_c`` FMAs per kernel call, so this test catches compile-time
    224      blow-up or back-end miscompiles at large sizes. The largest parametrize value is the chain qipc actually uses;
    225      smaller values are cheap sanity checks that the same code path still works.
🟢  228      A_np = np.random.default_rng(0xCA70).standard_normal((rows_a, cols_a)).astype(np_dt)
🟢  229      B_np = np.random.default_rng(0xCA71).standard_normal((cols_a, cols_b)).astype(np_dt)
🟢  230      C_np = np.random.default_rng(0xCA72).standard_normal((cols_b, cols_c)).astype(np_dt)
🟢  232      A = qd.Matrix.field(rows_a, cols_a, dtype=dt, shape=())
🟢  233      B = qd.Matrix.field(cols_a, cols_b, dtype=dt, shape=())
🟢  234      C = qd.Matrix.field(cols_b, cols_c, dtype=dt, shape=())
🟢  235      AB = qd.Matrix.field(rows_a, cols_b, dtype=dt, shape=())
🟢  236      ABC_chained = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
🟢  237      ABC_staged = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
    260  # qipc's actual size is (9,12,12,9) -- the largest chain it instantiates. We also keep a tiny (3,4,4,3) chain so
    261  # the default fast lane still exercises the same Matrix.__matmul__ codegen path without paying the ~90s/case
    262  # CUDA JIT cost of the qipc-sized chain.
🟢  263  _MATMUL_CHAIN_SHAPES = [
    264      (3, 4, 4, 3),
    265      pytest.param(9, 12, 12, 9, marks=pytest.mark.slow),
    266  ]
    267  
    268  
🟢  269  @pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
🟢  271  def test_matmul_chain_qipc_sizes_f32(rows_a, cols_a, cols_b, cols_c):
🟢  272      _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f32)
🟢  275  @pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
🟢  277  def test_matmul_chain_qipc_sizes_f64(rows_a, cols_a, cols_b, cols_c):
🟢  278      _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f64)
🟢  464  @pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
🟢  474  @pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
🟢 tests/python/test_simt.py (93%)
    890  # The three single-output reduces (`test_block_reduce_{add,min,max}`) and their three broadcast siblings
    891  # (`test_block_reduce_all_{add,min,max}`) share the same kernel skeleton, parametrize axes, and verification loop;
    892  # they differ only in (a) which `block.reduce_*` function gets called, (b) the host-side reference oracle, (c) the
    893  # init pattern (sequential for `add` so the running sum has signal, permuted hash for `min` / `max` so the result
    894  # depends on lanes other than first / last), and (d) the float tolerance regime (`add` accumulates so it uses a
    895  # relative tol; `min` / `max` pick one element of the input and use an absolute tol).
🟢  896  _BLOCK_REDUCE_OP_CASES = [
    897      # (op_name, ref_fn, init_permuted, tol_relative)
    898      pytest.param("add", _ref_reduce_add, False, True, id="add"),
    899      pytest.param("min", _ref_reduce_min, True, False, id="min"),
    900      pytest.param("max", _ref_reduce_max, True, False, id="max"),
    901  ]
🟢  904  def _init_block_reduce_src(src, N, dtype, *, permuted):
    905      """Initialize ``src[0:N]`` for a block reduce test. ``permuted=False`` is the sequential ``1..N`` init from
    906      ``_init_field`` (good for add); ``permuted=True`` is the stable hash ``((i * 1009) % 997) + 1`` so the per-block
    907      min / max depends on lanes other than first / last."""
🟢  908      if permuted:
🟢  910              v = ((i * 1009) % 997) + 1
🟢  911              src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
    912      else:
🟢  913          _init_field(src, N, dtype)
def _assert_block_reduce_close(actual, expected, dtype, *, tol_relative, ctx):
    """Assert ``actual ~= expected`` per the block-reduce tolerance regime.

    Int dtypes (members of ``_BLOCK_REDUCE_INT_DTYPES``) compare exactly. Floats use a relative tolerance
    ``1e-4 * |expected|`` when ``tol_relative`` is true (accumulating ops: sums grow with the block size, so a
    relative bound is what stays meaningful across block sizes), and an absolute tolerance ``1e-5`` otherwise (picker
    ops: min / max return one input element, so a small absolute bound suffices).

    ``ctx`` is prefixed to the assertion message to identify the failing block / thread.
    """
    if dtype in _BLOCK_REDUCE_INT_DTYPES:
        assert actual == expected, f"{ctx}: got {actual}, expected {expected}"
    elif tol_relative:
        # Inclusive bound: with ``expected == 0`` the relative tolerance collapses to zero, and a strict ``<`` would
        # reject even an exact match.
        assert abs(actual - expected) <= 1e-4 * abs(expected), f"{ctx}: got {actual}, expected {expected}"
    else:
        assert abs(actual - expected) < 1e-5, f"{ctx}: got {actual}, expected {expected}"
    931  
🟢  932  @pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
🟢  936  def test_block_reduce(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
    937      """Block reduce: thread 0 of each block holds ``<op>(src[block_base:block_base+block_dim])``. Unified across
    938      ``add`` / ``min`` / ``max`` -- op-name is closure-captured into ``@qd.kernel``."""
🟢  940      op_fn = getattr(block, f"reduce_{op_name}")
🔴  952              agg = op_fn(src[i], block_dim, dtype)
🟢  956      _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
🟢  961          expected = ref_fn(block_vals)
🟢  962          _assert_block_reduce_close(dst[b], expected, dtype, tol_relative=tol_relative, ctx=f"block {b}")
🟢  965  @pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
🟢  969  def test_block_reduce_all(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
    970      """Block reduce broadcast: every thread of each block holds the block-wide ``<op>``. Verified by writing the
    971      per-thread output to a flat field, then asserting every thread of a given block reads the same aggregate.
    972      Unified across ``add`` / ``min`` / ``max``."""
🟢  974      op_fn = getattr(block, f"reduce_all_{op_name}")
🔴  985              dst[i] = op_fn(src[i], block_dim, dtype)
🟢  987      _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
🟢  992          expected = ref_fn(block_vals)
🟢  995              _assert_block_reduce_close(actual, expected, dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
   1048  # The four scan tests in this group (`test_block_inclusive_{add,min,max}` + `test_block_exclusive_add`) share the
   1049  # kernel skeleton; only the per-op reference oracle, init pattern, and float tolerance differ. `add` accumulates
   1050  # (sequential init, relative tol); `min` / `max` pick (permuted init, absolute tol). Exclusive `min` / `max` get
   1051  # their own dedicated test below because they need a dtype-derived sentinel identity (+inf / iinfo(max), -inf /
   1052  # iinfo(min)) at lane 0 with explicit ``isinf`` handling -- different enough that fusing them in would create more
   1053  # branches than it removes.
🟢 1054  _PY_MIN = lambda a, b: a if a < b else b  # noqa: E731 (intentional 1-line lambda for ref oracle)
🟢 1055  _PY_MAX = lambda a, b: a if a > b else b  # noqa: E731
   1056  
🟢 1057  _BLOCK_INCLUSIVE_SCAN_OP_CASES = [
   1058      # (op_name, ref_fn, init_permuted, tol_relative)
   1059      pytest.param("add", _ref_inclusive_scan_add, False, True, id="add"),
   1060      pytest.param("min", lambda vals: _ref_inclusive_scan_op(vals, _PY_MIN, 0), True, False, id="min"),
   1061      pytest.param("max", lambda vals: _ref_inclusive_scan_op(vals, _PY_MAX, 0), True, False, id="max"),
   1062  ]
def _assert_block_scan_close(actual, expected_j, dtype, *, tol_relative, ctx):
    """Check one thread's scan output against its reference value.

    Three regimes: exact equality for int dtypes (members of ``_BLOCK_REDUCE_INT_DTYPES``); for floats with
    ``tol_relative`` a bound of ``1e-4`` scaled by ``|expected_j|`` but never by less than ``1.0`` (so near-zero
    early prefixes don't shrink the bound to nothing); otherwise an absolute bound of ``1e-5``. ``ctx`` prefixes the
    failure message.
    """
    if dtype in _BLOCK_REDUCE_INT_DTYPES:
        within_tolerance = actual == expected_j
    elif tol_relative:
        # Floor the scale at 1.0; ``max(1.0, x)`` yields ``x`` only when ``x > 1.0``.
        scale = max(1.0, abs(expected_j))
        within_tolerance = abs(actual - expected_j) < 1e-4 * scale
    else:
        within_tolerance = abs(actual - expected_j) < 1e-5
    assert within_tolerance, f"{ctx}: got {actual}, expected {expected_j}"
🟢 1078  @pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_INCLUSIVE_SCAN_OP_CASES)
🟢 1082  def test_block_inclusive(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
   1083      """Block inclusive prefix scan: thread ``i`` holds ``<op>(src[block_base..i])``. Unified across ``add`` / ``min``
   1084      / ``max``."""
🟢 1086      op_fn = getattr(block, f"inclusive_{op_name}")
🔴 1097              dst[i] = op_fn(src[i], block_dim, dtype)
🟢 1099      _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
🟢 1104          expected = ref_fn(block_vals)
🟢 1107              _assert_block_scan_close(actual, expected[j], dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
🟢 1113  def test_block_exclusive_add(dtype, sg_per_block):
   1114      """Block exclusive prefix sum: thread ``i`` holds ``sum(src[block_base..i-1])``; thread 0 holds 0."""
🔴 1126              dst[i] = block.exclusive_add(src[i], block_dim, dtype)
🟢 1128      _init_field(src, N, dtype)
🟢 1133          expected = _ref_exclusive_scan_add(block_vals)
🟢 1136              _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}")
🟢 1139  _BLOCK_EXCLUSIVE_MINMAX_CASES = [
   1140      # (op_name, sentinel_fn, py_op, inf_sign)
   1141      pytest.param("min", _block_exclusive_min_sentinel, _PY_MIN, 1, id="min"),
   1142      pytest.param("max", _block_exclusive_max_sentinel, _PY_MAX, -1, id="max"),
   1143  ]
🟢 1146  @pytest.mark.parametrize("op_name,sentinel_fn,py_op,inf_sign", _BLOCK_EXCLUSIVE_MINMAX_CASES)
🟢 1150  def test_block_exclusive_minmax(dtype, sg_per_block, op_name, sentinel_fn, py_op, inf_sign):
   1151      """Block exclusive prefix ``<op>`` for ``op in {min, max}``; thread 0 of each block holds the dtype-derived
   1152      identity (``+inf`` / ``iinfo(dtype).max`` for min, ``-inf`` / ``iinfo(dtype).min`` for max). The float ``inf`` /
   1153      ``-inf`` lane-0 identity gets a sign-only check because ``inf - inf`` (or ``(-inf) - (-inf)``) is ``NaN`` and the
   1154      standard ``abs(diff) < tol`` compare would fail spuriously."""
🟢 1156      op_fn = getattr(block, f"exclusive_{op_name}")
🔴 1167              dst[i] = op_fn(src[i], block_dim, dtype)
🟢 1169      _init_block_reduce_src(src, N, dtype, permuted=True)
🟢 1172      sentinel = sentinel_fn(dtype)
🟢 1175          expected = _ref_exclusive_scan_op(block_vals, py_op, sentinel)
🟢 1181                  assert math.isinf(actual) and (
   1182                      actual > 0 if inf_sign > 0 else actual < 0
   1183                  ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
   3427  # Each entry is a thin ``_check_full_matches_tiled(subgroup.X, subgroup.X_tiled, ...)`` wrapper. Collapsed into one
   3428  # op-parametrized test to drop ~80 LOC of duplication. Each pytest id is the ``<op>`` suffix of the original
   3429  # ``test_subgroup_<op>`` function, so test reports stay recognizable and ``-k <op>`` selectors keep matching.
🟢 3430  _FULL_VS_TILED_INT_CASES = [
   3431      pytest.param("reduce_add", None, id="reduce_add"),
   3432      pytest.param("reduce_all_add", None, id="reduce_all_add"),
   3433      pytest.param("reduce_min", None, id="reduce_min"),
   3434      pytest.param("reduce_max", None, id="reduce_max"),
   3435      pytest.param("reduce_all_min", None, id="reduce_all_min"),
   3436      pytest.param("reduce_all_max", None, id="reduce_all_max"),
   3437      pytest.param("inclusive_add", None, id="inclusive_add"),
   3438      pytest.param("inclusive_min", None, id="inclusive_min"),
   3439      pytest.param("inclusive_max", None, id="inclusive_max"),
   3440      # `mul` needs bounded inputs (2**N overflows i32 quickly); bitwise ops need a per-lane bit pattern that's
   3441      # non-zero on every lane so AND has signal and OR / XOR have varied bits.
   3442      pytest.param("inclusive_mul", _init_full_small_int, id="inclusive_mul"),
   3443      pytest.param("inclusive_and", _init_full_bitwise, id="inclusive_and"),
   3444      pytest.param("inclusive_or", _init_full_bitwise, id="inclusive_or"),
   3445      pytest.param("inclusive_xor", _init_full_bitwise, id="inclusive_xor"),
   3446      pytest.param("exclusive_add", None, id="exclusive_add"),
   3447      pytest.param("exclusive_mul", _init_full_small_int, id="exclusive_mul"),
   3448      pytest.param("exclusive_and", _init_full_bitwise, id="exclusive_and"),
   3449      pytest.param("exclusive_or", _init_full_bitwise, id="exclusive_or"),
   3450      pytest.param("exclusive_xor", _init_full_bitwise, id="exclusive_xor"),
   3451  ]
🟢 3454  @pytest.mark.parametrize("op_name,host_init", _FULL_VS_TILED_INT_CASES)
🟢 3456  def test_subgroup_full_matches_tiled(op_name, host_init):
   3457      """For each subgroup op ``X``, verify ``subgroup.X(v)`` matches ``subgroup.X_tiled(v, log2_group_size())``
   3458      lane-by-lane on ``qd.i32``. Covers reduce / inclusive / exclusive families; ``mul`` uses a custom initializer
   3459      that keeps the running product bounded, and bitwise ops use one that gives every lane a non-zero bit pattern."""
🟢 3460      full_fn = getattr(subgroup, op_name)
🟢 3461      tiled_fn = getattr(subgroup, f"{op_name}_tiled")
🟢 3462      kwargs = {}
🟢 3463      if host_init is not None:
🟢 3464          kwargs["host_init"] = host_init
🟢 3465      _check_full_matches_tiled(full_fn, tiled_fn, **kwargs)
🟢 3610  @pytest.mark.parametrize("op_name", ["reduce_add", "inclusive_add"])
🟢 3613  def test_subgroup_full_matches_tiled_float(op_name, dtype):
   3614      """Float-dtype coverage of the dtype-agnostic ``full`` wrappers (``reduce_add``, ``inclusive_add``). One f32 + one
   3615      f64 case per family is enough to catch an i32-only regression in a wrapper."""
🟢 3616      full_fn = getattr(subgroup, op_name)
🟢 3617      tiled_fn = getattr(subgroup, f"{op_name}_tiled")
🟢 3618      _check_full_matches_tiled(full_fn, tiled_fn, dtype=dtype)
🟢 tests/python/test_tile16.py (100%)
     95  # 8 geometries x 2 tensor_type x 2 qd_dtype = 32 parametrize cases. The geometries enumerate hand-picked corner cases
     96  # (origin, non-zero src/dst offsets, partial cols/rows, oversize backing array); coverage of any single geometry is
     97  # more valuable than running every combination every CI run. ``@pytest.mark.sample(n=6)`` keeps 6 of the 32 cases per
     98  # run; after k runs each specific case is hit with probability 1 - (26/32)^k = 1 - 0.8125^k (~65% after 5 runs, ~98%
     99  # after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
🟢  100  @pytest.mark.sample(n=6)
    448  # 3 dst_delta x 3 src_offset x 2 tensor_type x 2 qd_dtype = 36 parametrize cases. Each case is an independent offset /
    449  # delta combo; running 6 random ones per CI run means any given case is hit within 20 runs with probability
    450  # 1 - (30/36)^20 ~= 97%, the right tradeoff given each case takes ~5s of cluster wall time. See unit_testing.md.
🟢  451  @pytest.mark.sample(n=6)
   1789      """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.
   1790  
   1791      Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the JIT compile of the 3 unrolled kernels
   1792      and the benchmark loop both stay cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised by
   1793      anyone running the script manually, not by CI.
   1794      """
🟢 1796      cmd = [
   1797          sys.executable,
   1798          str(demo),
   1799          "--n",
   1800          "32",
   1801          "--n-envs",
   1802          "64",
   1803          "--num-warmup",
   1804          "1",
   1805          "--num-iters",
   1806          "1",
   1807      ]
🟢 1808      result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)