[CI] Optimize unit tests including adding stochasticity#709
Draft
hughperkins wants to merge 31 commits into
Draft
[CI] Optimize unit tests including adding stochasticity#709hughperkins wants to merge 31 commits into
hughperkins wants to merge 31 commits into
GitHub Actions / Coverage Report
succeeded
May 27, 2026 in 0s
Diff Coverage Report
See details below for per-line coverage annotations.
Details
Coverage Report (d0ef1f0df)
| Metric | Value |
|---|---|
| Diff coverage (changed lines only) | 95% |
| Overall project coverage | 73% |
Total: 228 lines, 12 missing, 95% covered
🟢 tests/python/conftest.py (87%)
🟢 2 import hashlib
🟢 4 import random
20 # ---------------------------------------------------------------------------
21 # @pytest.mark.sample(...) -- per-test stochastic parametrize subsampling
22 # ---------------------------------------------------------------------------
23 #
24 # Some tests parametrize so widely (test_tile16_load_store, test_tile16_cholesky, ...) that running every case on every
25 # CI run is wasteful: the parametrize axes are intentionally varied to cover corner cases, but most runs would get the
26 # same signal from a small random subset. ``@pytest.mark.sample(n=...)`` or ``@pytest.mark.sample(fraction=...)`` opts a
27 # *single* test into per-run random sub-selection. Over many runs, each parametrize case asymptotically gets covered
28 # (Pr[hit after k runs] = 1 - (1 - keep/total)^k).
29 #
30 # Reproducibility hooks:
31 # - whole-suite: ``--sample-seed=<S>`` reproduces the exact same trimmed set (header prints the seed used).
32 # - single failing case: paste the failing nodeid into ``pytest <nodeid>`` -- the sampler's ``len(group) <= 1``
33 # short-circuit keeps it; no flags needed.
34 # - exhaustive run (release gate / coverage audit): ``--no-sample`` skips the sampler entirely.
35 #
36 # Per-test RNG keyed on ``(seed, nodeid_prefix)``: adding / renaming a @sample-marked test does NOT shift any other
37 # test's sample. Routine refactors don't migrate failures.
38
39
def pytest_addoption(parser):
    """Register the two command-line flags that steer ``@pytest.mark.sample`` subsampling.

    ``--sample-seed`` takes an int and defaults to ``None``; ``--no-sample`` is a boolean switch that defaults to
    ``False``.
    """
    seed_help = (
        "Seed for @pytest.mark.sample subsampling. If absent, a fresh seed is picked and printed "
        "in the report header so a failing run can be reproduced via --sample-seed=<S>."
    )
    exhaustive_help = (
        "Disable @pytest.mark.sample subsampling -- run every parametrize case of every marked test. "
        "Use for exhaustive CI release gates / coverage-debt audits."
    )
    parser.addoption("--sample-seed", type=int, default=None, help=seed_help)
    parser.addoption("--no-sample", action="store_true", default=False, help=exhaustive_help)
55
56
@pytest.hookimpl(tryfirst=True)
def pytest_configure(config):
    """Register the ``sample`` marker and, when nothing else supplied one, pick this run's sampling seed.

    A random seed in ``[0, 2**31)`` is written to ``config.option.sample_seed`` only when ``--no-sample`` is unset,
    ``--sample-seed`` was not passed, and ``config`` carries no ``workerinput`` attribute.
    """
    # Registered from the conftest (not only pytest.ini) so `--strict-markers` users that import this conftest in
    # isolation still see the marker.
    marker_spec = (
        "sample(fraction=None, n=None): per-test stochastic parametrize subsampling. Pass exactly one of "
        "`fraction` (0..1) or `n` (>= 1). Seed printed in report header; rerun the same sample with "
        "--sample-seed=<S>; rerun every case with --no-sample; rerun a single failing case by pasting its nodeid."
    )
    config.addinivalue_line("markers", marker_spec)

    # Seed propagation contract (per the design notes for this hook): the controller and every xdist worker must
    # agree on the seed, otherwise xdist aborts with "Different tests were collected between gw0 and gwN". The seed
    # is therefore expected on argv as ``--sample-seed=N`` (``tests/run_tests.py`` injects one per run); the fallback
    # below only serves direct ``pytest`` invocations. ``os.environ`` is deliberately left alone here.
    if config.getoption("--no-sample"):
        return
    if config.getoption("--sample-seed") is not None:
        return
    if hasattr(config, "workerinput"):
        # Presumably an xdist worker: never invent a seed here, it would differ from the controller's.
        return
    config.option.sample_seed = random.randrange(0, 2**31)
80
81
def pytest_report_header(config):
    """Return the report-header line describing the sampling mode, or ``None`` when no seed is known.

    With ``--no-sample`` the line states that every case runs; otherwise it echoes the active seed together with
    the three reproduction recipes.
    """
    if config.getoption("--no-sample"):
        return "sample: --no-sample (every @sample-marked test runs every parametrize case)"

    seed = config.getoption("--sample-seed")
    if seed is not None:
        header = (
            f"sample-seed={seed} (reproduce the same sample: --sample-seed={seed}; "
            f"reproduce a single failure: paste its nodeid; run every case: --no-sample)"
        )
        return header
    return None
92
93
🟢 94 def _sample_keep_count(mark, group_size, group_key):
95 """Resolve ``@pytest.mark.sample(fraction=..., n=...)`` for a group of ``group_size`` parametrize cases.
96
97 Exactly one of ``fraction`` (0..1) or ``n`` (int >= 1) must be passed; ``UsageError`` otherwise. The result is
98 clamped to ``[1, group_size]`` so every @sample-marked test runs at least one case per run (no silent zero-case
99 runs even if e.g. ``fraction * group_size`` rounds to zero on a 1-case group).
100 """
🟢 101 fraction = mark.kwargs.get("fraction")
🟢 102 n = mark.kwargs.get("n")
🟢 103 if (fraction is None) == (n is None):
🔴 104 raise pytest.UsageError(
105 f"@pytest.mark.sample on {group_key!r}: pass exactly one of `fraction` or `n`, got "
106 f"fraction={fraction!r}, n={n!r}"
107 )
🟢 108 if fraction is not None:
🔴 109 return max(1, int(round(group_size * float(fraction))))
🟢 110 return max(1, min(int(n), group_size))
111
112
🟢 113 def pytest_collection_modifyitems(config, items):
🟢 114 if config.getoption("--no-sample"):
🔴 115 return
🟢 116 seed = config.getoption("--sample-seed")
🟢 117 if seed is None:
118 # Defensive: pytest_configure didn't run (e.g. someone imported this module manually). Nothing to do.
🔴 119 return
120
121 # Group items by test function (strip the parametrize bracket suffix). Per-function stratification is what
122 # guarantees every @sample-marked test keeps at least one case per run -- uniform sampling across all items
123 # could otherwise drop a 2-case marked test entirely.
🟢 124 groups: dict[str, list] = {}
🟢 125 for item in items:
🟢 126 key = item.nodeid.split("[", 1)[0]
🟢 127 groups.setdefault(key, []).append(item)
128
🟢 129 keep, deselected = [], []
130 # ``sorted(groups)`` so the iteration order (and therefore any incidental RNG advance) is reproducible across
131 # Python versions / dict insertion orders. Per-test RNG is keyed below so this only matters for the (cheap)
132 # bookkeeping order.
🟢 133 for key in sorted(groups):
🟢 134 group = groups[key]
🟢 135 mark = group[0].get_closest_marker("sample")
🟢 136 if mark is None or len(group) <= 1:
137 # No sample mark -> every case runs. Also: a single-item group means either the test only had one
138 # parametrize case to begin with, or pytest narrowed collection to a specific nodeid -- both cases
139 # should run as-is. This is what makes "paste failing nodeid" work without --no-sample.
🟢 140 keep.extend(group)
🟢 141 continue
🟢 142 keep_n = _sample_keep_count(mark, len(group), key)
143 # Per-test RNG: keyed on (seed, key) so:
144 # - Independence: adding / renaming / tweaking the @sample mark on test_A does NOT shift the sample of test_B.
145 # Routine refactors don't cause failures to migrate file-wide.
146 # - Locality: when debugging, you can reason about one test's sample without simulating all the others' RNG
147 # advances.
148 # Seed mixing uses sha256 of a canonical ``f"{seed}|{key}"`` rather than ``random.Random((seed, key))``: tuple
149 # seeds fall back to the built-in ``hash()`` (salted per process for str members unless PYTHONHASHSEED is set),
150 # emit ``DeprecationWarning: Seeding based on hashing is deprecated`` on Python 3.9 / 3.10 and raise ``TypeError``
151 # from 3.11 on. We pin to an explicit hash so the sample is reproducible across processes and Python versions.
152 # CRITICAL: ``rng.sample(group_sorted, ...)`` rather than ``rng.sample(group, ...)``. xdist workers each run
153 # ``pytest_collection_modifyitems`` independently and pytest does NOT guarantee that ``items`` (and therefore
154 # ``group``) lands in the same in-memory order on every worker. With the same seed but a differently-ordered
155 # list, ``rng.sample`` would pick the same indices but those indices would resolve to different items, so
156 # workers would collect different subsets and xdist's collection-consistency check would abort the run with
157 # "Different tests were collected between gw0 and gwN". Sorting by ``nodeid`` (a content-derived total order)
158 # forces every worker to sample from an identical sequence.
🟢 159 group_sorted = sorted(group, key=lambda it: it.nodeid)
🟢 160 mixed = int.from_bytes(hashlib.sha256(f"{seed}|{key}".encode()).digest()[:8], "big")
🟢 161 rng = random.Random(mixed)
🟢 162 kept_nodeids = {it.nodeid for it in rng.sample(group_sorted, k=keep_n)}
🟢 163 for it in group:
🟢 164 (keep if it.nodeid in kept_nodeids else deselected).append(it)
165
🟢 166 if deselected:
167 # ``pytest_deselected`` is the supported way to report filtered-out items so pytest's summary shows them as
168 # deselected (not silently dropped). xdist also forwards this to the controller correctly.
🟢 169 config.hook.pytest_deselected(items=deselected)
🟢 170 items[:] = keep
171
172
🟢 tests/python/test_ad_gdar_diffmpm.py (100%)
8 # Defaults shrink particle / grid / steps counts so the JIT compile + AD-tape replay stays cheap; the slow-marked
9 # entry keeps the original (N=30, n_grid=120, steps=32) workload that runs on --run-slow. The point of the test is
10 # that the AD-validation checker fires on the global-data-access violation in g2p (`v[f, p] = new_v`), which happens
11 # on the first substep regardless of size.
🟢 12 @pytest.mark.parametrize(
13 "particles_side,n_grid_size,num_steps",
14 [
15 (8, 32, 4),
16 pytest.param(30, 120, 32, marks=pytest.mark.slow),
17 ],
18 )
🟢 20 def test_gdar_mpm(particles_side, n_grid_size, num_steps):
🟢 24 N = particles_side
🟢 26 n_grid = n_grid_size
🟢 35 max_steps = num_steps
🟢 36 steps = num_steps
🟢 tests/python/test_algorithms.py (100%)
🟢 323 _REDUCE_OPS = ["add", "min", "max"]
🟢 326 def _reduce_host(rng, op, dtype, N):
327 """Generate the test input for a reduce of `op` on `dtype` x N values.
328
329 ``add`` uses small uniform / bounded values so float sums stay representable; ``min`` and ``max`` use a wider
330 range (-10..10 for floats, +-10000 for ints) since picking-an-element is bitwise-exact regardless of magnitude.
331 """
🟢 332 if op == "add":
🟢 333 return _rand_reduce_host(rng, dtype, N)
🟢 335 return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
🟢 336 return _rand_reduce_host(rng, dtype, N, bound=10000)
🟢 339 def _check_reduce(op, dtype, N):
340 """Run ``device_reduce_<op>(arr)`` and verify against ``numpy.<op>(arr)``.
341
342 ``add`` accumulates so it needs (a) wider integer promotion + mod-wrap masking for u32/u64 and (b) per-N float
343 tolerance. ``min`` / ``max`` pick one input element, so they're bitwise-exact for both ints and floats.
344 """
🟢 348 host = _reduce_host(rng, op, dtype, N)
🟢 351 qd_fn = getattr(qd.algorithms, f"device_reduce_{op}")
🟢 352 qd_fn(inp, out=out)
🟢 355 if op == "add":
🟢 356 if _is_float(dtype):
🟢 357 expected = float(np.sum(host.astype(np.float64)))
🟢 358 rtol, atol = (_F32_REDUCE_RTOL, _F32_REDUCE_ATOL) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
🟢 359 assert math.isclose(
360 got, expected, rel_tol=rtol, abs_tol=atol
361 ), f"{dtype} reduce_add(N={N}): got {got}, expected {expected}"
362 else:
363 # Promote to Python int for an arbitrary-width reference; mask both sides to dtype width to handle the
364 # u32 / u64 mod-wrap case at large N.
🟢 365 mod = 1 << (32 if dtype in (qd.i32, qd.u32) else 64) if _is_unsigned(dtype) else None
🟢 366 ref = int(
367 np.sum(
368 host.astype(np.int64 if dtype in (qd.i32, qd.u32) else (np.int64 if dtype == qd.i64 else np.uint64))
369 )
370 ) # noqa: E501
🟢 371 got_int = int(got)
🟢 372 if mod is not None:
🟢 373 ref &= mod - 1
🟢 374 got_int &= mod - 1
🟢 375 assert got_int == ref, f"{dtype} reduce_add(N={N}): got {got_int}, expected {ref}"
🟢 376 return
377
🟢 378 expected = host.min() if op == "min" else host.max()
🟢 382 assert int(got) == int(expected), f"{dtype} reduce_{op}(N={N}): got {got}, expected {expected}"
🟢 385 @pytest.mark.parametrize("op", _REDUCE_OPS)
🟢 389 def test_device_reduce(op, dtype, N):
390 """``device_reduce_{add,min,max}`` match numpy across the full size sweep + dtype set.
392 Unified across the three op variants. ``add`` accumulates so it needs overflow / precision-aware comparison;
393 ``min`` / ``max`` pick one element of the input and are bitwise-exact.
394 """
🟢 395 _check_reduce(op, dtype, N)
🟢 450 _SCAN_OPS = ["add", "min", "max"]
🟢 453 def _scan_host(rng, op, dtype, N):
454 """Generate the test input for a scan of `op` on `dtype` x N values. Same rationale as ``_reduce_host``."""
🟢 455 if op == "add":
🟢 456 return _rand_reduce_host(rng, dtype, N, bound=100)
🟢 458 return rng.uniform(-10.0, 10.0, size=N).astype(_DTYPE_TO_NP[dtype])
🟢 459 return _rand_reduce_host(rng, dtype, N, bound=10000)
🟢 462 def _check_scan(op, dtype, N):
463 """Run ``device_exclusive_scan_<op>(arr)`` and verify against ``numpy.<op>.accumulate``-shifted.
464
465 Like the reduce family, ``add`` accumulates (overflow / precision care) while ``min`` / ``max`` are
466 bitwise-exact in both float and int paths.
467 """
🟢 472 host = _scan_host(rng, op, dtype, N)
🟢 475 qd_fn = getattr(qd.algorithms, f"device_exclusive_scan_{op}")
🟢 476 qd_fn(inp, out=out)
🟢 479 if op == "add":
🟢 480 if _is_float(dtype):
🟢 481 ref = np.concatenate([[0.0], np.cumsum(host.astype(np.float64))[:-1]])
🟢 482 rtol, atol = _f32_scan_tol(N) if dtype == qd.f32 else (_F64_RTOL, _F64_ATOL)
🟢 483 np.testing.assert_allclose(
484 got.astype(np.float64),
485 ref,
486 rtol=rtol,
487 atol=atol,
488 err_msg=f"{dtype} scan_add(N={N})",
489 )
490 else:
491 # Promote to a 64-bit NumPy integer for the cumulative sum: i32 / u32 / i64 inputs use int64 and u64 uses
492 # uint64. Both sides are then masked with ``_scan_dtype_mask(dtype)`` whenever that mask is not -1.
🟢 493 promote = np.int64 if dtype in (qd.i32, qd.u32, qd.i64) else np.uint64
🟢 494 host_wide = host.astype(promote)
🟢 495 ref = np.concatenate([[promote(0)], np.cumsum(host_wide)[:-1]]).astype(promote)
🟢 496 mask = _scan_dtype_mask(dtype)
🟢 497 got_view = got.astype(np.int64 if dtype != qd.u64 else np.uint64)
🟢 498 if mask != -1:
🟢 499 got_view = got_view & promote(mask)
🟢 500 ref = ref & promote(mask)
🟢 501 np.testing.assert_array_equal(got_view, ref, err_msg=f"{dtype} scan_add(N={N})")
🟢 502 return
503
🟢 504 np_accum = np.minimum.accumulate if op == "min" else np.maximum.accumulate
🟢 505 identity_table = _MIN_IDENTITY if op == "min" else _MAX_IDENTITY
🟢 507 identity = float("inf") if op == "min" else float("-inf")
🟢 508 ref = np.concatenate([[identity], np_accum(host.astype(np.float64))[:-1]]).astype(np_dt)
🟢 509 np.testing.assert_allclose(got, ref, rtol=0, atol=0, err_msg=f"{dtype} scan_{op}(N={N})")
🟢 511 ref = np.concatenate([[np_dt(identity_table[dtype])], np_accum(host)[:-1]]).astype(np_dt)
🟢 512 np.testing.assert_array_equal(got, ref, err_msg=f"{dtype} scan_{op}(N={N})")
🟢 515 @pytest.mark.parametrize("op", _SCAN_OPS)
🟢 519 def test_device_exclusive_scan(op, dtype, N):
520 """``device_exclusive_scan_{add,min,max}`` match ``numpy.{cumsum, minimum.accumulate, maximum.accumulate}``-shifted
521 across the full size sweep + dtype set. Unified across the three op variants; same overflow vs bitwise-exact
522 handling as the reduce family."""
🟢 523 _check_scan(op, dtype, N)
🟢 tests/python/test_eig.py (100%)
🟢 298 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 314 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 361 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 371 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 407 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 448 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 458 @pytest.mark.parametrize("n", [4, pytest.param(12, marks=pytest.mark.slow)])
🟢 538 @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢 544 @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢 tests/python/test_linalg.py (100%)
🟢 157 @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢 163 @pytest.mark.parametrize("n", [3, pytest.param(12, marks=pytest.mark.slow)])
🟢 192 @pytest.mark.parametrize(
193 "rows,cols",
194 [
195 pytest.param(9, 12, marks=pytest.mark.slow),
196 pytest.param(12, 3, marks=pytest.mark.slow),
197 (2, 4),
198 ],
199 )
🟢 205 @pytest.mark.parametrize(
206 "rows,cols",
207 [
208 pytest.param(9, 12, marks=pytest.mark.slow),
209 pytest.param(12, 3, marks=pytest.mark.slow),
210 (2, 4),
211 ],
212 )
🟢 218 def _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, dt):
219 """3-way matmul chain: ``(rows_a × cols_a) · (cols_a × cols_b) · (cols_b × cols_c) → (rows_a × cols_c)``.
221 Verifies that ``Matrix.__matmul__`` compiles and is numerically correct at the requested size. Quadrants
222 imposes no enforced size cap on matmul, but the unrolled `static(range)` triple loop produces
223 ``rows_a * cols_a * cols_b + rows_a * cols_b * cols_c`` FMAs per kernel call, so this test catches compile-time
224 blow-up or back-end miscompiles at large sizes. The largest parametrize value is the chain qipc actually uses;
225 smaller values are cheap sanity checks that the same code path still works.
🟢 228 A_np = np.random.default_rng(0xCA70).standard_normal((rows_a, cols_a)).astype(np_dt)
🟢 229 B_np = np.random.default_rng(0xCA71).standard_normal((cols_a, cols_b)).astype(np_dt)
🟢 230 C_np = np.random.default_rng(0xCA72).standard_normal((cols_b, cols_c)).astype(np_dt)
🟢 232 A = qd.Matrix.field(rows_a, cols_a, dtype=dt, shape=())
🟢 233 B = qd.Matrix.field(cols_a, cols_b, dtype=dt, shape=())
🟢 234 C = qd.Matrix.field(cols_b, cols_c, dtype=dt, shape=())
🟢 235 AB = qd.Matrix.field(rows_a, cols_b, dtype=dt, shape=())
🟢 236 ABC_chained = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
🟢 237 ABC_staged = qd.Matrix.field(rows_a, cols_c, dtype=dt, shape=())
260 # qipc's actual size is (9,12,12,9) -- the largest chain it instantiates. We also keep a tiny (3,4,4,3) chain so
261 # the default fast lane still exercises the same Matrix.__matmul__ codegen path without paying the ~90s/case
262 # CUDA JIT cost of the qipc-sized chain.
🟢 263 _MATMUL_CHAIN_SHAPES = [
264 (3, 4, 4, 3),
265 pytest.param(9, 12, 12, 9, marks=pytest.mark.slow),
266 ]
267
268
🟢 269 @pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
🟢 271 def test_matmul_chain_qipc_sizes_f32(rows_a, cols_a, cols_b, cols_c):
🟢 272 _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f32)
🟢 275 @pytest.mark.parametrize("rows_a,cols_a,cols_b,cols_c", _MATMUL_CHAIN_SHAPES)
🟢 277 def test_matmul_chain_qipc_sizes_f64(rows_a, cols_a, cols_b, cols_c):
🟢 278 _test_matmul_chain(rows_a, cols_a, cols_b, cols_c, qd.f64)
🟢 464 @pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
🟢 474 @pytest.mark.parametrize("n", [5, pytest.param(12, marks=pytest.mark.slow)])
🟢 tests/python/test_simt.py (93%)
890 # The three single-output reduces (`test_block_reduce_{add,min,max}`) and their three broadcast siblings
891 # (`test_block_reduce_all_{add,min,max}`) share the same kernel skeleton, parametrize axes, and verification loop;
892 # they differ only in (a) which `block.reduce_*` function gets called, (b) the host-side reference oracle, (c) the
893 # init pattern (sequential for `add` so the running sum has signal, permuted hash for `min` / `max` so the result
894 # depends on lanes other than first / last), and (d) the float tolerance regime (`add` accumulates so it uses a
895 # relative tol; `min` / `max` pick one element of the input and use an absolute tol).
🟢 896 _BLOCK_REDUCE_OP_CASES = [
897 # (op_name, ref_fn, init_permuted, tol_relative)
898 pytest.param("add", _ref_reduce_add, False, True, id="add"),
899 pytest.param("min", _ref_reduce_min, True, False, id="min"),
900 pytest.param("max", _ref_reduce_max, True, False, id="max"),
901 ]
🟢 904 def _init_block_reduce_src(src, N, dtype, *, permuted):
905 """Initialize ``src[0:N]`` for a block reduce test. ``permuted=False`` is the sequential ``1..N`` init from
906 ``_init_field`` (good for add); ``permuted=True`` is the stable hash ``((i * 1009) % 997) + 1`` so the per-block
907 min / max depends on lanes other than first / last."""
🟢 908 if permuted:
🟢 910 v = ((i * 1009) % 997) + 1
🟢 911 src[i] = v if dtype in _BLOCK_REDUCE_INT_DTYPES else 1.0 * v
912 else:
🟢 913 _init_field(src, N, dtype)
🟢 916 def _assert_block_reduce_close(actual, expected, dtype, *, tol_relative, ctx):
917 """Assert ``actual ~= expected`` per the block-reduce tolerance regime.
919 Int dtypes compare exactly. Floats use relative tolerance ``1e-4 * |expected|`` for accumulating ops (sums grow
920 with block_dim, so a relative bound is the only thing that stays meaningful across the 32 / 128 / 256 / 64 / 256 /
921 512 block-size sweep), and absolute tolerance ``1e-5`` for picker ops (min / max pick one element so the
922 magnitude is whatever was in the input -- a small absolute bound suffices).
923 """
🟢 924 if dtype in _BLOCK_REDUCE_INT_DTYPES:
🟢 925 assert actual == expected, f"{ctx}: got {actual}, expected {expected}"
🟢 926 elif tol_relative:
🟢 927 assert abs(actual - expected) < 1e-4 * abs(expected), f"{ctx}: got {actual}, expected {expected}"
928 else:
🟢 929 assert abs(actual - expected) < 1e-5, f"{ctx}: got {actual}, expected {expected}"
931
🟢 932 @pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
🟢 936 def test_block_reduce(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
937 """Block reduce: thread 0 of each block holds ``<op>(src[block_base:block_base+block_dim])``. Unified across
938 ``add`` / ``min`` / ``max`` -- op-name is closure-captured into ``@qd.kernel``."""
🟢 940 op_fn = getattr(block, f"reduce_{op_name}")
🔴 952 agg = op_fn(src[i], block_dim, dtype)
🟢 956 _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
🟢 961 expected = ref_fn(block_vals)
🟢 962 _assert_block_reduce_close(dst[b], expected, dtype, tol_relative=tol_relative, ctx=f"block {b}")
🟢 965 @pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_REDUCE_OP_CASES)
🟢 969 def test_block_reduce_all(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
970 """Block reduce broadcast: every thread of each block holds the block-wide ``<op>``. Verified by writing the
971 per-thread output to a flat field, then asserting every thread of a given block reads the same aggregate.
972 Unified across ``add`` / ``min`` / ``max``."""
🟢 974 op_fn = getattr(block, f"reduce_all_{op_name}")
🔴 985 dst[i] = op_fn(src[i], block_dim, dtype)
🟢 987 _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
🟢 992 expected = ref_fn(block_vals)
🟢 995 _assert_block_reduce_close(actual, expected, dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
1048 # The four scan tests in this group (`test_block_inclusive_{add,min,max}` + `test_block_exclusive_add`) share the
1049 # kernel skeleton; only the per-op reference oracle, init pattern, and float tolerance differ. `add` accumulates
1050 # (sequential init, relative tol); `min` / `max` pick (permuted init, absolute tol). Exclusive `min` / `max` get
1051 # their own dedicated test below because they need a dtype-derived sentinel identity (+inf / iinfo(max), -inf /
1052 # iinfo(min)) at lane 0 with explicit ``isinf`` handling -- different enough that fusing them in would create more
1053 # branches than it removes.
🟢 1054 _PY_MIN = lambda a, b: a if a < b else b # noqa: E731 (intentional 1-line lambda for ref oracle)
🟢 1055 _PY_MAX = lambda a, b: a if a > b else b # noqa: E731
1056
🟢 1057 _BLOCK_INCLUSIVE_SCAN_OP_CASES = [
1058 # (op_name, ref_fn, init_permuted, tol_relative)
1059 pytest.param("add", _ref_inclusive_scan_add, False, True, id="add"),
1060 pytest.param("min", lambda vals: _ref_inclusive_scan_op(vals, _PY_MIN, 0), True, False, id="min"),
1061 pytest.param("max", lambda vals: _ref_inclusive_scan_op(vals, _PY_MAX, 0), True, False, id="max"),
1062 ]
🟢 1065 def _assert_block_scan_close(actual, expected_j, dtype, *, tol_relative, ctx):
1066 """Per-thread assertion for block scan tests. Same int / relative-float / absolute-float regime as
1067 ``_assert_block_reduce_close`` but with a floor on the relative-tol base so the first few prefixes (where
1068 ``expected_j`` is near zero) don't tighten the bound to zero."""
🟢 1069 if dtype in _BLOCK_REDUCE_INT_DTYPES:
🟢 1070 assert actual == expected_j, f"{ctx}: got {actual}, expected {expected_j}"
🟢 1071 elif tol_relative:
🟢 1072 tol_base = abs(expected_j) if abs(expected_j) > 1.0 else 1.0
🟢 1073 assert abs(actual - expected_j) < 1e-4 * tol_base, f"{ctx}: got {actual}, expected {expected_j}"
1074 else:
🟢 1075 assert abs(actual - expected_j) < 1e-5, f"{ctx}: got {actual}, expected {expected_j}"
🟢 1078 @pytest.mark.parametrize("op_name,ref_fn,init_permuted,tol_relative", _BLOCK_INCLUSIVE_SCAN_OP_CASES)
🟢 1082 def test_block_inclusive(dtype, sg_per_block, op_name, ref_fn, init_permuted, tol_relative):
1083 """Block inclusive prefix scan: thread ``i`` holds ``<op>(src[block_base..i])``. Unified across ``add`` / ``min``
1084 / ``max``."""
🟢 1086 op_fn = getattr(block, f"inclusive_{op_name}")
🔴 1097 dst[i] = op_fn(src[i], block_dim, dtype)
🟢 1099 _init_block_reduce_src(src, N, dtype, permuted=init_permuted)
🟢 1104 expected = ref_fn(block_vals)
🟢 1107 _assert_block_scan_close(actual, expected[j], dtype, tol_relative=tol_relative, ctx=f"block {b} thread {j}")
🟢 1113 def test_block_exclusive_add(dtype, sg_per_block):
1114 """Block exclusive prefix sum: thread ``i`` holds ``sum(src[block_base..i-1])``; thread 0 holds 0."""
🔴 1126 dst[i] = block.exclusive_add(src[i], block_dim, dtype)
🟢 1128 _init_field(src, N, dtype)
🟢 1133 expected = _ref_exclusive_scan_add(block_vals)
🟢 1136 _assert_block_scan_close(actual, expected[j], dtype, tol_relative=True, ctx=f"block {b} thread {j}")
🟢 1139 _BLOCK_EXCLUSIVE_MINMAX_CASES = [
1140 # (op_name, sentinel_fn, py_op, inf_sign)
1141 pytest.param("min", _block_exclusive_min_sentinel, _PY_MIN, 1, id="min"),
1142 pytest.param("max", _block_exclusive_max_sentinel, _PY_MAX, -1, id="max"),
1143 ]
🟢 1146 @pytest.mark.parametrize("op_name,sentinel_fn,py_op,inf_sign", _BLOCK_EXCLUSIVE_MINMAX_CASES)
🟢 1150 def test_block_exclusive_minmax(dtype, sg_per_block, op_name, sentinel_fn, py_op, inf_sign):
1151 """Block exclusive prefix ``<op>`` for ``op in {min, max}``; thread 0 of each block holds the dtype-derived
1152 identity (``+inf`` / ``iinfo(dtype).max`` for min, ``-inf`` / ``iinfo(dtype).min`` for max). The float ``inf`` /
1153 ``-inf`` lane-0 identity gets a sign-only check because ``inf - inf`` (or ``(-inf) - (-inf)``) is ``NaN`` and the
1154 standard ``abs(diff) < tol`` compare would fail spuriously."""
🟢 1156 op_fn = getattr(block, f"exclusive_{op_name}")
🔴 1167 dst[i] = op_fn(src[i], block_dim, dtype)
🟢 1169 _init_block_reduce_src(src, N, dtype, permuted=True)
🟢 1172 sentinel = sentinel_fn(dtype)
🟢 1175 expected = _ref_exclusive_scan_op(block_vals, py_op, sentinel)
🟢 1181 assert math.isinf(actual) and (
1182 actual > 0 if inf_sign > 0 else actual < 0
1183 ), f"block {b} thread {j}: got {actual}, expected {expected[j]}"
3427 # Each entry is a thin ``_check_full_matches_tiled(subgroup.X, subgroup.X_tiled, ...)`` wrapper. Collapsed into one
3428 # op-parametrized test to drop ~80 LOC of duplication. The pytest ids match the names of the original
3429 # ``test_subgroup_<op>`` functions so test reports / `-k` selectors stay stable.
🟢 3430 _FULL_VS_TILED_INT_CASES = [
3431 pytest.param("reduce_add", None, id="reduce_add"),
3432 pytest.param("reduce_all_add", None, id="reduce_all_add"),
3433 pytest.param("reduce_min", None, id="reduce_min"),
3434 pytest.param("reduce_max", None, id="reduce_max"),
3435 pytest.param("reduce_all_min", None, id="reduce_all_min"),
3436 pytest.param("reduce_all_max", None, id="reduce_all_max"),
3437 pytest.param("inclusive_add", None, id="inclusive_add"),
3438 pytest.param("inclusive_min", None, id="inclusive_min"),
3439 pytest.param("inclusive_max", None, id="inclusive_max"),
3440 # `mul` needs bounded inputs (2**N overflows i32 quickly); bitwise ops need a per-lane bit pattern that's
3441 # non-zero on every lane so AND has signal and OR / XOR have varied bits.
3442 pytest.param("inclusive_mul", _init_full_small_int, id="inclusive_mul"),
3443 pytest.param("inclusive_and", _init_full_bitwise, id="inclusive_and"),
3444 pytest.param("inclusive_or", _init_full_bitwise, id="inclusive_or"),
3445 pytest.param("inclusive_xor", _init_full_bitwise, id="inclusive_xor"),
3446 pytest.param("exclusive_add", None, id="exclusive_add"),
3447 pytest.param("exclusive_mul", _init_full_small_int, id="exclusive_mul"),
3448 pytest.param("exclusive_and", _init_full_bitwise, id="exclusive_and"),
3449 pytest.param("exclusive_or", _init_full_bitwise, id="exclusive_or"),
3450 pytest.param("exclusive_xor", _init_full_bitwise, id="exclusive_xor"),
3451 ]
🟢 3454 @pytest.mark.parametrize("op_name,host_init", _FULL_VS_TILED_INT_CASES)
🟢 3456 def test_subgroup_full_matches_tiled(op_name, host_init):
3457 """For each subgroup op ``X``, verify ``subgroup.X(v)`` matches ``subgroup.X_tiled(v, log2_group_size())``
3458 lane-by-lane on ``qd.i32``. Covers reduce / inclusive / exclusive families; bitwise ops + ``mul`` use a custom
3459 initializer that keeps the per-lane aggregate bounded."""
🟢 3460 full_fn = getattr(subgroup, op_name)
🟢 3461 tiled_fn = getattr(subgroup, f"{op_name}_tiled")
🟢 3462 kwargs = {}
🟢 3463 if host_init is not None:
🟢 3464 kwargs["host_init"] = host_init
🟢 3465 _check_full_matches_tiled(full_fn, tiled_fn, **kwargs)
🟢 3610 @pytest.mark.parametrize("op_name", ["reduce_add", "inclusive_add"])
🟢 3613 def test_subgroup_full_matches_tiled_float(op_name, dtype):
3614 """Float-dtype coverage of the dtype-agnostic ``full`` wrappers (``reduce_add``, ``inclusive_add``). One f32 + one
3615 f64 case per family is enough to catch an i32-only regression in a wrapper."""
🟢 3616 full_fn = getattr(subgroup, op_name)
🟢 3617 tiled_fn = getattr(subgroup, f"{op_name}_tiled")
🟢 3618 _check_full_matches_tiled(full_fn, tiled_fn, dtype=dtype)
🟢 tests/python/test_tile16.py (100%)
95 # 8 geometries x 2 tensor_type x 2 qd_dtype = 32 parametrize cases. The geometries enumerate hand-picked corner cases
96 # (origin, non-zero src/dst offsets, partial cols/rows, oversize backing array); coverage of any single geometry is
97 # more valuable than running every combination every CI run. ``@pytest.mark.sample(n=6)`` keeps 6 of the 32 cases per
98 # run; after k runs each specific case is hit with probability 1 - (26/32)^k = 1 - 0.8125^k (~65% after 5 runs, ~98%
99 # after 20). See docs/source/user_guide/unit_testing.md for the reproducibility recipes.
🟢 100 @pytest.mark.sample(n=6)
448 # 3 dst_delta x 3 src_offset x 2 tensor_type x 2 qd_dtype = 36 parametrize cases. Each case is an independent offset /
449 # delta combo; running 6 random ones per CI run with ~97% convergence over 20 runs is the right tradeoff given each
450 # case takes ~5s of cluster wall time. See unit_testing.md.
🟢 451 @pytest.mark.sample(n=6)
1789 """Smoke-test that misc/demos/cholesky_blocked.py runs to completion.
1790
1791 Uses small CLI overrides (N=32, N_ENVS=64, 1 warmup + 1 timed iter) so the JIT compile of the 3 unrolled kernels
1792 and the benchmark loop both stay cheap. The demo's defaults (N=92, N_ENVS=4096, 50+200 iters) are exercised by
1793 anyone running the script manually, not by CI.
1794 """
🟢 1796 cmd = [
1797 sys.executable,
1798 str(demo),
1799 "--n",
1800 "32",
1801 "--n-envs",
1802 "64",
1803 "--num-warmup",
1804 "1",
1805 "--num-iters",
1806 "1",
1807 ]
🟢 1808 result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
Loading