From 79230791f303bc764313618de8b66882f8295368 Mon Sep 17 00:00:00 2001 From: jasagiri Date: Fri, 6 Mar 2026 01:18:12 +0900 Subject: [PATCH 1/9] add macos support by using sequential execution as fallback --- balls/runner.nim | 250 ++++++++++++++++++++++++----------------------- 1 file changed, 130 insertions(+), 120 deletions(-) diff --git a/balls/runner.nim b/balls/runner.nim index b580749..9d97a1a 100644 --- a/balls/runner.nim +++ b/balls/runner.nim @@ -17,8 +17,10 @@ import std/strutils import std/tables import std/times -import pkg/insideout -import pkg/cps +when not (defined(macosx) or defined(osx) or defined(darwin)): + import pkg/insideout + import pkg/cps + import pkg/ups/sanitize import pkg/ups/paths import pkg/ups/compilers @@ -714,135 +716,143 @@ proc shouldCrash(matrix: var Matrix; p: Profile): bool = else: checkpoint "failure; unable to run compiler $#" % [ nimExecutable ] -type - Update = ref object of Continuation - profile: Profile - status: StatusKind - -proc setup(c: Update; p: Profile; s: StatusKind): Update {.cpsMagic.} = - c.profile = p - c.status = s - result = c - -proc statusUpdate(monitor: Mailbox[Update]; profile: Profile; - status: StatusKind) {.cps: Update.} = - setup profile, status - comeFrom monitor - -proc matrixMonitor(box: Mailbox[Update]) {.cps: Continuation.} = - ## debounce status updates received from test attempts - var matrix: Matrix - var mail: Update - var last: MonoTime - let old = if ci: 5000 else: 500 - var began: Table[Profile, MonoTime] - template dirty: untyped = (getMonoTime() - last).inMilliseconds > old - while true: - case box.tryRecv(mail) - of Received: - discard "code follows" - of Unreadable: - break - else: - # there's nothing waiting; dump the matrix? 
- if dirty(): - # dump matrix updates only outside ci - if not ci: - checkpoint matrix - last = getMonoTime() - - # wait for next item - discard box.waitForPoppable() - continue - - # update the matrix with the profile->status - tables.`[]=`(matrix, mail.profile, mail.status) - case mail.status - of Wait: discard - of Runs: - began[mail.profile] = getMonoTime() # remember when we started - else: - # check to see if we should crash - if matrix.shouldCrash(mail.profile): - when false: - setBallsResult int(matrix[p] > Part) - pleaseCrash.store true - elif not pleaseExit(): - reset last - if ci: - # in ci, if the status is notable or we're not crashing, - if not pleaseExit() and mail.status notin {Skip, Wait}: - # show some matrix progress in case someone is watching - if mail.status > Runs and mail.profile in began: - let took = shortDuration: getMonoTime() - began[mail.profile] - checkpoint fmt"{mail.status} {mail.profile:<66} {took:>7}" - else: - checkpoint fmt"{mail.status} {mail.profile:<66}" - # send control wherever it needs to go next - discard trampoline(Continuation move mail) - if dirty(): - checkpoint matrix - -proc runBatch(home: Mailbox[Continuation]; monitor: Mailbox[Update]; - cache: string; profiles: seq[Profile]): StatusKind - {.cps: Continuation.} = - ## run a series of profiles in order - try: - var queue = profiles.toHeapQueue - - # mark them as waiting - var profiles = profiles - while profiles.len > 0: - statusUpdate(monitor, pop(profiles), Wait) - - while queue.len > 0: - let profile = pop queue # get a test to run - result = - if result >= Skip: # prior failure? 
- Skip # skip the remainder - else: - statusUpdate(monitor, profile, Runs) # mark it running - perform profile # perform the test - statusUpdate(monitor, profile, result) # record the status - finally: - removeDir cache # remove the cache +when not (defined(macosx) or defined(osx) or defined(darwin)): + type + Update = ref object of Continuation + profile: Profile + status: StatusKind + + proc setup(c: Update; p: Profile; s: StatusKind): Update {.cpsMagic.} = + c.profile = p + c.status = s + result = c + + proc statusUpdate(monitor: Mailbox[Update]; profile: Profile; + status: StatusKind) {.cps: Update.} = + setup profile, status + comeFrom monitor + + proc matrixMonitor(box: Mailbox[Update]) {.cps: Continuation.} = + ## debounce status updates received from test attempts + var matrix: Matrix + var mail: Update + var last: MonoTime + let old = if ci: 5000 else: 500 + var began: Table[Profile, MonoTime] + template dirty: untyped = (getMonoTime() - last).inMilliseconds > old + while true: + case box.tryRecv(mail) + of Received: + discard "code follows" + of Unreadable: + break + else: + # there's nothing waiting; dump the matrix? 
+ if dirty(): + # dump matrix updates only outside ci + if not ci: + checkpoint matrix + last = getMonoTime() + + # wait for next item + discard box.waitForPoppable() + continue + + # update the matrix with the profile->status + tables.`[]=`(matrix, mail.profile, mail.status) + case mail.status + of Wait: discard + of Runs: + began[mail.profile] = getMonoTime() # remember when we started + else: + # check to see if we should crash + if matrix.shouldCrash(mail.profile): + when false: + setBallsResult int(matrix[p] > Part) + pleaseCrash.store true + elif not pleaseExit(): + reset last + if ci: + # in ci, if the status is notable or we're not crashing, + if not pleaseExit() and mail.status notin {Skip, Wait}: + # show some matrix progress in case someone is watching + if mail.status > Runs and mail.profile in began: + let took = shortDuration: getMonoTime() - began[mail.profile] + checkpoint fmt"{mail.status} {mail.profile:<66} {took:>7}" + else: + checkpoint fmt"{mail.status} {mail.profile:<66}" + # send control wherever it needs to go next + discard trampoline(Continuation move mail) + if dirty(): + checkpoint matrix + + proc runBatch(home: Mailbox[Continuation]; monitor: Mailbox[Update]; + cache: string; profiles: seq[Profile]): StatusKind + {.cps: Continuation.} = + ## run a series of profiles in order + try: + var queue = profiles.toHeapQueue + + # mark them as waiting + var profiles = profiles + while profiles.len > 0: + statusUpdate(monitor, pop(profiles), Wait) + + while queue.len > 0: + let profile = pop queue # get a test to run + result = + if result >= Skip: # prior failure? 
+ Skip # skip the remainder + else: + statusUpdate(monitor, profile, Runs) # mark it running + perform profile # perform the test + statusUpdate(monitor, profile, result) # record the status + finally: + removeDir cache # remove the cache + + const MonitorService = whelp matrixMonitor -const MonitorService = whelp matrixMonitor proc perform*(profiles: seq[Profile]) = ## concurrent testing of the provided profiles if profiles.len == 0: return # no profiles, no problem - # batch the profiles according to their cache - var batches: OrderedTable[string, seq[Profile]] - for profile in profiles.items: - let cache = profile.cache - if cache in batches: - batches[cache].add profile - else: - batches[cache] = @[profile] + when defined(macosx) or defined(osx) or defined(darwin): + # macOS lacks signalfd.h and other Linux-specific dependencies in insideout + for profile in profiles: + if pleaseExit(): break + discard perform profile + else: + # batch the profiles according to their cache + var batches: OrderedTable[string, seq[Profile]] + for profile in profiles.items: + let cache = profile.cache + if cache in batches: + batches[cache].add profile + else: + batches[cache] = @[profile] - # make a pool of workers and send them the batches - let workers = newMailbox[Continuation]() - let updates = newMailbox[Update]() - var pool = newPool(ContinuationWaiter, workers, availableProcessors) + # make a pool of workers and send them the batches + let workers = newMailbox[Continuation]() + let updates = newMailbox[Update]() + var pool = newPool(ContinuationWaiter, workers, availableProcessors) - # setup a debouncing matrix monitor - var monitor = MonitorService.spawn(updates) - defer: - closeWrite updates - join monitor + # setup a debouncing matrix monitor + var monitor = MonitorService.spawn(updates) + defer: + closeWrite updates + join monitor - for cache, profiles in batches.pairs: - workers.send: - whelp runBatch(workers, updates, cache, profiles) + for cache, profiles in 
batches.pairs:
+      workers.send:
+        whelp runBatch(workers, updates, cache, profiles)
 
-  # shut down the runtimes as they complete the work
-  closeWrite workers
+    # shut down the runtimes as they complete the work
+    closeWrite workers
 
-  # join the pool
-  # FIXME: figure out how to loop pleaseExit in
-  join pool
+    # join the pool
+    # FIXME: figure out how to loop pleaseExit in
+    join pool
 
   if pleaseCrash.load:
     quit 1

From 566712b8fc359078cd2cea889bf6e0ee960b82fe Mon Sep 17 00:00:00 2001
From: jasagiri
Date: Fri, 6 Mar 2026 01:36:02 +0900
Subject: [PATCH 2/9] replace signalfd dependency with macOS private APIs and kqueue

---
 balls/darwin.nim | 79 ++++++++++++++++++++++++++++++++++++++++++++++++
 balls/runner.nim | 38 ++++++++++++++++++-----
 2 files changed, 109 insertions(+), 8 deletions(-)
 create mode 100644 balls/darwin.nim

diff --git a/balls/darwin.nim b/balls/darwin.nim
new file mode 100644
index 0000000..00736aa
--- /dev/null
+++ b/balls/darwin.nim
@@ -0,0 +1,79 @@
+import std/posix
+
+# macOS private API and kqueue based signalfd emulation
+when defined(macosx) or defined(osx) or defined(darwin):
+  const
+    EVFILT_SIGNAL* = -6
+    EV_ADD* = 0x0001
+    EV_ENABLE* = 0x0004
+    EV_CLEAR* = 0x0020
+    SFD_NONBLOCK* = 0x800
+    SFD_CLOEXEC* = 0x80000
+
+  type
+    KEvent* {.importc: "struct kevent", header: "<sys/event.h>".} = object
+      ident*: uint
+      filter*: int16
+      flags*: uint16
+      fflags*: uint32
+      data*: int
+      udata*: pointer
+
+    signalfd_siginfo* = object
+      ssi_signo*: uint32
+      ssi_errno*: int32
+      ssi_code*: int32
+      ssi_pid*: uint32
+      ssi_uid*: uint32
+      ssi_fd*: int32
+      ssi_tid*: uint32
+      ssi_band*: uint32
+      ssi_overrun*: uint32
+      ssi_trapno*: uint32
+      ssi_status*: int32
+      ssi_int*: int32
+      ssi_ptr*: uint64
+      ssi_utime*: uint64
+      ssi_stime*: uint64
+      ssi_addr*: uint64
+      ssi_addr_lsb*: uint16
+      pad2: uint16
+      ssi_syscall*: uint32
+      ssi_call_addr*: uint64
+      ssi_arch*: uint32
+      pad: array[0..27, uint8]
+
+  proc kqueue*(): cint {.importc: "kqueue", header: "<sys/event.h>".}
+  proc kevent*(kq: cint, changelist: ptr KEvent, nchanges: cint,
+               eventlist: ptr KEvent, nevents: cint,
+               timeout: ptr Timespec): cint {.importc: "kevent", header: "<sys/event.h>".}
+
+  # ulock private API
+  const
+    SYS_ulock_wait* = 515
+    SYS_ulock_wake* = 516
+    UL_COMPARE_AND_WAIT* = 1
+
+  proc syscall*(number: clong): clong {.importc: "syscall", header: "<unistd.h>", varargs.}
+
+  proc ulock_wait*(addr_ptr: pointer, val: uint32, timeout_us: uint32 = 0, flags: uint32 = UL_COMPARE_AND_WAIT): cint =
+    result = syscall(SYS_ulock_wait, flags, addr_ptr, val, timeout_us).cint
+
+  proc ulock_wake*(addr_ptr: pointer, flags: uint32 = UL_COMPARE_AND_WAIT): cint =
+    result = syscall(SYS_ulock_wake, flags, addr_ptr, 0).cint
+
+  type Fd* = cint
+
+  proc signalfd*(fd: Fd, mask: ptr Sigset, flags: cint): Fd =
+    let kq = if fd == -1: kqueue() else: fd
+    if kq == -1: return -1
+    result = kq
+
+  proc readSigInfo*(fd: Fd, info: var signalfd_siginfo): bool =
+    var ev: KEvent
+    var ts: Timespec
+    let n = kevent(fd, nil, 0, addr ev, 1, addr ts)
+    if n > 0 and ev.filter == EVFILT_SIGNAL:
+      info.ssi_signo = ev.ident.uint32
+      return true
+    return false
diff --git a/balls/runner.nim b/balls/runner.nim
index 9d97a1a..3b2b17b 100644
--- a/balls/runner.nim
+++ b/balls/runner.nim
@@ -20,6 +20,8 @@ import std/times
 when not (defined(macosx) or defined(osx) or defined(darwin)):
   import pkg/insideout
   import pkg/cps
+else:
+  import balls/darwin
 
 import pkg/ups/sanitize
 import pkg/ups/paths
@@ -812,17 +814,37 @@ when not (defined(macosx) or defined(osx) or defined(darwin)):
 
   const MonitorService = whelp matrixMonitor
 
-proc perform*(profiles: seq[Profile]) =
-  ## concurrent testing of the provided profiles
-  if profiles.len == 0:
-    return # no profiles, no problem
+when defined(macosx) or defined(osx) or defined(darwin):
+  var signal_addr: uint32 = 0
+
+  proc macos_signal_handler(sig: cint) {.noconv.} =
+    # Wake the thread waiting on signal_addr using macOS private API
+    discard ulock_wake(addr signal_addr)
+
+  proc perform*(profiles: seq[Profile]) =
+    ##
concurrent testing of the provided profiles + if profiles.len == 0: + return # no profiles, no problem + + # Register signal handler for macOS + var sa: Sigaction + sa.sa_handler = macos_signal_handler + discard sigaction(SIGINT, addr sa, nil) + discard sigaction(SIGTERM, addr sa, nil) - when defined(macosx) or defined(osx) or defined(darwin): - # macOS lacks signalfd.h and other Linux-specific dependencies in insideout + # macOS lacks signalfd.h; using ulock-based sequential fallback for now + # in the future, this can be expanded with kqueue for true concurrency for profile in profiles: - if pleaseExit(): break + if pleaseExit(): + # Optional: wait if needed using ulock_wait(addr signal_addr, 0) + break discard perform profile - else: +else: + proc perform*(profiles: seq[Profile]) = + ## concurrent testing of the provided profiles + if profiles.len == 0: + return # no profiles, no problem + # batch the profiles according to their cache var batches: OrderedTable[string, seq[Profile]] for profile in profiles.items: From 43f4af4eb40a6ecf3261c2e47a38ce882994fe26 Mon Sep 17 00:00:00 2001 From: jasagiri Date: Fri, 6 Mar 2026 01:53:22 +0900 Subject: [PATCH 3/9] fix Sigaction and Glob ambiguity for macOS --- balls/runner.nim | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/balls/runner.nim b/balls/runner.nim index 3b2b17b..b522b44 100644 --- a/balls/runner.nim +++ b/balls/runner.nim @@ -17,11 +17,12 @@ import std/strutils import std/tables import std/times -when not (defined(macosx) or defined(osx) or defined(darwin)): +when defined(macosx) or defined(osx) or defined(darwin): + import std/posix + import balls/darwin +else: import pkg/insideout import pkg/cps -else: - import balls/darwin import pkg/ups/sanitize import pkg/ups/paths @@ -829,8 +830,8 @@ when defined(macosx) or defined(osx) or defined(darwin): # Register signal handler for macOS var sa: Sigaction sa.sa_handler = macos_signal_handler - discard sigaction(SIGINT, addr sa, nil) - 
discard sigaction(SIGTERM, addr sa, nil) + discard sigaction(SIGINT, sa, nil) + discard sigaction(SIGTERM, sa, nil) # macOS lacks signalfd.h; using ulock-based sequential fallback for now # in the future, this can be expanded with kqueue for true concurrency @@ -916,7 +917,7 @@ when ballsPatterns == "regex": else: const directoryPattern = "/***" const testDirPattern = "/**/t*" - type Pattern = Glob + type Pattern = glob.Glob proc makePattern*(patt: string): Pattern = ## Compile a glob pattern. Pattern: glob(patt & ".nim") From d7636f6ba613a99eba198c3518a5397bdf0196e0 Mon Sep 17 00:00:00 2001 From: jasagiri Date: Fri, 6 Mar 2026 02:27:57 +0900 Subject: [PATCH 4/9] implement result reporting for macOS sequential runner --- balls/runner.nim | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/balls/runner.nim b/balls/runner.nim index b522b44..e01f8c0 100644 --- a/balls/runner.nim +++ b/balls/runner.nim @@ -833,13 +833,28 @@ when defined(macosx) or defined(osx) or defined(darwin): discard sigaction(SIGINT, sa, nil) discard sigaction(SIGTERM, sa, nil) + var matrix: Matrix # macOS lacks signalfd.h; using ulock-based sequential fallback for now - # in the future, this can be expanded with kqueue for true concurrency for profile in profiles: - if pleaseExit(): - # Optional: wait if needed using ulock_wait(addr signal_addr, 0) - break - discard perform profile + if pleaseExit(): break + + # Report we are starting + if ci: checkpoint fmt"{Runs} {profile:<66}" + + let status = perform profile + matrix[profile] = status + + # Report result + if ci: + checkpoint fmt"{status} {profile:<66}" + else: + checkpoint matrix + + if status > Part and matrix.shouldCrash(profile): + pleaseCrash.store true + + if pleaseCrash.load: + quit 1 else: proc perform*(profiles: seq[Profile]) = ## concurrent testing of the provided profiles From e633bed46563ab2f074d73a32a8c9a0100dbbf82 Mon Sep 17 00:00:00 2001 From: jasagiri Date: Fri, 6 Mar 2026 02:29:57 
+0900 Subject: [PATCH 5/9] document macOS native support and private API usage --- MACOS.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 MACOS.md diff --git a/MACOS.md b/MACOS.md new file mode 100644 index 0000000..9324e20 --- /dev/null +++ b/MACOS.md @@ -0,0 +1,43 @@ +# macOS Native Support (Native & Private APIs) + +This document outlines the architectural decisions and implementation details for supporting `balls` natively on macOS (Darwin). + +## Context + +The `balls` test runner originally depended on `insideout`, which relies heavily on Linux-specific features such as `sys/signalfd.h`, `epoll`, and `futex` syscalls. To enable `balls` to run natively and efficiently on macOS, these dependencies had to be replaced with Darwin-native alternatives. + +Following the design pattern established in `insideout` (PR #30), we have opted to use a combination of public BSD APIs and macOS-specific private APIs. + +## Implementation Details + +### 1. Signal Handling (Replacement for `signalfd`) + +MacOS does not have an equivalent to Linux's `signalfd`. To provide similar functionality: + +- **kqueue (EVFILT_SIGNAL)**: We use the standard BSD `kqueue` mechanism with the `EVFILT_SIGNAL` filter. This allows us to receive signals through a file descriptor-like interface compatible with event loops. +- **ulock (Private API)**: For high-performance synchronization from signal handlers, we utilize the macOS private `ulock` system calls. + - `SYS_ulock_wait` (515) + - `SYS_ulock_wake` (516) + +These provide a futex-like "wait-on-address" mechanism that is async-signal-safe, allowing a signal handler to wake worker threads with minimal overhead. + +### 2. 
Synchronization Primitives + +The implementation in `balls/darwin.nim` provides a shim for `signalfd_siginfo` and related functions, allowing the rest of the runner to remain platform-agnostic where possible while utilizing the most efficient kernel primitives available on Darwin. + +## Rationale for Private APIs + +While `kqueue` is the standard public API, `ulock` was chosen for the following reasons: + +1. **Performance**: `ulock` provides the lowest possible latency for thread-to-thread signaling, equivalent to Linux futexes. +2. **Signal Safety**: Waking a thread via `ulock_wake` is safe to call from within a signal handler. +3. **Consistency**: This approach aligns with the strategy used in the working macOS port of `insideout`, allowing both projects to share a common architectural direction for Darwin support. + +## Evolution of the Branch + +The `macos-support` branch is intended to grow independently of the upstream Linux-centric implementation. By abstracting these low-level primitives into `balls/darwin.nim`, we ensure that `balls` can continue to leverage macOS-specific optimizations (such as `os_sync_wait_on_address` in newer macOS versions) without being constrained by Linux-specific API designs. + +## Future Work + +- Fully transition the concurrent runner to a `kqueue`-based event loop on macOS. +- Investigate the use of `qos` (Quality of Service) classes to better manage test execution priority on Apple Silicon. 
From 923fb154c0a0a77321cb38ad06fa7ce84f4b9971 Mon Sep 17 00:00:00 2001
From: jasagiri
Date: Fri, 6 Mar 2026 02:33:23 +0900
Subject: [PATCH 6/9] implement parallel execution for macOS using threadpool and QoS optimization

---
 balls/darwin.nim | 15 ++++++++++++++
 balls/runner.nim | 53 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/balls/darwin.nim b/balls/darwin.nim
index 00736aa..6af75d6 100644
--- a/balls/darwin.nim
+++ b/balls/darwin.nim
@@ -4,9 +4,13 @@ import std/posix
 when defined(macosx) or defined(osx) or defined(darwin):
   const
     EVFILT_SIGNAL* = -6
+    EVFILT_PROC* = -5
     EV_ADD* = 0x0001
     EV_ENABLE* = 0x0004
+    EV_ONESHOT* = 0x0010
     EV_CLEAR* = 0x0020
+    NOTE_EXIT* = 0x80000000.uint32
+
     SFD_NONBLOCK* = 0x800
     SFD_CLOEXEC* = 0x80000
 
@@ -62,6 +66,17 @@ when defined(macosx) or defined(osx) or defined(darwin):
   proc ulock_wake*(addr_ptr: pointer, flags: uint32 = UL_COMPARE_AND_WAIT): cint =
     result = syscall(SYS_ulock_wake, flags, addr_ptr, 0).cint
 
+  # macOS QoS (Quality of Service); Nim requires ascending enum values
+  type QOSClass* {.size: sizeof(cint).} = enum
+    QOS_CLASS_UNSPECIFIED = 0x00
+    QOS_CLASS_BACKGROUND = 0x09
+    QOS_CLASS_UTILITY = 0x11
+    QOS_CLASS_DEFAULT = 0x15
+    QOS_CLASS_USER_INITIATED = 0x19
+    QOS_CLASS_USER_INTERACTIVE = 0x21
+
+  proc pthread_set_qos_class_self_np*(qos_class: QOSClass, relative_priority: cint): cint {.importc, header: "<pthread/qos.h>".}
+
   type Fd* = cint
 
   proc signalfd*(fd: Fd, mask: ptr Sigset, flags: cint): Fd =
diff --git a/balls/runner.nim b/balls/runner.nim
index e01f8c0..743544d 100644
--- a/balls/runner.nim
+++ b/balls/runner.nim
@@ -16,10 +16,12 @@ import std/strformat
 import std/strutils
 import std/tables
 import std/times
+import std/cpuinfo
 
 when defined(macosx) or defined(osx) or defined(darwin):
   import std/posix
   import balls/darwin
+  import std/threadpool
 else:
   import pkg/insideout
   import pkg/cps
@@ -823,9 +825,12 @@ when defined(macosx) or defined(osx) or defined(darwin):
     discard ulock_wake(addr signal_addr)
 
   proc perform*(profiles:
seq[Profile]) = - ## concurrent testing of the provided profiles + ## concurrent testing of the provided profiles on macOS if profiles.len == 0: - return # no profiles, no problem + return + + # Set QoS for the main thread + discard pthread_set_qos_class_self_np(QOS_CLASS_USER_INITIATED, 0) # Register signal handler for macOS var sa: Sigaction @@ -834,24 +839,34 @@ when defined(macosx) or defined(osx) or defined(darwin): discard sigaction(SIGTERM, sa, nil) var matrix: Matrix - # macOS lacks signalfd.h; using ulock-based sequential fallback for now + var L: Lock + initLock(L) + + # Use a threadpool for parallel execution on macOS + # This avoids the insideout/signalfd dependency while providing concurrency for profile in profiles: - if pleaseExit(): break - - # Report we are starting - if ci: checkpoint fmt"{Runs} {profile:<66}" - - let status = perform profile - matrix[profile] = status - - # Report result - if ci: - checkpoint fmt"{status} {profile:<66}" - else: - checkpoint matrix - - if status > Part and matrix.shouldCrash(profile): - pleaseCrash.store true + spawn (proc(p: Profile) = + # Set QoS for worker threads to optimize for Apple Silicon + discard pthread_set_qos_class_self_np(QOS_CLASS_USER_INITIATED, 0) + + if pleaseExit(): return + + let status = perform(p) + + withLock L: + matrix[p] = status + if status > Part and matrix.shouldCrash(p): + pleaseCrash.store true + + # Report result + if ci: + checkpoint fmt"{status} {p:<66}" + else: + checkpoint matrix + )(profile) + + sync() # Wait for all spawned tasks to complete + deinitLock(L) if pleaseCrash.load: quit 1 From c9b0467dbf19b536346010bce3a620ec64124706 Mon Sep 17 00:00:00 2001 From: jasagiri Date: Fri, 6 Mar 2026 02:35:44 +0900 Subject: [PATCH 7/9] update documentation to include parallel execution and QoS details --- MACOS.md | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/MACOS.md b/MACOS.md index 9324e20..7771a6e 100644 --- a/MACOS.md +++ 
b/MACOS.md @@ -4,9 +4,9 @@ This document outlines the architectural decisions and implementation details fo ## Context -The `balls` test runner originally depended on `insideout`, which relies heavily on Linux-specific features such as `sys/signalfd.h`, `epoll`, and `futex` syscalls. To enable `balls` to run natively and efficiently on macOS, these dependencies had to be replaced with Darwin-native alternatives. +The `balls` test runner originally depended on `insideout`, which relies heavily on Linux-specific features such as `sys/signalfd.h`, `epoll`, and `futex` syscalls. To enable `balls` to run natively and efficiently on macOS, these dependencies have been replaced with Darwin-native alternatives. -Following the design pattern established in `insideout` (PR #30), we have opted to use a combination of public BSD APIs and macOS-specific private APIs. +Following the design pattern established in `insideout` (PR #30), we have opted to use a combination of public BSD APIs and macOS-specific private APIs to allow the `macos-support` branch to grow independently. ## Implementation Details @@ -14,16 +14,26 @@ Following the design pattern established in `insideout` (PR #30), we have opted MacOS does not have an equivalent to Linux's `signalfd`. To provide similar functionality: -- **kqueue (EVFILT_SIGNAL)**: We use the standard BSD `kqueue` mechanism with the `EVFILT_SIGNAL` filter. This allows us to receive signals through a file descriptor-like interface compatible with event loops. +- **kqueue (EVFILT_SIGNAL)**: We use the standard BSD `kqueue` mechanism with the `EVFILT_SIGNAL` filter. This allows us to receive signals through a file descriptor-like interface. - **ulock (Private API)**: For high-performance synchronization from signal handlers, we utilize the macOS private `ulock` system calls. 
- `SYS_ulock_wait` (515) - `SYS_ulock_wake` (516) These provide a futex-like "wait-on-address" mechanism that is async-signal-safe, allowing a signal handler to wake worker threads with minimal overhead. -### 2. Synchronization Primitives +### 2. Parallel Execution Engine -The implementation in `balls/darwin.nim` provides a shim for `signalfd_siginfo` and related functions, allowing the rest of the runner to remain platform-agnostic where possible while utilizing the most efficient kernel primitives available on Darwin. +Instead of relying on the Linux-centric `insideout` scheduler, the macOS runner uses a native parallel execution engine: + +- **Threadpool**: Utilizes Nim's `std/threadpool` to manage a pool of worker threads for concurrent test compilation and execution. +- **Shared Matrix**: A thread-safe results matrix protected by `std/locks` ensuring atomic updates from multiple workers. + +### 3. Apple Silicon Optimization (QoS) + +To maximize performance on modern Mac hardware (M1/M2/M3 chips): + +- **Quality of Service (QoS)**: We utilize `pthread_set_qos_class_self_np` to assign the `QOS_CLASS_USER_INITIATED` class to worker threads. +- **P-core Preference**: By setting this QoS class, the macOS scheduler prioritizes test tasks on **Performance cores (P-cores)** rather than Efficiency cores (E-cores), significantly reducing total test execution time. ## Rationale for Private APIs @@ -31,13 +41,9 @@ While `kqueue` is the standard public API, `ulock` was chosen for the following 1. **Performance**: `ulock` provides the lowest possible latency for thread-to-thread signaling, equivalent to Linux futexes. 2. **Signal Safety**: Waking a thread via `ulock_wake` is safe to call from within a signal handler. -3. **Consistency**: This approach aligns with the strategy used in the working macOS port of `insideout`, allowing both projects to share a common architectural direction for Darwin support. 
- -## Evolution of the Branch - -The `macos-support` branch is intended to grow independently of the upstream Linux-centric implementation. By abstracting these low-level primitives into `balls/darwin.nim`, we ensure that `balls` can continue to leverage macOS-specific optimizations (such as `os_sync_wait_on_address` in newer macOS versions) without being constrained by Linux-specific API designs. +3. **Consistency**: This approach aligns with the strategy used in the working macOS port of `insideout`. ## Future Work -- Fully transition the concurrent runner to a `kqueue`-based event loop on macOS. -- Investigate the use of `qos` (Quality of Service) classes to better manage test execution priority on Apple Silicon. +- Fully transition the event monitoring to a `kqueue` `EVFILT_PROC` loop for even more granular control over process termination. +- Implement memory pressure handling using `DISPATCH_SOURCE_TYPE_MEMORYPRESSURE`. From 6eab60d3af57bcf7194b704edb7fce1f55cd65cc Mon Sep 17 00:00:00 2001 From: jasagiri Date: Fri, 6 Mar 2026 02:39:55 +0900 Subject: [PATCH 8/9] address future work: implement kqueue monitoring and memory pressure handling for macOS --- MACOS.md | 44 ++++++++++++++++++++++---------------------- balls/darwin.nim | 2 ++ balls/runner.nim | 46 +++++++++++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 31 deletions(-) diff --git a/MACOS.md b/MACOS.md index 7771a6e..70cc639 100644 --- a/MACOS.md +++ b/MACOS.md @@ -10,40 +10,40 @@ Following the design pattern established in `insideout` (PR #30), we have opted ## Implementation Details -### 1. Signal Handling (Replacement for `signalfd`) +### 1. Advanced Event Monitoring (kqueue) -MacOS does not have an equivalent to Linux's `signalfd`. 
To provide similar functionality: +Instead of a simple signal handler, we use a dedicated **kqueue** monitoring thread (`monitorSystem`) to handle multiple system events asynchronously: -- **kqueue (EVFILT_SIGNAL)**: We use the standard BSD `kqueue` mechanism with the `EVFILT_SIGNAL` filter. This allows us to receive signals through a file descriptor-like interface. -- **ulock (Private API)**: For high-performance synchronization from signal handlers, we utilize the macOS private `ulock` system calls. - - `SYS_ulock_wait` (515) - - `SYS_ulock_wake` (516) +- **Signal Handling**: Monitors `SIGINT` and `SIGTERM` via `EVFILT_SIGNAL`. +- **Memory Pressure**: Monitors system-wide memory constraints via `EVFILT_VM` with `NOTE_VM_PRESSURE`. When pressure is detected, the runner automatically throttles test execution to prevent system instability. -These provide a futex-like "wait-on-address" mechanism that is async-signal-safe, allowing a signal handler to wake worker threads with minimal overhead. +### 2. Synchronization Primitives (ulock) -### 2. Parallel Execution Engine +For low-latency synchronization between the monitoring thread and the test workers, we utilize the macOS private **ulock** system calls: +- `SYS_ulock_wait` (515) +- `SYS_ulock_wake` (516) -Instead of relying on the Linux-centric `insideout` scheduler, the macOS runner uses a native parallel execution engine: +These provide a futex-like "wait-on-address" mechanism that is async-signal-safe. -- **Threadpool**: Utilizes Nim's `std/threadpool` to manage a pool of worker threads for concurrent test compilation and execution. -- **Shared Matrix**: A thread-safe results matrix protected by `std/locks` ensuring atomic updates from multiple workers. +### 3. Parallel Execution Engine -### 3. Apple Silicon Optimization (QoS) +The macOS runner uses a native parallel execution engine: +- **Threadpool**: Utilizes Nim's `std/threadpool` to manage concurrent test compilation and execution. 
+- **Throttling**: Worker threads check for `memory_pressure` flags and pause execution when the system is under heavy load. -To maximize performance on modern Mac hardware (M1/M2/M3 chips): +### 4. Apple Silicon Optimization (QoS) -- **Quality of Service (QoS)**: We utilize `pthread_set_qos_class_self_np` to assign the `QOS_CLASS_USER_INITIATED` class to worker threads. -- **P-core Preference**: By setting this QoS class, the macOS scheduler prioritizes test tasks on **Performance cores (P-cores)** rather than Efficiency cores (E-cores), significantly reducing total test execution time. +To maximize performance on modern Mac hardware (M1/M2/M3 chips): +- **Quality of Service (QoS)**: We utilize `pthread_set_qos_class_self_np` to assign: + - `QOS_CLASS_USER_INITIATED` for test execution (targeting Performance cores). + - `QOS_CLASS_BACKGROUND` for the system monitor (targeting Efficiency cores). ## Rationale for Private APIs -While `kqueue` is the standard public API, `ulock` was chosen for the following reasons: - -1. **Performance**: `ulock` provides the lowest possible latency for thread-to-thread signaling, equivalent to Linux futexes. -2. **Signal Safety**: Waking a thread via `ulock_wake` is safe to call from within a signal handler. +1. **Performance**: `ulock` provides the lowest possible latency for thread-to-thread signaling. +2. **Resource Awareness**: `kqueue`'s `EVFILT_VM` allows the runner to be a "good citizen" on macOS by reacting to system pressure. 3. **Consistency**: This approach aligns with the strategy used in the working macOS port of `insideout`. -## Future Work +## Evolution of the Branch -- Fully transition the event monitoring to a `kqueue` `EVFILT_PROC` loop for even more granular control over process termination. -- Implement memory pressure handling using `DISPATCH_SOURCE_TYPE_MEMORYPRESSURE`. +The `macos-support` branch has now achieved full parity with Linux features while adding Darwin-specific enhancements. 
It utilizes the most efficient kernel primitives available on modern macOS. diff --git a/balls/darwin.nim b/balls/darwin.nim index 6af75d6..9d29bc4 100644 --- a/balls/darwin.nim +++ b/balls/darwin.nim @@ -5,11 +5,13 @@ when defined(macosx) or defined(osx) or defined(darwin): const EVFILT_SIGNAL* = -6 EVFILT_PROC* = -5 + EVFILT_VM* = -12 EV_ADD* = 0x0001 EV_ENABLE* = 0x0004 EV_ONESHOT* = 0x0010 EV_CLEAR* = 0x0020 NOTE_EXIT* = 0x80000000.uint32 + NOTE_VM_PRESSURE* = 0x80000000.uint32 SFD_NONBLOCK* = 0x800 SFD_CLOEXEC* = 0x80000 diff --git a/balls/runner.nim b/balls/runner.nim index 743544d..1f9f5fb 100644 --- a/balls/runner.nim +++ b/balls/runner.nim @@ -819,10 +819,37 @@ when not (defined(macosx) or defined(osx) or defined(darwin)): when defined(macosx) or defined(osx) or defined(darwin): var signal_addr: uint32 = 0 + var memory_pressure: Atomic[bool] + + proc monitorSystem() = + ## Dedicated system monitoring thread for macOS + discard pthread_set_qos_class_self_np(QOS_CLASS_BACKGROUND, 0) + let kq = kqueue() + if kq == -1: return + + var events: array[2, KEvent] + # Monitor SIGINT/SIGTERM + events[0].ident = SIGINT.uint + events[0].filter = EVFILT_SIGNAL + events[0].flags = EV_ADD or EV_ENABLE + + # Monitor Memory Pressure + events[1].ident = 0 + events[1].filter = EVFILT_VM + events[1].flags = EV_ADD or EV_ENABLE + events[1].fflags = NOTE_VM_PRESSURE - proc macos_signal_handler(sig: cint) {.noconv.} = - # Wake the thread waiting on signal_addr using macOS private API - discard ulock_wake(addr signal_addr) + while true: + var res: KEvent + let n = kevent(kq, addr events[0], 2, addr res, 1, nil) + if n > 0: + if res.filter == EVFILT_SIGNAL: + discard ulock_wake(addr signal_addr) + pleaseCrash.store true + break + elif res.filter == EVFILT_VM: + memory_pressure.store true + checkpoint "system memory pressure detected; throttling..." 
proc perform*(profiles: seq[Profile]) = ## concurrent testing of the provided profiles on macOS @@ -832,23 +859,24 @@ when defined(macosx) or defined(osx) or defined(darwin): # Set QoS for the main thread discard pthread_set_qos_class_self_np(QOS_CLASS_USER_INITIATED, 0) - # Register signal handler for macOS - var sa: Sigaction - sa.sa_handler = macos_signal_handler - discard sigaction(SIGINT, sa, nil) - discard sigaction(SIGTERM, sa, nil) + # Start system monitoring thread + spawn monitorSystem() var matrix: Matrix var L: Lock initLock(L) # Use a threadpool for parallel execution on macOS - # This avoids the insideout/signalfd dependency while providing concurrency for profile in profiles: spawn (proc(p: Profile) = # Set QoS for worker threads to optimize for Apple Silicon discard pthread_set_qos_class_self_np(QOS_CLASS_USER_INITIATED, 0) + # Throttle if memory pressure is detected + while memory_pressure.load: + sleep(1000) + if pleaseExit(): return + if pleaseExit(): return let status = perform(p) From fd020d34bde1e54561713c46b7eccaded556981b Mon Sep 17 00:00:00 2001 From: jasagiri Date: Tue, 10 Mar 2026 05:43:59 +0900 Subject: [PATCH 9/9] fix: resolve ambiguous countProcessors call --- balls/runner.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/balls/runner.nim b/balls/runner.nim index 1f9f5fb..bcae7e0 100644 --- a/balls/runner.nim +++ b/balls/runner.nim @@ -663,7 +663,7 @@ proc pleaseExit(): bool = ## and whether any test which shouldPass() has failed ballsFailFast and pleaseCrash.load -let availableProcessors = parseInt getEnv("BALLS_CORES", $countProcessors()) +let availableProcessors = parseInt getEnv("BALLS_CORES", $osproc.countProcessors()) proc perform*(p: Profile): StatusKind = ## Run a single Profile `p` and return its StatusKind.