diff --git a/include/clasp/core/bytecode.h b/include/clasp/core/bytecode.h
index 33b3b2655a..f2c16fbad9 100644
--- a/include/clasp/core/bytecode.h
+++ b/include/clasp/core/bytecode.h
@@ -9,6 +9,7 @@
 namespace core {
 class Bytecode_O;
+struct VMDynRecord; // defined in clasp/gctools/threadlocal.h
 };
 template <> struct gctools::GCInfo<core::Bytecode_O> {
@@ -326,19 +327,25 @@ class VMFrameDynEnv_O : public DynEnv_O {
   LISP_CLASS(core, CorePkg, VMFrameDynEnv_O, "VMFrameDynEnv", DynEnv_O);

 public:
-  VMFrameDynEnv_O(T_O** a_old_sp, T_O** a_old_fp) : old_sp(a_old_sp), old_fp(a_old_fp) {}
+  VMFrameDynEnv_O(T_O** a_old_sp, T_O** a_old_fp, VMDynRecord* a_old_dyn_top)
+      : old_sp(a_old_sp), old_fp(a_old_fp), old_dyn_top(a_old_dyn_top) {}
   // Slightly sketchy: We use the destructor to reset the stack pointer,
   // so that C++ unwinds are also affected by this dynenv.
   // This means VMFrames must be stack allocated.
+  // old_dyn_top is the _dynRecordTop mark saved when this bytecode_call was
+  // entered. If an SJLJ longjmp bypasses this frame, proceed() restores it so
+  // the VM dynenv-record stack does not keep stale records across activations.
   ~VMFrameDynEnv_O() {
     VirtualMachine& vm = my_thread->_VM;
     vm._stackPointer = this->old_sp;
     vm._framePointer = this->old_fp;
+    vm._dynRecordTop = this->old_dyn_top;
   }

 public:
   T_O** old_sp;
   T_O** old_fp;
+  VMDynRecord* old_dyn_top;

 public:
   virtual SearchStatus search() const { return Continue; }
diff --git a/include/clasp/core/clasp_gmpxx.h b/include/clasp/core/clasp_gmpxx.h
index b448325e36..3a4d8dae73 100644
--- a/include/clasp/core/clasp_gmpxx.h
+++ b/include/clasp/core/clasp_gmpxx.h
@@ -28,6 +28,9 @@ THE SOFTWARE.

 /* Define a C++ GMP wrapper */

+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-literal-operator"
 #include <gmpxx.h>
+#pragma clang diagnostic pop

 typedef mpz_class Bignum;
diff --git a/include/clasp/core/commandLineOptions.h b/include/clasp/core/commandLineOptions.h
index 178becafe1..fe2bebc68d 100644
--- a/include/clasp/core/commandLineOptions.h
+++ b/include/clasp/core/commandLineOptions.h
@@ -77,6 +77,7 @@ struct CommandLineOptions {
   bool _NoRc;
   bool _PauseForDebugger;
   bool _GenerateTrampolines;
+  std::vector<std::string> _ExtensionArguments;

   bool validStartupTypeOption(const std::string& arg);
   void printVersion();
diff --git a/include/clasp/core/lisp.h b/include/clasp/core/lisp.h
index 75a8f693d5..347652042b 100644
--- a/include/clasp/core/lisp.h
+++ b/include/clasp/core/lisp.h
@@ -279,6 +279,10 @@ class Lisp {
   std::atomic<T_sp> _AllObjectFiles;
   std::atomic<T_sp> _AllCodeBlocks;
   std::atomic<T_sp> _AllBytecodeModules;
+  // Every GFBytecodeSimpleFun ever made (atomic-pushed list of cons cells).
+  // Walked by arena_post_load_regenerate_trampolines after a snapshot load
+  // so the dispatch trampoline for each generic function gets re-attached.
+  std::atomic<T_sp> _AllGFBytecodeFuns;
   SimpleFun_sp _UnboundCellFunctionEntryPoint;
   T_sp _TerminalIO;
   List_sp _ActiveThreads;
diff --git a/include/clasp/core/sampling_profiler.h b/include/clasp/core/sampling_profiler.h
new file mode 100644
index 0000000000..55e769a217
--- /dev/null
+++ b/include/clasp/core/sampling_profiler.h
@@ -0,0 +1,91 @@
+/*
+ * sampling_profiler.h — CPU-time sampling profiler.
+ *
+ * At rate `N` Hz, an ITIMER_PROF timer delivers SIGPROF to an arbitrary
+ * running thread. The handler walks the frame-pointer chain via the
+ * ucontext registers and appends a sample (timestamp, thread id, depth,
+ * optional bytecode-VM pc, variable-length PC array) to a per-process
+ * bump-allocated ring.
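+ *
+ * A record is read back as its header plus `depth` trailing PCs. A minimal
+ * reader sketch (the cursor handling and the `symbolicate` helper are
+ * illustrative, not part of this API):
+ *
+ *   const SampleHeader* h = reinterpret_cast<const SampleHeader*>(cursor);
+ *   const uint64_t* pcs = reinterpret_cast<const uint64_t*>(h + 1);
+ *   for (uint32_t i = 0; i < h->depth; ++i) symbolicate(pcs[i]);
+ *   cursor += sizeof(SampleHeader) + h->depth * sizeof(uint64_t);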
+ *
+ * Separate from src/core/profiler.cc's RangePush/RangePop instrumentation.
+ * That profiler measures user-annotated regions; this one periodically
+ * snapshots whatever code is running.
+ *
+ * See Phase 4 / Phase 5 for post-mortem symbolication and flame-graph
+ * output — this header covers the recording side only.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace core {
+
+// Per-sample header (variable-length record). A SampleHeader is followed
+// immediately in the ring buffer by `depth` × uint64_t native PCs.
+struct SampleHeader {
+  uint64_t timestamp_ns; // CLOCK_MONOTONIC at signal delivery
+  uint64_t vm_pc;        // bytecode VM's _pc at sample time, or 0
+  uint32_t thread_id;    // Linux tid / macOS port id (truncated)
+  uint32_t depth;        // number of trailing PCs (0 if walk failed)
+};
+
+// Aggregated symbolicated sample: one entry per unique (thread_id, frames)
+// group. `frames` is outermost-first (index 0 is the root, last is the
+// leaf). `sample_count` is the number of raw samples that collapsed into
+// this entry.
+struct SymbolicatedSample {
+  uint32_t thread_id;
+  size_t sample_count;
+  std::vector<std::string> frames;
+  core::T_sp encode();
+};
+
+// Start the profiler.
+//   rate_hz      : sampling rate in Hz (e.g. 97). Clamped to [1, 10000].
+//   max_depth    : per-sample stack-depth cap. Clamped to [1, 8192].
+//   buffer_bytes : ring buffer size (0 = default 256 MiB).
+// Returns true on success. Fails if the profiler is already running or the
+// OS timer/signal setup fails.
+bool sampling_profiler_start(unsigned rate_hz,
+                             unsigned max_depth,
+                             size_t buffer_bytes);
+
+// Stop sampling. The buffer is preserved; call
+// sampling_profiler_save / sampling_profiler_reset to drain / clear.
+void sampling_profiler_stop();
+
+// True while a profile session is active.
+bool sampling_profiler_running();
+
+// Discard all captured samples and reset the bump pointer.
+void sampling_profiler_reset();
+
+// Dump the ring buffer contents to `path` in collapsed-stacks format
+// (one stack per line: semicolon-separated frames, then a space and the
+// sample count), ready to feed Brendan Gregg's flamegraph.pl. Symbolicates
+// on the fly using the arena side table, ObjectFile lookup, bytecode-module
+// scan, and dladdr. Returns true on success, false on I/O error.
+bool sampling_profiler_save(const char* path);
+
+// Return one entry per recorded sample. Each inner vector holds the
+// symbolicated frame names for that sample, outermost-first (index 0
+// is the root, last index is the leaf). Prints a warning and returns
+// an empty vector if the profiler is still running.
+std::vector<std::vector<std::string>> sampling_profiler_symbolicated_samples();
+
+// Populate the calling thread's stack bounds for later frame-walking.
+// Must be called from a non-signal context. sampling_profiler_start
+// calls this automatically for the calling thread; other threads that
+// should be fully profiled need to call ext:profile-register-thread
+// (or this function) themselves once before being sampled.
+void sampling_profiler_register_current_thread();
+
+// Diagnostics.
+size_t sampling_profiler_samples_recorded();
+size_t sampling_profiler_samples_dropped();
+size_t sampling_profiler_bytes_used();
+
+} // namespace core
diff --git a/include/clasp/core/unwind.h b/include/clasp/core/unwind.h
index b972d4d5e5..923081dfd6 100644
--- a/include/clasp/core/unwind.h
+++ b/include/clasp/core/unwind.h
@@ -46,6 +46,7 @@ class DynEnv_O : public General_O {
  * C++ code with unknown dynamic environment, e.g.
because nontrivial * destructors need to be run, there are catch blocks, or we simply * don't know. */ +FORWARD(UnknownDynEnv); class UnknownDynEnv_O : public DynEnv_O { LISP_CLASS(core, CorePkg, UnknownDynEnv_O, "UnknownDynEnv", DynEnv_O); diff --git a/include/clasp/gctools/threadlocal.h b/include/clasp/gctools/threadlocal.h index f1856af924..2a4a8977e9 100644 --- a/include/clasp/gctools/threadlocal.h +++ b/include/clasp/gctools/threadlocal.h @@ -78,10 +78,39 @@ extern int global_debug_virtual_machine; #define VM_RESET_COUNTERS(vm) #endif +// ---------- Dynamic-environment records for the bytecode interpreter ---------- +// The bytecode VM establishes dynamic environments (tagbody, catch, +// special-bind, progv, unwind-protect) by pushing records onto a side stack +// instead of recursing into bytecode_vm. The entering opcodes push; the +// matching exit opcodes pop; an outer try/catch(Unwind&) in bytecode_vm walks +// the stack to run cleanups / resume at a saved pc on non-local exits. +// +// Currently only the type and the stack exist — no opcodes are migrated yet. +enum class VMDynKind : uint8_t { + Tagbody = 1, // from `entry` opcode (not yet migrated) + Catch, // from `catch_8/16` (not yet migrated) + SpecialBind, // from `special_bind` (one per bound cell) + Progv, // from `progv` (one record covers N bindings) + UnwindProtect, // from `protect` +}; + +struct VMDynRecord { + VMDynKind kind; + uint8_t _pad[7]; + void* frame; // __builtin_frame_address at establishment + core::T_O* slot0; // kind-specific GC-managed: tag / cell / cleanup closure + core::T_O* slot1; // kind-specific GC-managed: old binding value + core::T_O** sp_mark; // stack pointer at establishment + core::T_O** fp_mark; // frame pointer at establishment + unsigned char* target_pc; // resume pc (Tagbody/Catch) + core::T_O* dynenv_mark; // saved head of my_thread->dynEnvStackGet() +}; + struct VirtualMachine { // Stack size is kind of arbitrary, and really we should make it // grow and etc. static constexpr size_t MaxStackWords = 65536; + static constexpr size_t MaxDynRecords = 4096; bool _Running; core::T_O** _stackBottom = nullptr; size_t _stackBytes; @@ -101,6 +130,13 @@ struct VirtualMachine { core::T_O** _literals; unsigned char* _pc; + // Dynamic-environment record stack. Root-allocated so GC scans the + // T_O*/T_O** slots conservatively. _dynRecordTop points one past the last + // live record, so an empty stack has _dynRecordTop == _dynRecordBottom. + VMDynRecord* _dynRecordBottom = nullptr; + VMDynRecord* _dynRecordLimit = nullptr; + VMDynRecord* _dynRecordTop = nullptr; + void error(); void enable_guards(); diff --git a/include/clasp/llvmo/code.h b/include/clasp/llvmo/code.h index d988f056b2..fcbb8a6b69 100644 --- a/include/clasp/llvmo/code.h +++ b/include/clasp/llvmo/code.h @@ -84,6 +84,13 @@ class ObjectFile_O : public LibraryBase_O { size_t _Size; size_t _ObjectId; JITDylib_sp _TheJITDylib; + // If true, this ObjectFile is transient arena-init scaffolding (shared + // trampoline / stub template) that must not be serialized into snapshots. + // The ObjectFile is still registered in _AllObjectFiles normally — LLVM's + // link layer plugin looks it up by name during materialization, so it must + // stay findable at runtime. The snapshot save walker checks this flag and + // skips any ObjectFile with it set. 
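+  // Sketch of that save-side filter (the walker spelling here is
+  // illustrative, not the actual snapshot code):
+  //   for each registered ObjectFile of:
+  //     if (of->_TransientSkipSnapshot) continue; // not serialized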
+ bool _TransientSkipSnapshot = false; // // Code data void* _TextSectionStart; diff --git a/include/clasp/llvmo/jit.h b/include/clasp/llvmo/jit.h index 6fa34db982..661616bdc7 100644 --- a/include/clasp/llvmo/jit.h +++ b/include/clasp/llvmo/jit.h @@ -89,6 +89,7 @@ THE SOFTWARE. #include #include #include +#include #include #include diff --git a/include/clasp/llvmo/llvmoPackage.h b/include/clasp/llvmo/llvmoPackage.h index eac32f3e0d..45b79223e5 100644 --- a/include/clasp/llvmo/llvmoPackage.h +++ b/include/clasp/llvmo/llvmoPackage.h @@ -68,6 +68,4 @@ ClaspJIT_sp llvm_sys__clasp_jit(); void initialize_llvm(); void initialize_ClaspJIT(); -core::Pointer_mv cmp__compile_trampoline(core::T_sp name); - }; // namespace llvmo diff --git a/include/clasp/llvmo/trampolineWork.h b/include/clasp/llvmo/trampolineWork.h new file mode 100644 index 0000000000..0abc15cc84 --- /dev/null +++ b/include/clasp/llvmo/trampolineWork.h @@ -0,0 +1,43 @@ +#pragma once + +/* + File: trampoline.h +*/ + +/* +Copyright (c) 2014, Christian E. Schafmeister + +CLASP is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public +License as published by the Free Software Foundation; either +version 2 of the License, or (at your option) any later version. + +See directory 'clasp/licenses' for full details. + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/* -^- */ + +#include + +namespace llvmo { + + +core::Pointer_mv cmp__compile_trampoline(core::T_sp name); + +// Per-generic-function trampoline. Returns the address of an arena slot that +// tail-calls GFBytecodeEntryPoint::entry_point_n. Each GF gets a unique +// address so flame charts and backtraces show its name instead of all GFs +// sharing the static entry_point_n symbol. +core::Pointer_sp cmp__compile_gf_trampoline(core::T_sp name); + +}; // namespace llvmo diff --git a/include/clasp/llvmo/trampoline_arena.h b/include/clasp/llvmo/trampoline_arena.h new file mode 100644 index 0000000000..2139671028 --- /dev/null +++ b/include/clasp/llvmo/trampoline_arena.h @@ -0,0 +1,174 @@ +/* + * trampoline_arena.h — bytecode trampoline arena. + * + * Each "trampoline" is a small function that calls bytecode_call via an + * embedded absolute address. Each slot is a memcpy of a template the JIT + * compiles once at init; the template embeds bytecode_call's address as a + * 64-bit literal (movabs on x86_64), so all slots are byte-identical and + * need no per-slot patching. + * + * caller -> trampoline (unique address per fn, shared bytes) + * `-> bytecode_call + * + * Backtrace identifies which Lisp function a frame represents by looking + * up the trampoline's address (which is the return-address from + * bytecode_call's frame) in the side table. + * + * Public API: + * - arena_install_trampoline_template(bytes, size): provided by the + * wire-up code in llvmoPackage.cc once it has compiled the template + * and captured its bytes. Idempotent. 
 *   - arena_compile_trampoline(name): allocates a slot, memcpys the
 *     template, registers in the side table + perf-PID.map, returns the
 *     slot's address.
 *   - arena_lookup_by_pc(pc), arena_owns_pc(pc): for backtrace.
 */
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace llvmo {
+
+struct TrampolineEntry {
+  uint8_t* code_start;
+  uint32_t code_size;
+  std::string name;
+};
+
+class TrampolineSideTable {
+public:
+  TrampolineSideTable();
+  // Append a new entry. Mutex-locked. Entries are appended in arena
+  // allocation order (which is also code-address-ascending, since the
+  // arena bump allocator only goes forward).
+  void append(TrampolineEntry e);
+  // Lock-free reader. Binary searches the published portion of the table
+  // for the entry whose [code_start, code_start+code_size) contains pc.
+  // Safe to call concurrently with new appends, including from signal
+  // handlers (no lock, no allocation).
+  const TrampolineEntry* find(uintptr_t pc) const;
+  size_t published() const { return _published.load(std::memory_order_acquire); }
+
+private:
+  std::mutex _write_lock;
+  std::vector<TrampolineEntry> _entries;
+  std::atomic<size_t> _published{0};
+};
+
+class ExecutableArena {
+public:
+  // Interleaved slot layout: [code | CIE | FDE | 4B terminator], padded to
+  // 16-byte stride. Every slot is a byte-identical memcpy of a single
+  // composed template. This works because the FDE uses pcrel|sdata4 for its
+  // PC begin field (offset from the field's own address to the function
+  // start), and within a slot both the field position and the code position
+  // are at compile-time-constant offsets from the slot base — so the PC
+  // begin value is a constant across every slot. Likewise the FDE's CIE
+  // pointer (distance back to the preceding CIE) is a constant.
+  //
+  // Each slot registers its own CIE+FDE with libgcc via
+  // __register_frame(slot + code_size) — the trailing 4B zero terminator
+  // stops libgcc's walk after the single FDE.
+  //
+  // Caller provides tramp/CIE/FDE byte blobs that are pre-patched for this
+  // layout (CIE pointer = cie_size + 4; PC begin = -(code_size + cie_size + 8);
+  // PC range = code_size).
+  ExecutableArena(const uint8_t* tramp_bytes, size_t tramp_size,
+                  const uint8_t* cie_bytes, size_t cie_len,
+                  const uint8_t* fde_bytes, size_t fde_len);
+  // Allocate a fresh slot, memcpy the composed template into it, and
+  // register its FDE with libgcc. Single-mapped: returned address is both
+  // writeable and executable.
+  uint8_t* allocate();
+  // True if pc lies in any committed page. Lock-free.
+  bool owns(uintptr_t pc) const;
+  // Code length within each slot — used by the side table for [start, start+size).
+  size_t slot_code_size() const { return _tramp_size; }
+
+private:
+  std::mutex _lock;
+  size_t _tramp_size = 0; // code length, for FDE PC range and side-table size
+  size_t _cie_size = 0;
+  size_t _fde_size = 0;
+  size_t _slot_stride = 0; // 16-aligned: code + CIE + FDE + 4B term + padding
+  size_t _page_size = 0;
+  // Pre-composed slot template [code | CIE | FDE | terminator | pad]. Memcpy'd
+  // verbatim into every allocated slot; no per-slot patching.
+  std::vector<uint8_t> _slot_template;
+  uint8_t* _current_page = nullptr;
+  size_t _current_offset = 0;
+  struct PageRange {
+    uintptr_t start;
+    uintptr_t end;
+  };
+  std::vector<PageRange> _pages;
+  std::atomic<size_t> _pages_published{0};
+};
+
+// Install the captured trampoline template. Returns true on success;
+// subsequent calls are idempotent no-ops.
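+//
+// Worked example of that pre-patching (sizes hypothetical): with
+// tramp_size = 0x40 and cie_len = 0x18, every slot's FDE carries
+// CIE pointer = 0x18 + 4 = 0x1c and PC begin = -(0x40 + 0x18 + 8) = -0x60.
+// Both are constants of the layout, which is why one byte-identical
+// template can serve every slot with no per-slot patching.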
+// +// tramp_bytes : exact machine bytes memcpy'd into each arena slot. +// tramp_size : length of tramp_bytes. +// cie_bytes : DWARF eh_frame CIE bytes (length prefix + CIE body). Same +// bytes are memcpy'd into every slot immediately after the +// code. Must specify DW_EH_PE_pcrel|DW_EH_PE_sdata4 for the +// FDE encoding so the FDE's PC begin is a distance (constant +// in this layout) rather than an absolute address. +// cie_len : length of cie_bytes. +// fde_bytes : DWARF eh_frame FDE bytes (length prefix + FDE body), +// pre-patched for the slot layout: CIE pointer field set to +// (cie_len + 4), PC begin (sdata4) set to +// -(tramp_size + cie_len + 8), PC range set to tramp_size. +// fde_len : length of fde_bytes. +bool arena_install_trampoline_template(const uint8_t* tramp_bytes, size_t tramp_size, + const uint8_t* cie_bytes, size_t cie_len, + const uint8_t* fde_bytes, size_t fde_len); + +// True once arena_install_trampoline_template has been called successfully. +bool arena_is_initialized(); + +// Allocate and register a new trampoline. Pre: arena is initialized. +core::Pointer_sp arena_compile_trampoline(const std::string& name); + +// Generic-function dispatch arena. A second arena with its own template — +// the GF template is a 3-arg trampoline (closure, nargs, args) that tail-calls +// GFBytecodeEntryPoint::entry_point_n at an embedded absolute address, so +// every generic function gets a unique PC and its name shows up in backtraces +// and the perf-PID.map. Separate template because the signature differs from +// the bytecode trampoline; shares the lookup/owns API below. +bool gf_arena_install_trampoline_template(const uint8_t* tramp_bytes, size_t tramp_size, + const uint8_t* cie_bytes, size_t cie_len, + const uint8_t* fde_bytes, size_t fde_len); +bool gf_arena_is_initialized(); +core::Pointer_sp gf_arena_compile_trampoline(const std::string& name); + +// Backtrace lookup. Lock-free, signal-handler safe. Checks both arenas +// (bytecode first, then GF) so the caller doesn't care which kind a frame +// belongs to. +const TrampolineEntry* arena_lookup_by_pc(uintptr_t pc); +bool arena_owns_pc(uintptr_t pc); + +// Post-snapshot-load pass: walk every BytecodeSimpleFun reachable from +// _AllBytecodeModules and install a fresh arena trampoline. Required because +// the trampoline pointer baked into a snapshot points at an mmap'd page that +// no longer exists after restart; the save side substitutes bytecode_call, +// and this restores wrapped trampolines so backtrace / perf-map work as +// before. No-op when CLASP_TRAMPOLINE_BACKEND != "arena". +void arena_post_load_regenerate_trampolines(); + +// Append one entry to /tmp/perf-PID.map. Lazy-opens the file once per +// process (truncating any stale contents from a previous-PID-with-the-same-id +// rerun) and appends thereafter. Thread-safe. Used by arena trampoline +// registration and by the LLVM-ORC link plugin's per-symbol callback so the +// two never race over fopen("w") and clobber each other's data. 
+void perf_map_append(uint8_t* addr, size_t size, const std::string& name);
+
+}; // namespace llvmo
diff --git a/repos.sexp b/repos.sexp
index fd5cd9c35e..34a1a0715c 100644
--- a/repos.sexp
+++ b/repos.sexp
@@ -1,4 +1,33 @@
-((:name :acclimation
+(
+ (:name :cl-unicode
+  :extension :cando
+  :directory "src/lisp/kernel/contrib/cl-unicode/"
+  :repository "https://github.com/edicl/cl-unicode.git"
+  :branch "master"
+  :pin 1)
+
+ (:name :cl-interpol
+  :extension :cando
+  :directory "src/lisp/kernel/contrib/cl-interpol/"
+  :repository "https://github.com/edicl/cl-interpol.git"
+  :branch "master"
+  :pin 1)
+
+ (:name :cl-csv
+  :extension :cando
+  :directory "src/lisp/kernel/contrib/cl-csv/"
+  :repository "https://github.com/AccelerationNet/cl-csv.git"
+  :branch "master"
+  :pin 1)
+
+ (:name :iterate
+  :extension :cando
+  :directory "src/lisp/kernel/contrib/iterate/"
+  :repository "https://github.com/lisp-mirror/iterate.git"
+  :branch "master"
+  :pin 1)
+
+ (:name :acclimation
   :directory "src/lisp/kernel/contrib/Acclimation/"
   :repository "https://github.com/robert-strandh/Acclimation.git"
   :branch "master"
@@ -136,9 +165,9 @@
   :pin 0)
  (:name :closer-mop
   :directory "src/lisp/kernel/contrib/closer-mop/"
-  :repository "https://github.com/pcostanza/closer-mop.git"
+  :repository "https://codeberg.org/pcostanza/closer-mop"
   :branch "master"
-  :commit "04db182eb614a1000718d14df230dc946e446775"
+  :commit "26c1d68174"
   :pin 1)
  (:name :clostrum
   :directory "src/lisp/kernel/contrib/Clostrum/"
@@ -422,4 +451,5 @@
   :repository "https://github.com/usocket/usocket.git"
   :branch "master"
   :commit "32f4841b4313d37c36963d0d1865135ee4e29a01"
-  :pin 1))
\ No newline at end of file
+  :pin 1)
+ )
\ No newline at end of file
diff --git a/src/analysis/clasp_gc.sif b/src/analysis/clasp_gc.sif
index 089eed8a1c..d162b0183c 100644
--- a/src/analysis/clasp_gc.sif
+++ b/src/analysis/clasp_gc.sif
@@ -5808,6 +5808,9 @@
 {fixed-field :offset-type-cxx-identifier "ATOMIC_SMART_PTR_OFFSET"
  :offset-ctype "gctools::smart_ptr<core::T_O>" :offset-base-ctype "core::Lisp"
  :layout-offset-field-names ("_Roots" "._AllBytecodeModules")}
+{fixed-field :offset-type-cxx-identifier "ATOMIC_SMART_PTR_OFFSET"
+ :offset-ctype "gctools::smart_ptr<core::T_O>" :offset-base-ctype "core::Lisp"
+ :layout-offset-field-names ("_Roots" "._AllGFBytecodeFuns")}
 {fixed-field :offset-type-cxx-identifier "SMART_PTR_OFFSET"
  :offset-ctype "gctools::smart_ptr<core::SimpleFun_O>" :offset-base-ctype "core::Lisp"
  :layout-offset-field-names ("_Roots" "._UnboundCellFunctionEntryPoint")}
diff --git a/src/analysis/clasp_gc_cando.sif b/src/analysis/clasp_gc_cando.sif
index 84711e19e8..b7003ef94b 100644
--- a/src/analysis/clasp_gc_cando.sif
+++ b/src/analysis/clasp_gc_cando.sif
@@ -12867,6 +12867,9 @@
 {fixed-field :offset-type-cxx-identifier "ATOMIC_SMART_PTR_OFFSET"
  :offset-ctype "gctools::smart_ptr<core::T_O>" :offset-base-ctype "core::Lisp"
  :layout-offset-field-names ("_Roots" "._AllBytecodeModules")}
+{fixed-field :offset-type-cxx-identifier "ATOMIC_SMART_PTR_OFFSET"
+ :offset-ctype "gctools::smart_ptr<core::T_O>" :offset-base-ctype "core::Lisp"
+ :layout-offset-field-names ("_Roots" "._AllGFBytecodeFuns")}
 {fixed-field :offset-type-cxx-identifier "SMART_PTR_OFFSET"
  :offset-ctype "gctools::smart_ptr<core::SimpleFun_O>" :offset-base-ctype "core::Lisp"
  :layout-offset-field-names ("_Roots" "._UnboundCellFunctionEntryPoint")}
diff --git a/src/core/backtrace.cc b/src/core/backtrace.cc
index b8962513e9..6509a47d14 100644
--- a/src/core/backtrace.cc
+++ b/src/core/backtrace.cc
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <clasp/llvmo/trampoline_arena.h>
 #include
 #include
 #ifdef
USE_LIBUNWIND @@ -433,33 +434,84 @@ static DebuggerFrame_sp make_lisp_frame(size_t frameIndex, void* absolute_ip, co nil(), INTERN_(kw, lisp), XEPp); } -static DebuggerFrame_sp make_bytecode_frame_from_function(BytecodeSimpleFun_sp fun, void* bpc, T_O** bfp) { +// Recover (closure, nargs, args) from the parent trampoline frame, if the +// bytecode_call C++ frame at `bytecode_call_fbp` was reached via an arena +// trampoline. The trampoline saves these at fixed negative offsets from its +// own rbp: +// [tramp_rbp - 0x20] = closure +// [tramp_rbp - 0x18] = nargs +// [tramp_rbp - 0x10] = args +// The walk: +// bytecode_call_rbp = bytecode_call_fbp +// tramp_rbp = *(bytecode_call_rbp) // saved old rbp +// tramp_return_pc = *(bytecode_call_rbp + 8) // PC inside trampoline +// Returns true on success. Returns false (and leaves outputs unchanged) if +// the parent frame isn't a trampoline (e.g. bytecode was invoked directly +// from C++) or any pointer looks wrong. +static bool recover_args_from_trampoline_frame(void* bytecode_call_fbp, + T_sp& out_closure, + size_t& out_nargs, + T_O**& out_args) { + if (!bytecode_call_fbp) return false; + void** rbp = (void**)bytecode_call_fbp; + void* tramp_rbp = rbp[0]; + void* tramp_return_pc = rbp[1]; + if (!tramp_rbp) return false; + if (!llvmo::arena_owns_pc((uintptr_t)tramp_return_pc)) return false; + uint8_t* p = (uint8_t*)tramp_rbp; + T_O* saved_closure = *(T_O**)(p - 0x20); + size_t saved_nargs = *(size_t*)(p - 0x18); + T_O** saved_args = *(T_O***)(p - 0x10); + if (saved_nargs > 256) return false; // sanity bound + out_closure = T_sp((gctools::Tagged)saved_closure); + out_nargs = saved_nargs; + out_args = saved_args; + return true; +} + +static DebuggerFrame_sp make_bytecode_frame_from_function(BytecodeSimpleFun_sp fun, void* bpc, T_O** bfp, + void* bytecode_call_fbp) { // We can get the closure easy if the function actually isn't one. - // Otherwise we'd have to poke through bytecode_vm arguments or maybe - // the vm stack? T_sp closure = (fun->environmentSize() == 0) ? (T_sp)fun : nil(); List_sp bindings = bytecode_bindings_for_pc(fun->code(), bpc, bfp); T_sp spi = bytecode_spi_for_pc(fun->code(), bpc); - // Grab arguments. - T_sp tnargs((gctools::Tagged)(*(bfp - BYTECODE_FRAME_NARGS_OFFSET))); - size_t nargs = tnargs.unsafe_fixnum(); - T_O** argptr = (T_O**)*(bfp - BYTECODE_FRAME_ARGS_OFFSET); + // Recover arguments from the parent trampoline frame's saved-arg area. + // The VM stack also has nargs/args at fp - BYTECODE_FRAME_NARGS_OFFSET / + // ARGS_OFFSET (pushed by bytecode_call for the chain walk), but the + // trampoline path also yields the closure and works uniformly for every + // bytecode_call C frame. ql::list largs; - for (size_t i = 0; i < nargs; ++i) { - T_O* rarg = argptr[i]; - T_sp temp((gctools::Tagged)rarg); - largs << temp; + bool args_available = false; + T_sp tramp_closure; + size_t nargs = 0; + T_O** argptr = nullptr; + if (recover_args_from_trampoline_frame(bytecode_call_fbp, tramp_closure, nargs, argptr)) { + if (closure.nilp()) closure = tramp_closure; + for (size_t i = 0; i < nargs; ++i) { + T_sp temp((gctools::Tagged)argptr[i]); + largs << temp; + } + args_available = true; } - // Finally make the frame. 
- return DebuggerFrame_O::make(fun->functionName(), Pointer_O::create(bpc), spi, fun->fdesc(), closure, largs.cons(), true, bindings, + return DebuggerFrame_O::make(fun->functionName(), Pointer_O::create(bpc), spi, fun->fdesc(), + closure, largs.cons(), args_available, bindings, INTERN_(kw, bytecode), false); } -static DebuggerFrame_sp make_bytecode_frame(size_t frameIndex, unsigned char*& pc, T_O**& fp) { - // Get the PC and frame pointer for the next frame. +static DebuggerFrame_sp make_bytecode_frame(size_t frameIndex, unsigned char*& pc, T_O**& fp, + void* bytecode_call_fbp) { + // Snapshot (pc, fp) for the current bytecode frame, then advance the + // by-ref pc/fp to the caller's bytecode frame using the VM-stack chain + // bytecode_call builds (nargs/args/old_fp pushes; the PC at offset 3 + // came from the caller's `call` opcode push). Each subsequent C frame + // named "bytecode_call" picks up its own (pc, fp) via this advance, so + // every bytecode frame in the backtrace gets its lexical bindings. + // The walk terminates naturally: when bytecode is invoked from C++, the + // saved old_fp at offset 0 is NULL (vm._framePointer was NULL), so the + // next iteration sees fp == NULL and stops. void* bpc = pc; T_O** bfp = fp; - if (fp) { // null fp means we've hit the end. + if (fp) { pc = (unsigned char*)(*(fp - BYTECODE_FRAME_PC_OFFSET)); fp = (T_O**)(*(fp - BYTECODE_FRAME_FP_OFFSET)); } @@ -470,7 +522,7 @@ static DebuggerFrame_sp make_bytecode_frame(size_t frameIndex, unsigned char*& p if (bytecode_module_contains_address_p(mod, bpc)) { T_sp fun = bytecode_function_for_pc(mod, bpc); if (gc::IsA(fun)) - return make_bytecode_frame_from_function(gc::As_unsafe(fun), bpc, bfp); + return make_bytecode_frame_from_function(gc::As_unsafe(fun), bpc, bfp, bytecode_call_fbp); } } return DebuggerFrame_O::make(INTERN_(kw, bytecode), Pointer_O::create(bpc), nil(), nil(), nil(), nil(), false, @@ -504,7 +556,8 @@ bool maybe_demangle(const std::string& fnName, std::string& output) { return false; } -static DebuggerFrame_sp make_cxx_frame(size_t fi, void* ip, const char* cstring, unsigned char*& bytecode_pc, T_O**& bytecode_fp) { +static DebuggerFrame_sp make_cxx_frame(size_t fi, void* ip, const char* cstring, void* fbp, + unsigned char*& bytecode_pc, T_O**& bytecode_fp) { MaybeTrace trace(__FUNCTION__); #ifdef USE_LIBUNWIND std::string linkname(cstring); @@ -550,19 +603,66 @@ static DebuggerFrame_sp make_cxx_frame(size_t fi, void* ip, const char* cstring, // Look for bytecode frames. // NOTE: This is a little fragile. Beware. if (name == "bytecode_call") - return make_bytecode_frame(fi, bytecode_pc, bytecode_fp); + return make_bytecode_frame(fi, bytecode_pc, bytecode_fp, fbp); T_sp lname = SimpleBaseString_O::make(name); D(printf("%s%s:%d:%s lname %s\n", trace.spaces().c_str(), __FILE__, __LINE__, __FUNCTION__, name.c_str());); return DebuggerFrame_O::make(lname, Pointer_O::create(ip), nil(), nil(), nil(), nil(), false, nil(), INTERN_(kw, c_PLUS__PLUS_), false); } +// Arena-trampoline frame: the PC lives in an mmap'd arena slot, so there's +// no ObjectFile and no DWARF. The side table gives us the trampoline's name. +// The trampoline IR saves (closure, nargs, args) to its own stack frame at +// fixed negative offsets from rbp; recover them here so the frame shows the +// call's arguments alongside the function name. 
+// +// Layout (matches the volatile stores in build_trampoline_ir): +// [fbp - 0x20] = closure +// [fbp - 0x18] = nargs +// [fbp - 0x10] = args (T_O**) +static DebuggerFrame_sp make_arena_trampoline_frame(size_t /*fi*/, void* ip, void* fbp, + const llvmo::TrampolineEntry* entry) { + T_sp lname = SimpleBaseString_O::make(entry->name); + T_sp closure = nil(); + ql::list largs; + bool args_available = false; + if (fbp) { + uint8_t* p = (uint8_t*)fbp; + T_O* saved_closure = *(T_O**)(p - 0x20); + size_t saved_nargs = *(size_t*)(p - 0x18); + T_O** saved_args = *(T_O***)(p - 0x10); + if (saved_nargs <= 256) { // sanity bound + closure = T_sp((gctools::Tagged)saved_closure); + for (size_t i = 0; i < saved_nargs; ++i) { + T_sp temp((gctools::Tagged)saved_args[i]); + largs << temp; + } + args_available = true; + } + } + return DebuggerFrame_O::make(lname, Pointer_O::create(ip), nil(), nil(), + closure, largs.cons(), args_available, nil(), + INTERN_(kw, lisp), false); +} + static DebuggerFrame_sp make_frame(size_t fi, void* absolute_ip, const char* string, void* fbp, unsigned char*& bytecode_pc, T_O**& bytecode_fp) { MaybeTrace trace(__FUNCTION__); + // DIAGNOSTIC: log every PC the stack walker hands us, plus whether the + // arena recognized it. Set CLASP_BT_DEBUG=1 to enable. + static const bool s_bt_debug = (getenv("CLASP_BT_DEBUG") != nullptr); + if (s_bt_debug) { + bool owned = llvmo::arena_owns_pc((uintptr_t)absolute_ip); + fprintf(stderr, "[bt-debug] frame %zu ip=%p arena_owns=%d symname='%s'\n", + fi, absolute_ip, (int)owned, string ? string : "(null)"); + fflush(stderr); + } + // Check the trampoline arena first — arena slots aren't in any ObjectFile. + if (const llvmo::TrampolineEntry* e = llvmo::arena_lookup_by_pc((uintptr_t)absolute_ip)) + return make_arena_trampoline_frame(fi, absolute_ip, fbp, e); T_sp of = llvmo::only_object_file_for_instruction_pointer(absolute_ip); if (of.nilp()) - return make_cxx_frame(fi, absolute_ip, string, bytecode_pc, bytecode_fp); + return make_cxx_frame(fi, absolute_ip, string, fbp, bytecode_pc, bytecode_fp); // The absolute_ip is in an ObjectFile_O object - so it must be a lisp frame else return make_lisp_frame(fi, absolute_ip, string, gc::As_unsafe(of), fbp); @@ -570,6 +670,9 @@ static DebuggerFrame_sp make_frame(size_t fi, void* absolute_ip, const char* str static bool sanity_check_frame(size_t frameIndex, void* ip, void* fbp) { MaybeTrace trace(__FUNCTION__); + // Arena trampoline frames have no stackmap / register-save area, so there + // is nothing to sanity-check; treat them like a C++ frame. + if (llvmo::arena_owns_pc((uintptr_t)ip)) return true; T_sp of = llvmo::only_object_file_for_instruction_pointer(ip); if (of.nilp()) return true; // C++ frames are always fine for our purposes diff --git a/src/core/bytecode.cc b/src/core/bytecode.cc index 16df16cd16..384b4e934f 100644 --- a/src/core/bytecode.cc +++ b/src/core/bytecode.cc @@ -215,6 +215,70 @@ static Function_sp fdesignator_in_env(T_sp desig, T_sp env) { static unsigned char* long_dispatch(VirtualMachine&, unsigned char*, MultipleValues& multipleValues, T_O**, T_O**, Closure_O*, core::T_O**, core::T_O**, size_t, core::T_O**, uint8_t); +// Forward declaration of bytecode_vm (needed by helper functions below). +gctools::return_type +bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure, + core::T_O** fp, core::T_O** sp, size_t lcc_nargs, core::T_O** lcc_args); + +// Pop the topmost VMDynRecord and undo its effect. 
Used by the `unbind` +// opcode, the `_return` defensive drain, and the outer catch handler — all +// need to reverse one record worth of dynamic state. +static inline void vm_pop_dyn_record(VirtualMachine& vm) { + VMDynRecord* r = --vm._dynRecordTop; + switch (r->kind) { + case VMDynKind::SpecialBind: { + VariableCell_sp cell((gctools::Tagged)r->slot0); + T_sp oldval((gctools::Tagged)r->slot1); + my_thread->dynEnvStackSet(CONS_CDR(my_thread->dynEnvStackGet())); + cell->unbind(oldval); + break; + } + case VMDynKind::Progv: { + SimpleVector_sp cells((gctools::Tagged)r->slot0); + SimpleVector_sp oldvals((gctools::Tagged)r->slot1); + my_thread->dynEnvStackSet(CONS_CDR(my_thread->dynEnvStackGet())); + for (size_t i = 0; i < cells->length(); ++i) + cells->vref(i).as_unsafe()->unbind(oldvals->vref(i)); + break; + } + case VMDynKind::UnwindProtect: { + T_sp cleanup_fn((gctools::Tagged)r->slot0); + // Pop the SJLJ barrier (UnknownDynEnv_O) we pushed for this protect. + my_thread->dynEnvStackSet(CONS_CDR(my_thread->dynEnvStackGet())); + // Save multiple values and _UnwindDest across the cleanup (it may + // funcall arbitrary code), then run with *interrupts-enabled* = NIL. + T_sp dest = my_thread->_UnwindDest; + size_t dindex = my_thread->_UnwindDestIndex; + MultipleValues& mv = lisp_multipleValues(); + size_t nvals = mv.getSize(); + T_O* mv_temp[nvals]; + mv.saveToTemp(nvals, mv_temp); + call_with_variable_bound(_sym_STARinterrupts_enabledSTAR, nil(), + [&]() { return eval::funcall(cleanup_fn); }); + mv.loadFromTemp(nvals, mv_temp); + my_thread->_UnwindDestIndex = dindex; + my_thread->_UnwindDest = dest; + break; + } + case VMDynKind::Catch: { + // Unwinding PAST a catch (non-matching throw or non-CatchThrow exit). + // Reset the dyn-env stack to the state it had before catch_8/16 + // pushed its CatchDynEnv + barrier pair. + my_thread->dynEnvStackSet(T_sp((gctools::Tagged)r->dynenv_mark)); + break; + } + case VMDynKind::Tagbody: { + // Unwinding PAST a tagbody (non-matching Unwind). Reset the dyn-env + // stack to the state it had before `entry` pushed its TagbodyDynEnv + // + barrier pair. + my_thread->dynEnvStackSet(T_sp((gctools::Tagged)r->dynenv_mark)); + break; + } + default: + break; + } +} + SYMBOL_EXPORT_SC_(KeywordPkg, name); #ifdef DEBUG_VIRTUAL_MACHINE __attribute__((optnone)) @@ -242,6 +306,113 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure #endif MultipleValues& multipleValues = core::lisp_multipleValues(); unsigned char* pc = vm._pc; + + // Mark of the VM dynenv-record stack at entry to this activation. The outer + // try/catch(Unwind&) below walks the stack back down to this mark to run + // cleanups and decide whether to resume dispatch (on a matched + // tagbody/catch target) or rethrow to the caller. No opcodes push records + // yet — the scaffolding is inert until opcodes are migrated one at a time. + VMDynRecord* const dyn_entry_mark = vm._dynRecordTop; + + while (true) { + try { + +// ---------- Computed-goto dispatch ---------- +// GCC and Clang support the labels-as-values extension (&&label). +// Each opcode ends with VM_NEXT which jumps directly to the next +// opcode, avoiding the single indirect-branch bottleneck of a switch. +// Disabled when DEBUG_VIRTUAL_MACHINE is on so the switch is preserved +// for debugging/recording. 
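+//
+// A minimal standalone illustration of the extension (toy opcodes, not the
+// VM's set): each handler ends by jumping straight to the next handler.
+//
+//   static void* tbl[] = { &&op_inc, &&op_halt };
+//   goto *tbl[code[pc]];
+// op_inc:  ++acc; ++pc; goto *tbl[code[pc]];
+// op_halt: return acc;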
+#if defined(__GNUC__) && !defined(DEBUG_VIRTUAL_MACHINE) +#define USE_COMPUTED_GOTO 1 +#endif + +#ifdef USE_COMPUTED_GOTO + #define VM_CASE(name) vm_op_##name + #define VM_NEXT goto *dispatch_table[*pc] + static void* dispatch_table[256]; + static bool dispatch_table_init = false; + if (!dispatch_table_init) [[unlikely]] { + for (int i = 0; i < 256; ++i) dispatch_table[i] = &&vm_op_error; + dispatch_table[(uint8_t)vm_code::ref] = &&vm_op_ref; + dispatch_table[(uint8_t)vm_code::_const] = &&vm_op__const; + dispatch_table[(uint8_t)vm_code::closure] = &&vm_op_closure; + dispatch_table[(uint8_t)vm_code::call] = &&vm_op_call; + dispatch_table[(uint8_t)vm_code::call_receive_one] = &&vm_op_call_receive_one; + dispatch_table[(uint8_t)vm_code::call_receive_fixed] = &&vm_op_call_receive_fixed; + dispatch_table[(uint8_t)vm_code::bind] = &&vm_op_bind; + dispatch_table[(uint8_t)vm_code::set] = &&vm_op_set; + dispatch_table[(uint8_t)vm_code::make_cell] = &&vm_op_make_cell; + dispatch_table[(uint8_t)vm_code::cell_ref] = &&vm_op_cell_ref; + dispatch_table[(uint8_t)vm_code::cell_set] = &&vm_op_cell_set; + dispatch_table[(uint8_t)vm_code::make_closure] = &&vm_op_make_closure; + dispatch_table[(uint8_t)vm_code::make_uninitialized_closure] = &&vm_op_make_uninitialized_closure; + dispatch_table[(uint8_t)vm_code::initialize_closure] = &&vm_op_initialize_closure; + dispatch_table[(uint8_t)vm_code::_return] = &&vm_op__return; + dispatch_table[(uint8_t)vm_code::bind_required_args] = &&vm_op_bind_required_args; + dispatch_table[(uint8_t)vm_code::bind_optional_args] = &&vm_op_bind_optional_args; + dispatch_table[(uint8_t)vm_code::listify_rest_args] = &&vm_op_listify_rest_args; + dispatch_table[(uint8_t)vm_code::vaslistify_rest_args] = &&vm_op_vaslistify_rest_args; + dispatch_table[(uint8_t)vm_code::parse_key_args] = &&vm_op_parse_key_args; + dispatch_table[(uint8_t)vm_code::jump_8] = &&vm_op_jump_8; + dispatch_table[(uint8_t)vm_code::jump_16] = &&vm_op_jump_16; + dispatch_table[(uint8_t)vm_code::jump_24] = &&vm_op_jump_24; + dispatch_table[(uint8_t)vm_code::jump_if_8] = &&vm_op_jump_if_8; + dispatch_table[(uint8_t)vm_code::jump_if_16] = &&vm_op_jump_if_16; + dispatch_table[(uint8_t)vm_code::jump_if_24] = &&vm_op_jump_if_24; + dispatch_table[(uint8_t)vm_code::jump_if_supplied_8] = &&vm_op_jump_if_supplied_8; + dispatch_table[(uint8_t)vm_code::jump_if_supplied_16] = &&vm_op_jump_if_supplied_16; + dispatch_table[(uint8_t)vm_code::check_arg_count_LE] = &&vm_op_check_arg_count_LE; + dispatch_table[(uint8_t)vm_code::check_arg_count_GE] = &&vm_op_check_arg_count_GE; + dispatch_table[(uint8_t)vm_code::check_arg_count_EQ] = &&vm_op_check_arg_count_EQ; + dispatch_table[(uint8_t)vm_code::push_values] = &&vm_op_push_values; + dispatch_table[(uint8_t)vm_code::append_values] = &&vm_op_append_values; + dispatch_table[(uint8_t)vm_code::pop_values] = &&vm_op_pop_values; + dispatch_table[(uint8_t)vm_code::mv_call] = &&vm_op_mv_call; + dispatch_table[(uint8_t)vm_code::mv_call_receive_one] = &&vm_op_mv_call_receive_one; + dispatch_table[(uint8_t)vm_code::mv_call_receive_fixed] = &&vm_op_mv_call_receive_fixed; + dispatch_table[(uint8_t)vm_code::save_sp] = &&vm_op_save_sp; + dispatch_table[(uint8_t)vm_code::restore_sp] = &&vm_op_restore_sp; + dispatch_table[(uint8_t)vm_code::entry] = &&vm_op_entry; + dispatch_table[(uint8_t)vm_code::exit_8] = &&vm_op_exit_8; + dispatch_table[(uint8_t)vm_code::exit_16] = &&vm_op_exit_16; + dispatch_table[(uint8_t)vm_code::exit_24] = &&vm_op_exit_24; + dispatch_table[(uint8_t)vm_code::entry_close] = 
&&vm_op_entry_close; + dispatch_table[(uint8_t)vm_code::catch_8] = &&vm_op_catch_8; + dispatch_table[(uint8_t)vm_code::catch_16] = &&vm_op_catch_16; + dispatch_table[(uint8_t)vm_code::_throw] = &&vm_op__throw; + dispatch_table[(uint8_t)vm_code::catch_close] = &&vm_op_catch_close; + dispatch_table[(uint8_t)vm_code::special_bind] = &&vm_op_special_bind; + dispatch_table[(uint8_t)vm_code::symbol_value] = &&vm_op_symbol_value; + dispatch_table[(uint8_t)vm_code::symbol_value_set] = &&vm_op_symbol_value_set; + dispatch_table[(uint8_t)vm_code::unbind] = &&vm_op_unbind; + dispatch_table[(uint8_t)vm_code::progv] = &&vm_op_progv; + dispatch_table[(uint8_t)vm_code::fdefinition] = &&vm_op_fdefinition; + dispatch_table[(uint8_t)vm_code::nil] = &&vm_op_nil; + // vm_code::eq (55) — not yet implemented, stays as vm_op_error + dispatch_table[(uint8_t)vm_code::push] = &&vm_op_push; + dispatch_table[(uint8_t)vm_code::pop] = &&vm_op_pop; + dispatch_table[(uint8_t)vm_code::dup] = &&vm_op_dup; + dispatch_table[(uint8_t)vm_code::fdesignator] = &&vm_op_fdesignator; + dispatch_table[(uint8_t)vm_code::called_fdefinition] = &&vm_op_called_fdefinition; + dispatch_table[(uint8_t)vm_code::protect] = &&vm_op_protect; + dispatch_table[(uint8_t)vm_code::cleanup] = &&vm_op_cleanup; + dispatch_table[(uint8_t)vm_code::encell] = &&vm_op_encell; + dispatch_table[(uint8_t)vm_code::_long] = &&vm_op__long; + dispatch_table_init = true; + } + VM_NEXT; // initial dispatch + + vm_op_error: { + SimpleFun_sp ep = closure->entryPoint(); + BytecodeModule_sp bcm = gc::As(ep)->code(); + unsigned char* codeStart = (unsigned char*)bcm->bytecode()->rowMajorAddressOfElement_(0); + unsigned char* codeEnd = codeStart + bcm->bytecode()->arrayTotalSize(); + SIMPLE_ERROR("Unknown opcode {} pc: {} module: {} - {}", *pc, (void*)pc, (void*)codeStart, (void*)codeEnd); + } +#else + #define VM_CASE(name) case vm_code::name + #define VM_NEXT break while (1) { VM_PC_CHECK(vm, pc, bytecode_start, bytecode_end); #if DEBUG_VM_RECORD_PLAYBACK == 1 @@ -252,48 +423,43 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure global_stackTrigger, stackHeight); return gctools::return_type(nil().raw_(), 0); } - // printf("%c", (unsigned char)(*vm._pc)+32); if (global_recordingFile) { VM_RECORD_PLAYBACK(vm._pc, "pc"); VM_RECORD_PLAYBACK(vm._stackPointer, "stackPointer"); } #endif switch ((vm_code)*pc) { - case vm_code::ref: { +#endif + VM_CASE(ref): { uint8_t n = *(++pc); DBG_VM1("ref %" PRIu8 "\n", n); vm.push(sp, *(vm.reg(fp, n))); pc++; - break; + VM_NEXT; } - case vm_code::_const: { + VM_CASE(_const): { uint8_t n = *(++pc); DBG_VM1("const %" PRIu8 "\n", n); T_O* value = literals[n]; vm.push(sp, value); VM_RECORD_PLAYBACK(value, "const"); pc++; - break; + VM_NEXT; } - case vm_code::closure: { + VM_CASE(closure): { uint8_t n = *(++pc); DBG_VM("closure %" PRIu8 "\n", n); vm.push(sp, closed[n]); pc++; - break; + VM_NEXT; } - case vm_code::call: { + VM_CASE(call): { uint8_t nargs = *(++pc); DBG_VM1("call %" PRIu8 "\n", nargs); T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); Function_sp func = gc::As_assert(tfunc); T_O** args = vm.stackref(sp, nargs - 1); maybe_step_call(__builtin_frame_address(0), func, nargs, args); - // We push the PC for the debugger (see make_bytecode_frame in backtrace.cc) - // We do this here rather than bytecode_call because e.g. we may call a - // non-bytecode function, that in turn calls a bunch of different bytecode - // functions, which may trash vm._pc making it unsuitable. 
- // We have to do this for all call instructions, not just this one. vm.push(sp, (T_O*)pc); vm._pc = pc; vm._stackPointer = sp; @@ -301,9 +467,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure multipleValues.setN(res.raw_(), res.number_of_values()); vm.drop(sp, nargs + 2); pc++; - break; + VM_NEXT; } - case vm_code::call_receive_one: { + VM_CASE(call_receive_one): { uint8_t nargs = *(++pc); DBG_VM1("call-receive-one %" PRIu8 "\n", nargs); T_sp tfunc((gctools::Tagged)(*(vm.stackref(sp, nargs)))); @@ -327,9 +493,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, res.raw_()); VM_RECORD_PLAYBACK(res.raw_(), "vm_call_receive_one"); pc++; - break; + VM_NEXT; } - case vm_code::call_receive_fixed: { + VM_CASE(call_receive_fixed): { uint8_t nargs = *(++pc); uint8_t nvals = *(++pc); DBG_VM("call-receive-fixed %" PRIu8 " %" PRIu8 "\n", nargs, nvals); @@ -349,40 +515,40 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, multipleValues.valueGet(i, svalues).raw_()); } pc++; - break; + VM_NEXT; } - case vm_code::bind: { + VM_CASE(bind): { uint8_t nelems = *(++pc); uint8_t base = *(++pc); DBG_VM1("bind %" PRIu8 " %" PRIu8 "\n", nelems, base); vm.copytoreg(fp, vm.stackref(sp, nelems - 1), nelems, base); vm.drop(sp, nelems); pc++; - break; + VM_NEXT; } - case vm_code::set: { + VM_CASE(set): { uint8_t n = *(++pc); DBG_VM("set %" PRIu8 "\n", n); vm.setreg(fp, n, vm.pop(sp)); pc++; - break; + VM_NEXT; } - case vm_code::make_cell: { + VM_CASE(make_cell): { DBG_VM1("make-cell\n"); T_sp car((gctools::Tagged)(vm.pop(sp))); T_sp cdr((gctools::Tagged)nil().raw_()); vm.push(sp, Cons_O::create(car, cdr).raw_()); pc++; - break; + VM_NEXT; } - case vm_code::cell_ref: { + VM_CASE(cell_ref): { DBG_VM1("cell-ref\n"); T_sp cons((gctools::Tagged)vm.pop(sp)); vm.push(sp, cons.unsafe_cons()->car().raw_()); pc++; - break; + VM_NEXT; } - case vm_code::cell_set: { + VM_CASE(cell_set): { DBG_VM("cell-set\n"); T_sp cons((gctools::Tagged)vm.pop(sp)); Cons_sp ccons = gc::As_assert(cons); @@ -390,9 +556,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tval((gctools::Tagged)val); ccons->rplaca(tval); pc++; - break; + VM_NEXT; } - case vm_code::make_closure: { + VM_CASE(make_closure): { uint8_t c = *(++pc); DBG_VM("make-closure %" PRIu8 "\n", c); T_sp fn_sp((gctools::Tagged)literals[c]); @@ -400,14 +566,13 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure size_t nclosed = fn->environmentSize(); DBG_VM(" nclosed = %zu\n", nclosed); Closure_sp closure = Closure_O::make_bytecode_closure(fn, nclosed); - // FIXME: Can we use some more abstracted access? 
vm.copyto(sp, nclosed, (T_O**)(closure->_Slots.data())); vm.drop(sp, nclosed); vm.push(sp, closure.raw_()); pc++; - break; + VM_NEXT; } - case vm_code::make_uninitialized_closure: { + VM_CASE(make_uninitialized_closure): { uint8_t c = *(++pc); DBG_VM("make-uninitialized-closure %" PRIu8 "\n", c); T_sp fn_sp((gctools::Tagged)literals[c]); @@ -417,51 +582,56 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure Closure_sp closure = Closure_O::make_bytecode_closure(fn, nclosed); vm.push(sp, closure.raw_()); pc++; - break; + VM_NEXT; } - case vm_code::initialize_closure: { + VM_CASE(initialize_closure): { uint8_t c = *(++pc); DBG_VM("initialize-closure %" PRIu8 "\n", c); T_sp tclosure((gctools::Tagged)(*(vm.reg(fp, c)))); Closure_sp closure = gc::As_assert(tclosure); - // FIXME: We ought to be able to get the closure size directly - // from the closure through some nice method. BytecodeSimpleFun_sp fn = gc::As_assert(closure->entryPoint()); size_t nclosed = fn->environmentSize(); DBG_VM(" nclosed = %zu\n", nclosed); vm.copyto(sp, nclosed, (T_O**)(closure->_Slots.data())); vm.drop(sp, nclosed); pc++; - break; + VM_NEXT; } - case vm_code::_return: { + VM_CASE(_return): { DBG_VM1("return\n"); - // since the stack pointer is a local variable we don't need to - // adjust it. + // Drain any VM dynenv records still established within this + // activation. The old recursive-bytecode_vm flow relied on RAII to + // restore bindings on normal return even when the compiler didn't + // emit an explicit unbind before _return; we match that semantics. + // Sync sp/pc in case the drain runs an unwind-protect cleanup that + // recurses into bytecode_call. + vm._pc = pc; + vm._stackPointer = sp; + while (vm._dynRecordTop > dyn_entry_mark) + vm_pop_dyn_record(vm); size_t nvalues = multipleValues.getSize(); return gctools::return_type(multipleValues.valueGet(0, nvalues).raw_(), nvalues); } - case vm_code::bind_required_args: { + VM_CASE(bind_required_args): { uint8_t nargs = *(++pc); DBG_VM("bind-required-args %" PRIu8 "\n", nargs); vm.copytoreg(fp, lcc_args, nargs, 0); pc++; - break; + VM_NEXT; } - case vm_code::bind_optional_args: { + VM_CASE(bind_optional_args): { uint8_t nreq = *(++pc); uint8_t nopt = *(++pc); DBG_VM("bind-optional-args %" PRIu8 " %" PRIu8 "\n", nreq, nopt); for (size_t i = nreq + nopt; i > lcc_nargs; --i) vm.push(sp, unbound().raw_()); - // Push provided args, last arg first. for (size_t j = std::min((size_t)nreq+(size_t)nopt, lcc_nargs); j > nreq; --j) vm.push(sp, lcc_args[j - 1]); pc++; - break; + VM_NEXT; } - case vm_code::listify_rest_args: { + VM_CASE(listify_rest_args): { uint8_t start = *(++pc); DBG_VM("listify-rest-args %" PRIu8 "\n", start); ql::list rest; @@ -471,21 +641,17 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure } vm.push(sp, rest.cons().raw_()); pc++; - break; + VM_NEXT; } - case vm_code::vaslistify_rest_args: { - // - // This pushes two vaslist structures (each two words that look like fixnums) - // onto the stack. 
the theVaslist_backup is used by vaslist_rewind - // + VM_CASE(vaslistify_rest_args): { uint8_t start = *(++pc); DBG_VM("vaslistify-rest-args %" PRIu8 "\n", start); auto theVaslist = vm.alloca_vaslist2(sp, lcc_args + start, lcc_nargs - start); vm.push(sp, theVaslist); pc++; - break; + VM_NEXT; } - case vm_code::parse_key_args: { + VM_CASE(parse_key_args): { uint8_t more_start = *(++pc); uint8_t key_count_info = *(++pc); uint8_t key_literal_start = *(++pc); @@ -500,17 +666,12 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tclosure((gctools::Tagged)gctools::tag_general(closure)); throwOddKeywordsError(tclosure); } - // We grab keyword arguments from the end to the beginning. - // This means that earlier arguments are put in their variables - // last, matching the CL semantics. - // KLUDGE: We use a signed type so that if more_start is zero we don't - // wrap arg_index around. There's probably a cleverer solution. ptrdiff_t arg_index; for (arg_index = lcc_nargs - 1; arg_index >= more_start; arg_index -= 2) { bool valid_key_p = false; T_O* key = lcc_args[arg_index - 1]; if (key == kw::_sym_allow_other_keys.raw_()) { - valid_key_p = true; // aok is always valid. + valid_key_p = true; T_sp value((gctools::Tagged)(lcc_args[arg_index])); aokp = value.notnilp(); } @@ -533,7 +694,6 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp tclosure((gctools::Tagged)gctools::tag_general(closure)); throwUnrecognizedKeywordArgumentError(tclosure, unknown_keys); } - // Finally, push keys to the stack. for (size_t i = 0; i < key_count; ++i) { size_t key_id = key_count - i - 1; T_sp key((gctools::Tagged)literals[key_id + key_literal_start]); @@ -541,27 +701,27 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, value.raw_()); } pc++; - break; + VM_NEXT; } - case vm_code::jump_8: { + VM_CASE(jump_8): { int8_t rel = *(pc + 1); DBG_VM1("jump %" PRId8 "\n", rel); pc += rel; - break; + VM_NEXT; } - case vm_code::jump_16: { + VM_CASE(jump_16): { int16_t rel = read_s16(pc + 1); DBG_VM("jump %" PRId16 "\n", rel); pc += rel; - break; + VM_NEXT; } - case vm_code::jump_24: { + VM_CASE(jump_24): { int32_t rel = read_label(pc, 3); DBG_VM("jump %" PRId32 "\n", rel); pc += rel; - break; + VM_NEXT; } - case vm_code::jump_if_8: { + VM_CASE(jump_if_8): { int8_t rel = *(pc + 1); DBG_VM1("jump-if %" PRId8 "\n", rel); T_sp tval((gctools::Tagged)vm.pop(sp)); @@ -570,9 +730,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure pc += rel; else pc += 2; - break; + VM_NEXT; } - case vm_code::jump_if_16: { + VM_CASE(jump_if_16): { int16_t rel = read_s16(pc + 1); DBG_VM("jump-if %" PRId16 "\n", rel); T_sp tval((gctools::Tagged)vm.pop(sp)); @@ -580,9 +740,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure pc += rel; else pc += 3; - break; + VM_NEXT; } - case vm_code::jump_if_24: { + VM_CASE(jump_if_24): { int32_t rel = read_label(pc, 3); DBG_VM("jump-if %" PRId32 "\n", rel); T_sp tval((gctools::Tagged)vm.pop(sp)); @@ -590,9 +750,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure pc += rel; else pc += 4; - break; + VM_NEXT; } - case vm_code::jump_if_supplied_8: { + VM_CASE(jump_if_supplied_8): { int32_t rel = *(pc + 1); DBG_VM("jump-if-supplied %" PRId8 "\n", rel); T_sp tval((gctools::Tagged)(vm.pop(sp))); @@ -602,9 +762,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, 
tval.raw_()); pc += rel; } - break; + VM_NEXT; } - case vm_code::jump_if_supplied_16: { + VM_CASE(jump_if_supplied_16): { int32_t rel = read_s16(pc + 1); DBG_VM("jump-if-supplied %" PRId16 "\n", rel); T_sp tval((gctools::Tagged)(vm.pop(sp))); @@ -614,9 +774,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, tval.raw_()); pc += rel; } - break; + VM_NEXT; } - case vm_code::check_arg_count_LE: { + VM_CASE(check_arg_count_LE): { uint8_t max_nargs = *(++pc); DBG_VM("check-arg-count<= %" PRIu8 "\n", max_nargs); if (lcc_nargs > max_nargs) { @@ -624,9 +784,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure throwTooManyArgumentsError(tclosure, lcc_nargs, max_nargs); } pc++; - break; + VM_NEXT; } - case vm_code::check_arg_count_GE: { + VM_CASE(check_arg_count_GE): { uint8_t min_nargs = *(++pc); DBG_VM("check-arg-count>= %" PRIu8 "\n", min_nargs); if (lcc_nargs < min_nargs) { @@ -634,9 +794,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure throwTooFewArgumentsError(tclosure, lcc_nargs, min_nargs); } pc++; - break; + VM_NEXT; } - case vm_code::check_arg_count_EQ: { + VM_CASE(check_arg_count_EQ): { uint8_t req_nargs = *(++pc); DBG_VM1("check-arg-count= %" PRIu8 "\n", req_nargs); if (lcc_nargs != req_nargs) { @@ -644,21 +804,19 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure wrongNumberOfArguments(tclosure, lcc_nargs, req_nargs); } pc++; - break; + VM_NEXT; } - case vm_code::push_values: { - // TODO: Direct copy? + VM_CASE(push_values): { DBG_VM("push-values\n"); size_t nvalues = multipleValues.getSize(); DBG_VM(" nvalues = %zu\n", nvalues); for (size_t i = 0; i < nvalues; ++i) vm.push(sp, multipleValues.valueGet(i, nvalues).raw_()); - // We could skip tagging this, but that's error-prone. 
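+        // Stack effect, e.g. for (values :a :b): [... :a :b 2], with the
+        // count pushed last as a tagged fixnum.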
vm.push(sp, make_fixnum(nvalues).raw_()); pc++; - break; + VM_NEXT; } - case vm_code::append_values: { + VM_CASE(append_values): { DBG_VM("append-values\n"); T_sp texisting_values((gctools::Tagged)vm.pop(sp)); size_t existing_values = texisting_values.unsafe_fixnum(); @@ -669,9 +827,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, multipleValues.valueGet(i, nvalues).raw_()); vm.push(sp, make_fixnum(nvalues + existing_values).raw_()); pc++; - break; + VM_NEXT; } - case vm_code::pop_values: { + VM_CASE(pop_values): { DBG_VM("pop-values\n"); T_sp texisting_values((gctools::Tagged)vm.pop(sp)); size_t existing_values = texisting_values.unsafe_fixnum(); @@ -680,9 +838,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure multipleValues.setSize(existing_values); vm.drop(sp, existing_values); pc++; - break; + VM_NEXT; } - case vm_code::mv_call: { + VM_CASE(mv_call): { DBG_VM("mv-call\n"); T_sp tnargs((gctools::Tagged)vm.pop(sp)); size_t nargs = tnargs.unsafe_fixnum(); @@ -695,12 +853,12 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm._pc = pc; vm._stackPointer = sp; T_mv res = func->apply_raw(nargs, args); - vm.drop(sp, nargs + 1 + 1); // 1 each for func, pc + vm.drop(sp, nargs + 1 + 1); multipleValues.setN(res.raw_(), res.number_of_values()); pc++; - break; + VM_NEXT; } - case vm_code::mv_call_receive_one: { + VM_CASE(mv_call_receive_one): { DBG_VM("mv-call-receive-one\n"); T_sp tnargs((gctools::Tagged)vm.pop(sp)); size_t nargs = tnargs.unsafe_fixnum(); @@ -717,9 +875,9 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure multipleValues.set1(res); vm.push(sp, res.raw_()); pc++; - break; + VM_NEXT; } - case vm_code::mv_call_receive_fixed: { + VM_CASE(mv_call_receive_fixed): { uint8_t nvals = *(++pc); DBG_VM("mv-call-receive-fixed %" PRIu8 "\n", nvals); T_sp tnargs((gctools::Tagged)vm.pop(sp)); @@ -740,49 +898,56 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure vm.push(sp, multipleValues.valueGet(i, svalues).raw_()); } pc++; - break; + VM_NEXT; } - case vm_code::save_sp: { + VM_CASE(save_sp): { uint8_t n = *(++pc); DBG_VM("save sp %" PRIu8 "\n", n); vm.savesp(fp, sp, n); pc++; - break; + VM_NEXT; } - case vm_code::restore_sp: { + VM_CASE(restore_sp): { uint8_t n = *(++pc); DBG_VM("restore sp %" PRIu8 "\n", n); vm.restoresp(fp, sp, n); pc++; - break; + VM_NEXT; } - case vm_code::entry: { + VM_CASE(entry): { uint8_t n = *(++pc); DBG_VM("entry %" PRIu8 "\n", n); pc++; - jmp_buf target; - void* frame = __builtin_frame_address(0); - vm._pc = pc; - TagbodyDynEnv_sp env = TagbodyDynEnv_O::create(frame, &target); - vm.setreg(fp, n, env.raw_()); - gctools::StackAllocate sa_ec(env, my_thread->dynEnvStackGet()); - DynEnvPusher dep(my_thread, sa_ec.asSmartPtr()); - _setjmp(target); - again: - try { - bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args); - sp = vm._stackPointer; - pc = vm._pc; - } catch (Unwind& uw) { - if (uw.getFrame() == frame) { - my_thread->dynEnvStackGet() = sa_ec.asSmartPtr(); - goto again; - } else - throw; - } - break; - } - case vm_code::exit_8: { + // Use the future VMDynRecord address as a unique "frame" identifier + // for this tagbody. __builtin_frame_address would collide across + // multiple tagbodies in the same bytecode_vm activation (we no + // longer have per-tagbody C++ frames). 
+ ASSERT(vm._dynRecordTop < vm._dynRecordLimit); + VMDynRecord* r = vm._dynRecordTop; + // TagbodyDynEnv_O::frame gets our record pointer. Its jmp_buf is + // nullptr; the SJLJ barrier below forces sjlj_unwind to fall back + // to `throw Unwind(r, 1)`, which our outer handler matches by + // frame and resumes dispatch. + TagbodyDynEnv_sp env = TagbodyDynEnv_O::create((void*)r, (jmp_buf*)nullptr); + vm.setreg(fp, n, env.raw_()); // compiler stores env in this reg for `go` + T_sp dynenv_entry = my_thread->dynEnvStackGet(); + Cons_sp env_cons = Cons_O::create(env, dynenv_entry); + UnknownDynEnv_sp barrier = gctools::GC::allocate(); + Cons_sp barrier_cons = Cons_O::create(barrier, env_cons); + my_thread->dynEnvStackSet(barrier_cons); + // Commit the record. + vm._dynRecordTop++; + r->kind = VMDynKind::Tagbody; + r->frame = (void*)r; + r->slot0 = env.raw_(); // keep env rooted through the record + r->slot1 = nullptr; + r->sp_mark = sp; + r->fp_mark = fp; + r->target_pc = nullptr; // target set by exit_* into vm._pc + r->dynenv_mark = dynenv_entry.raw_(); + VM_NEXT; + } + VM_CASE(exit_8): { int8_t rel = *(pc + 1); DBG_VM("exit %" PRId8 "\n", rel); vm._pc = pc + rel; @@ -790,7 +955,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure TagbodyDynEnv_sp tde = gc::As_assert(ttde); sjlj_unwind(tde, 1); } - case vm_code::exit_16: { + VM_CASE(exit_16): { int16_t rel = read_s16(pc + 1); DBG_VM("exit %" PRId16 "\n", rel); vm._pc = pc + rel; @@ -798,7 +963,7 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure TagbodyDynEnv_sp tde = gc::As_assert(ttde); sjlj_unwind(tde, 1); } - case vm_code::exit_24: { + VM_CASE(exit_24): { int32_t rel = read_label(pc, 3); DBG_VM("exit %" PRId32 "\n", rel); vm._pc = pc + rel; @@ -806,88 +971,128 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure TagbodyDynEnv_sp tde = gc::As_assert(ttde); sjlj_unwind(tde, 1); } - case vm_code::entry_close: { + VM_CASE(entry_close): { DBG_VM("entry-close\n"); - // This sham return value just gets us out of the bytecode_vm call in - // vm_code::entry, above. - vm._pc = pc + 1; - vm._stackPointer = sp; - return gctools::return_type(nil().raw_(), 0); + // Normal exit from a tagbody: body completed without a matching + // `go`. Pop the Tagbody record and reset the dyn-env stack. + ASSERT(vm._dynRecordTop > vm._dynRecordBottom); + VMDynRecord* r = --vm._dynRecordTop; + ASSERT(r->kind == VMDynKind::Tagbody); + my_thread->dynEnvStackSet(T_sp((gctools::Tagged)r->dynenv_mark)); + pc++; + VM_NEXT; } - case vm_code::catch_8: { + VM_CASE(catch_8): { int8_t rel = *(pc + 1); DBG_VM("catch-8 %" PRId8 "\n", rel); - unsigned char* target = pc + rel; - bool thrown = true; + unsigned char* catch_target = pc + rel; pc += 2; T_sp tag((gctools::Tagged)(vm.pop(sp))); - vm._pc = pc; - call_with_catch(tag, [&]() { - T_mv result = bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args); - thrown = false; - return result; - }); - if (thrown) pc = target; - else { - pc = vm._pc; - sp = vm._stackPointer; - } - break; - } - case vm_code::catch_16: { + // Two conses on the dyn-env stack: + // - CatchDynEnv_O carrying the tag, so sjlj_throw_search can find + // the catch by tag. Its jmp_buf is nullptr; it is never jumped to. + // - UnknownDynEnv_O barrier above it, so sjlj_throw falls back to + // throw CatchThrow (our outer handler catches and matches). 
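// Sketch of the resulting dyn-env list (innermost first; `outer` stands for
// whatever the thread held before this opcode ran):
//
//   (#<UnknownDynEnv barrier> #<CatchDynEnv tag> . outer)
//
// sjlj_throw_search still finds the CatchDynEnv by tag, but the unwind
// attempt meets the barrier first and falls back to the C++ throw, which
// the catch (CatchThrow&) handler at the bottom of this function matches
// against the Catch records on the VM dyn-record stack.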
+ T_sp dynenv_entry = my_thread->dynEnvStackGet(); + CatchDynEnv_sp cde = gctools::GC::allocate((jmp_buf*)nullptr, tag); + Cons_sp cde_cons = Cons_O::create(cde, dynenv_entry); + UnknownDynEnv_sp barrier = gctools::GC::allocate(); + Cons_sp barrier_cons = Cons_O::create(barrier, cde_cons); + my_thread->dynEnvStackSet(barrier_cons); + ASSERT(vm._dynRecordTop < vm._dynRecordLimit); + VMDynRecord* r = vm._dynRecordTop++; + r->kind = VMDynKind::Catch; + r->frame = nullptr; + r->slot0 = tag.raw_(); + r->slot1 = nullptr; + r->sp_mark = sp; + r->fp_mark = fp; + r->target_pc = catch_target; + r->dynenv_mark = dynenv_entry.raw_(); + VM_NEXT; + } + VM_CASE(catch_16): { int16_t rel = read_s16(pc + 1); - DBG_VM("catch-8 %" PRId16 "\n", rel); - unsigned char* target = pc + rel; - bool thrown = true; + DBG_VM("catch-16 %" PRId16 "\n", rel); + unsigned char* catch_target = pc + rel; pc += 3; T_sp tag((gctools::Tagged)(vm.pop(sp))); - vm._pc = pc; - call_with_catch(tag, [&]() { - T_mv result = bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args); - thrown = false; - return result; - }); - if (thrown) pc = target; - else { - pc = vm._pc; - sp = vm._stackPointer; - } - break; - } - case vm_code::_throw: { + T_sp dynenv_entry = my_thread->dynEnvStackGet(); + CatchDynEnv_sp cde = gctools::GC::allocate((jmp_buf*)nullptr, tag); + Cons_sp cde_cons = Cons_O::create(cde, dynenv_entry); + UnknownDynEnv_sp barrier = gctools::GC::allocate(); + Cons_sp barrier_cons = Cons_O::create(barrier, cde_cons); + my_thread->dynEnvStackSet(barrier_cons); + ASSERT(vm._dynRecordTop < vm._dynRecordLimit); + VMDynRecord* r = vm._dynRecordTop++; + r->kind = VMDynKind::Catch; + r->frame = nullptr; + r->slot0 = tag.raw_(); + r->slot1 = nullptr; + r->sp_mark = sp; + r->fp_mark = fp; + r->target_pc = catch_target; + r->dynenv_mark = dynenv_entry.raw_(); + VM_NEXT; + } + VM_CASE(_throw): { DBG_VM("throw\n"); T_sp tag((gctools::Tagged)(vm.pop(sp))); sjlj_throw(tag); } - case vm_code::catch_close: { - DBG_VM("entry-close\n"); - vm._pc = pc + 1; - vm._stackPointer = sp; - return gctools::return_type(nil().raw_(), 0); + VM_CASE(catch_close): { + DBG_VM("catch-close\n"); + // Normal exit from a catch scope: the body completed without a + // matching throw. Pop the Catch record and reset the dyn-env stack + // to the state it had before catch_8/16 pushed its two conses. + ASSERT(vm._dynRecordTop > vm._dynRecordBottom); + VMDynRecord* r = --vm._dynRecordTop; + ASSERT(r->kind == VMDynKind::Catch); + my_thread->dynEnvStackSet(T_sp((gctools::Tagged)r->dynenv_mark)); + pc++; + VM_NEXT; } - case vm_code::special_bind: { + VM_CASE(special_bind): { uint8_t c = *(++pc); DBG_VM("special-bind %" PRIu8 "\n", c); T_sp value((gctools::Tagged)(vm.pop(sp))); pc++; - T_sp cell((gctools::Tagged)literals[c]); - vm._pc = pc; - call_with_cell_bound(gc::As_assert(cell), value, - [&]() { return bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args); }); - sp = vm._stackPointer; - pc = vm._pc; - break; - } - case vm_code::symbol_value: { + // Inline the binding instead of recursing into bytecode_vm. The paired + // `unbind` opcode pops the record and restores the value. + T_sp cell_sp((gctools::Tagged)literals[c]); + VariableCell_sp cell = gc::As_assert(cell_sp); + T_sp oldval = cell->bind(value); + // Keep a BindingDynEnv_O on the dyn-env stack so that SJLJ unwinders + // that longjmp past this frame still call BindingDynEnv_O::proceed() + // and restore the binding. 
We heap-allocate it because it must live + // across opcodes, not in a C++ stack scope. + BindingDynEnv_sp bde = gctools::GC::allocate(cell, oldval); + Cons_sp bde_cons = Cons_O::create(bde, my_thread->dynEnvStackGet()); + my_thread->dynEnvStackSet(bde_cons); + // Push a VM record so `unbind` and the outer catch(Unwind&) handler + // can find and restore the binding without a recursive bytecode_vm. + ASSERT(vm._dynRecordTop < vm._dynRecordLimit); + VMDynRecord* r = vm._dynRecordTop++; + r->kind = VMDynKind::SpecialBind; + r->frame = nullptr; + r->slot0 = cell.raw_(); + r->slot1 = oldval.raw_(); + r->sp_mark = sp; + r->fp_mark = fp; + r->target_pc = nullptr; + r->dynenv_mark = bde_cons.raw_(); + VM_NEXT; + } + VM_CASE(symbol_value): { uint8_t c = *(++pc); DBG_VM("symbol-value %" PRIu8 "\n", c); T_sp cell_sp((gctools::Tagged)literals[c]); VariableCell_sp cell = gc::As_assert(cell_sp); vm.push(sp, cell->value().raw_()); pc++; - break; + VM_NEXT; } - case vm_code::symbol_value_set: { + VM_CASE(symbol_value_set): { uint8_t c = *(++pc); DBG_VM("symbol-value-set %" PRIu8 "\n", c); T_sp cell_sp((gctools::Tagged)literals[c]); @@ -895,34 +1100,53 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure T_sp value((gctools::Tagged)(vm.pop(sp))); cell->set_value(value); pc++; - break; + VM_NEXT; } - case vm_code::unbind: { + VM_CASE(unbind): { DBG_VM("unbind\n"); - vm._pc = pc + 1; + // Close the topmost dynenv record. One unbind closes one record: + // - SpecialBind — one cell restored + // - Progv — all N cells in the progv restored at once + // (compiler emits one unbind per special-bind, and one unbind for a + // whole progv regardless of N). Sync sp defensively for the + // (unexpected) UnwindProtect case. + ASSERT(vm._dynRecordTop > vm._dynRecordBottom); vm._stackPointer = sp; - // This return value is not actually used - we're just returning from - // a bytecode_vm recursively invoked by vm_special_bind above. - // (or vm_progv) - return gctools::return_type(nil().raw_(), 0); + vm_pop_dyn_record(vm); + pc++; + VM_NEXT; } - case vm_code::progv: { + VM_CASE(progv): { uint8_t c = *(++pc); DBG_VM1("progv %" PRIu8 "\n", c); T_sp env((gctools::Tagged)literals[c]); T_sp vals((gctools::Tagged)(vm.pop(sp))); T_sp vars((gctools::Tagged)(vm.pop(sp))); - vm._pc = ++pc; - fprogv_env(env, vars, vals, [&]() { return bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args); }); - sp = vm._stackPointer; - pc = vm._pc; - break; - } - case vm_code::fdefinition: { - // We have function cells in the literals vector. While these are - // themselves callable, we have to resolve the cell because we - // use vm_code::fdefinition for lookup of #'foo, which may e.g. - // have its type or identity tested. + pc++; + // Resolve symbols → cells, allocate an oldvals vector, and install the + // new bindings. One VMDynRecord covers all N bindings — one matching + // `unbind` opcode closes them all, matching compiler expectations. + SimpleVector_sp cells = resolve_progv_symbols(vars, env); + size_t ncells = cells->length(); + SimpleVector_sp oldvals = SimpleVector_O::make(ncells); + progv_set_values(cells, oldvals, vals); + // Keep a ProgvDynEnv_O on the dyn-env stack for SJLJ compatibility. 
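// vm_pop_dyn_record is relied on by `unbind` above and by the unwind
// handlers below, but its body is outside this hunk. A rough sketch of what
// it must do per record kind; the restore steps are illustrative
// assumptions, not the actual implementation:
//
//   static void vm_pop_dyn_record(VirtualMachine& vm) {
//     VMDynRecord* r = --vm._dynRecordTop;
//     switch (r->kind) {
//     case VMDynKind::SpecialBind:
//       // put slot1 (the value saved by cell->bind) back in slot0's cell
//       break;
//     case VMDynKind::Progv:
//       // restore all N cells (slot0) from the oldvals vector (slot1)
//       break;
//     case VMDynKind::UnwindProtect:
//       // run the cleanup closure in slot0 with *interrupts-enabled*
//       // bound to NIL, saving/restoring multiple values around the
//       // call exactly as the `cleanup` opcode does on the normal path
//       break;
//     case VMDynKind::Tagbody:
//     case VMDynKind::Catch:
//       break; // nothing to execute; only dyn-env state to reset
//     }
//     // ...then reset the thread's dyn-env list using r->dynenv_mark,
//     // whose exact meaning varies by kind (see the pushes above).
//   }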
+      ProgvDynEnv_sp pde = gctools::GC::allocate(cells, oldvals);
+      Cons_sp pde_cons = Cons_O::create(pde, my_thread->dynEnvStackGet());
+      my_thread->dynEnvStackSet(pde_cons);
+      ASSERT(vm._dynRecordTop < vm._dynRecordLimit);
+      VMDynRecord* r = vm._dynRecordTop++;
+      r->kind = VMDynKind::Progv;
+      r->frame = nullptr;
+      r->slot0 = cells.raw_();
+      r->slot1 = oldvals.raw_();
+      r->sp_mark = sp;
+      r->fp_mark = fp;
+      r->target_pc = nullptr;
+      r->dynenv_mark = pde_cons.raw_();
+      VM_NEXT;
+    }
+    VM_CASE(fdefinition): {
       uint8_t c = *(++pc);
       DBG_VM1("fdefinition %" PRIu8 "\n", c);
       T_sp cell((gctools::Tagged)literals[c]);
@@ -930,35 +1154,36 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       vm.push(sp, fun.raw_());
       VM_RECORD_PLAYBACK(fun.raw_(), "fdefinition");
       pc++;
-      break;
+      VM_NEXT;
     }
-    case vm_code::nil:
+    VM_CASE(nil): {
       DBG_VM("nil\n");
       vm.push(sp, nil().raw_());
       pc++;
-      break;
-    case vm_code::push: {
+      VM_NEXT;
+    }
+    VM_CASE(push): {
       DBG_VM1("push\n");
       vm.push(sp, multipleValues.valueGet(0, multipleValues.getSize()).raw_());
       pc++;
-      break;
+      VM_NEXT;
     }
-    case vm_code::pop: {
+    VM_CASE(pop): {
       DBG_VM1("pop\n");
       T_sp obj((gctools::Tagged)vm.pop(sp));
       multipleValues.set1(obj);
       pc++;
-      break;
+      VM_NEXT;
     }
-    case vm_code::dup: {
+    VM_CASE(dup): {
       DBG_VM1("dup\n");
       T_O* obj = vm.pop(sp);
       vm.push(sp, obj);
       vm.push(sp, obj);
       pc++;
-      break;
+      VM_NEXT;
     }
-    case vm_code::fdesignator: {
+    VM_CASE(fdesignator): {
       uint8_t c = *(++pc);
       DBG_VM1("fdesignator %" PRIu8 "\n", c);
       T_sp env((gctools::Tagged)literals[c]);
@@ -967,75 +1192,91 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure
       vm.push(sp, fun.raw_());
       VM_RECORD_PLAYBACK(fun.raw_(), "fdesignator");
       pc++;
-      break;
-    }
-    case vm_code::called_fdefinition: {
-      // This is like FDEFINITION except that we know the result will
-      // just be called. So, we can just use the cell directly
-      // without checking fboundedness, and this is just like const.
-      // (const would be different on an implementation that doesn't
-      // have funcallable function cells.)
+      VM_NEXT;
+    }
+    VM_CASE(called_fdefinition): {
       uint8_t c = *(++pc);
       DBG_VM1("called-fdefinition %" PRIu8 "\n", c);
       T_O* fun = literals[c];
       vm.push(sp, fun);
       VM_RECORD_PLAYBACK(fun, "called-fdefinition");
       pc++;
-      break;
+      VM_NEXT;
     }
-    case vm_code::protect: {
+    VM_CASE(protect): {
       uint8_t c = *(++pc);
       DBG_VM("protect %" PRIu8 "\n", c);
-      // Build a closure - this works mostly like make_closure.
       T_sp fn_sp((gctools::Tagged)literals[c]);
       BytecodeSimpleFun_sp fn = fn_sp.as_assert();
       size_t nclosed = fn->environmentSize();
       DBG_VM(" nclosed = %zu\n", nclosed);
-      // Technically we could avoid consing a closure when nclosed = 0
-      // but I don't know that it's worth the trouble.
       Closure_sp cleanup = Closure_O::make_bytecode_closure(fn, nclosed);
       vm.copyto(sp, nclosed, (T_O**)(cleanup->_Slots.data()));
       vm.drop(sp, nclosed);
-      // Now stick it onto the dynamic environment.
-      vm._pc = ++pc;
-      T_mv result = funwind_protect([&]() {
-        return bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args);
-      },
-      [&]() { eval::funcall(cleanup); });
-      // copied from vm_code::call - required to avoid the cleanup's values
-      // for... some reason. I'm not totally sure.
-      multipleValues.setN(result.raw_(), result.number_of_values());
-      sp = vm._stackPointer;
-      pc = vm._pc;
-      break;
-    }
-    case vm_code::cleanup: {
+      pc++;
+      // Push an UnknownDynEnv_O on the dyn-env stack.
Its search() returns + // FallBack, so any SJLJ unwind that tries to cross this protect will + // fall back to `throw Unwind` (or `throw CatchThrow` for sjlj_throw), + // which our outer C++ handler catches and drives the cleanup through + // vm_pop_dyn_record. This replaces the setjmp inside the old + // funwind_protect — the cleanup closure lives in the VMDynRecord. + UnknownDynEnv_sp barrier = gctools::GC::allocate(); + Cons_sp barrier_cons = Cons_O::create(barrier, my_thread->dynEnvStackGet()); + my_thread->dynEnvStackSet(barrier_cons); + ASSERT(vm._dynRecordTop < vm._dynRecordLimit); + VMDynRecord* r = vm._dynRecordTop++; + r->kind = VMDynKind::UnwindProtect; + r->frame = nullptr; + r->slot0 = cleanup.raw_(); + r->slot1 = nullptr; + r->sp_mark = sp; + r->fp_mark = fp; + r->target_pc = nullptr; + r->dynenv_mark = barrier_cons.raw_(); + VM_NEXT; + } + VM_CASE(cleanup): { DBG_VM("cleanup\n"); + // Normal-path cleanup: the protected body finished without unwinding. + // Pop the VM record and the SJLJ barrier, save the body's multiple + // values, run the cleanup thunk with *interrupts-enabled* = NIL, + // restore the values, and continue to the next opcode. On the unwind + // path the same work is done by vm_pop_dyn_record. + ASSERT(vm._dynRecordTop > vm._dynRecordBottom); + VMDynRecord* r = --vm._dynRecordTop; + ASSERT(r->kind == VMDynKind::UnwindProtect); + T_sp cleanup_fn((gctools::Tagged)r->slot0); + my_thread->dynEnvStackSet(CONS_CDR(my_thread->dynEnvStackGet())); + // Sync local sp / pc into the VM before calling out: the cleanup + // closure may recurse into bytecode_call which reads vm._stackPointer + // and vm._pc when setting up the callee frame. vm._pc = pc + 1; vm._stackPointer = sp; - // We need to return the actual current values, or at least - // their correct count, so that funwind_protect can save them. - size_t nvalues = multipleValues.getSize(); - return gctools::return_type(multipleValues.valueGet(0, nvalues).raw_(), nvalues); + size_t nvals = multipleValues.getSize(); + T_O* mv_temp[nvals]; + multipleValues.saveToTemp(nvals, mv_temp); + call_with_variable_bound(_sym_STARinterrupts_enabledSTAR, nil(), + [&]() { return eval::funcall(cleanup_fn); }); + multipleValues.loadFromTemp(nvals, mv_temp); + sp = vm._stackPointer; + pc++; + VM_NEXT; } - case vm_code::encell: { - // abbreviation for ref N; make-cell; set N + VM_CASE(encell): { uint8_t n = *(++pc); DBG_VM1("encell %" PRIu8 "\n", n); T_sp val((gctools::Tagged)(*(vm.reg(fp, n)))); vm.setreg(fp, n, Cons_O::create(val, nil()).raw_()); pc++; - break; + VM_NEXT; } - case vm_code::_long: { - // In a separate function to facilitate better icache utilization - // by bytecode_vm (hopefully) + VM_CASE(_long): { pc++; - // FIXME: This is a stupid way of returning two values. pc = long_dispatch(vm, pc, multipleValues, literals, closed, closure, fp, sp, lcc_nargs, lcc_args, *pc); sp = vm._stackPointer; - break; + VM_NEXT; } +#ifndef USE_COMPUTED_GOTO default: SimpleFun_sp ep = closure->entryPoint(); BytecodeModule_sp bcm = gc::As(ep)->code(); @@ -1044,6 +1285,75 @@ bytecode_vm(VirtualMachine& vm, T_O** literals, T_O** closed, Closure_O* closure SIMPLE_ERROR("Unknown opcode {} pc: {} module: {} - {}", *pc, (void*)pc, (void*)codeStart, (void*)codeEnd); }; } +#endif + } catch (Unwind& uw) { + // An exit_* from within our activation (or any Unwind bubbling up + // from below). 
Walk records from the top down, running cleanups + // for SpecialBind / Progv / UnwindProtect and resetting dyn-env + // state for Catch / Tagbody records we pass. If we find a Tagbody + // record whose frame matches uw.getFrame(), resume dispatch at + // the target pc stashed in vm._pc by exit_*. The Tagbody record + // itself is NOT popped — the tagbody stays live, ready for the + // next `go`. By the time we reach a match, the walk has already + // restored dynEnvStackGet to the state just after `entry` pushed + // its TagbodyDynEnv + barrier pair, so no re-push is needed. + vm._stackPointer = sp; + void* unwindFrame = uw.getFrame(); + bool matched = false; + while (vm._dynRecordTop > dyn_entry_mark) { + VMDynRecord* r = vm._dynRecordTop - 1; + if (r->kind == VMDynKind::Tagbody && r->frame == unwindFrame) { + pc = vm._pc; // target set by exit_* + sp = r->sp_mark; // body sees tagbody-entry stack state + fp = r->fp_mark; + matched = true; + break; + } + vm_pop_dyn_record(vm); + } + if (matched) continue; + throw; + } catch (CatchThrow& ct) { + // A matching catch may be in our record stack. Walk records from + // the top down, running cleanups for SpecialBind / Progv / + // UnwindProtect. When we find a Catch record whose tag matches, + // reset pc / sp / fp / dyn-env stack and resume dispatch via the + // enclosing while(true). If we reach the entry mark without a + // match, rethrow to let an outer scope handle it. + vm._stackPointer = sp; + T_sp throwTag = ct.getTag(); + bool matched = false; + while (vm._dynRecordTop > dyn_entry_mark) { + VMDynRecord* r = vm._dynRecordTop - 1; + if (r->kind == VMDynKind::Catch + && T_sp((gctools::Tagged)r->slot0) == throwTag) { + pc = r->target_pc; + sp = r->sp_mark; + fp = r->fp_mark; + my_thread->dynEnvStackSet(T_sp((gctools::Tagged)r->dynenv_mark)); + --vm._dynRecordTop; + matched = true; + break; + } + vm_pop_dyn_record(vm); + } + if (matched) continue; // re-enter try { dispatch } + throw; + } catch (...) { + // std exceptions and other foreign throws must still trigger + // unwind-protect cleanups on the way through — cleanup + rethrow + // only, no matching. + vm._stackPointer = sp; + while (vm._dynRecordTop > dyn_entry_mark) + vm_pop_dyn_record(vm); + throw; + } + } // while (true) — only reached on resume-after-match (future work). +#undef VM_CASE +#undef VM_NEXT +#ifdef USE_COMPUTED_GOTO +#undef USE_COMPUTED_GOTO +#endif } static unsigned char* long_dispatch(VirtualMachine& vm, unsigned char* pc, MultipleValues& multipleValues, T_O** literals, @@ -1516,6 +1826,7 @@ void VMFrameDynEnv_O::proceed() { VirtualMachine& vm = my_thread->_VM; vm._stackPointer = this->old_sp; vm._framePointer = this->old_fp; + vm._dynRecordTop = this->old_dyn_top; } }; // namespace core @@ -1559,17 +1870,24 @@ gctools::return_type bytecode_call(unsigned char* pc, core::T_O* lcc_closure, si // being unwound to. core::T_O** old_fp = vm._framePointer; core::T_O** old_sp = vm._stackPointer; - // Push the args and FP for debugging (see backtrace.cc) - // This is mildly wasteful of stack space, but when calling bytecode from - // non-bytecode the arguments won't be on the VM stack, so this is the - // best I got. + core::VMDynRecord* old_dyn_top = vm._dynRecordTop; + // Push (nargs, args, old_fp) onto the VM stack to build the bytecode VM + // frame chain. 
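// After those three pushes the frame layout is (offsets illustrative; the
// push/fp conventions may shift them by one slot):
//
//     fp[-1]  old_fp   <- chain link; NULL when entered from C
//     fp[-2]  args     <- caller-supplied argument vector
//     fp[-3]  nargs    <- fixnum-tagged argument count
//     fp[0] .. fp[nlocals-1]  locals for this activation
//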
Backtrace walks this chain via BYTECODE_FRAME_*_OFFSET to + // recover (pc, fp) for each parent bytecode frame; the saved fp at offset + // 0 also acts as the chain terminator (NULL when called from C, not + // bytecode). Without these pushes, only the topmost bytecode frame has + // its lexical bindings visible in the debugger — caller bytecode frames + // would have no way to find their VM frame pointer. + // The args themselves are also recoverable from the arena trampoline's + // C frame ([rbp-0x10]); the trampoline path is preferred for args because + // it works even for the topmost frame with no caller `call` opcode push. vm.push(vm._stackPointer, core::make_fixnum(lcc_nargs).raw_()); vm.push(vm._stackPointer, (core::T_O*)lcc_args); vm.push(vm._stackPointer, (core::T_O*)old_fp); core::T_O** fp = vm._framePointer = vm._stackPointer; core::T_O** sp = vm.push_frame(fp, nlocals); try { - gctools::StackAllocate frame(old_sp, old_fp); + gctools::StackAllocate frame(old_sp, old_fp, old_dyn_top); gctools::StackAllocate sa_ec(frame.asSmartPtr(), my_thread->dynEnvStackGet()); core::DynEnvPusher dep(my_thread, sa_ec.asSmartPtr()); gctools::return_type res = bytecode_vm(vm, literals, closed, closure, fp, sp, lcc_nargs, lcc_args); @@ -1581,6 +1899,14 @@ gctools::return_type bytecode_call(unsigned char* pc, core::T_O* lcc_closure, si } } +// Legacy-path hook retained for src/core/trampoline/trampoline.cc which is +// still compiled into libclasp (its IR is also embedded as global_trampoline +// for target-datalayout extraction). The arena-mode trampolines no longer +// indirect through this pointer — they call bytecode_call via an absolute +// address baked into the trampoline template by LLVM. Will be removed when +// the legacy LLVM trampoline path is deleted. +gctools::return_type (*g_bytecode_call_ptr)(unsigned char*, core::T_O*, size_t, core::T_O**) = bytecode_call; + }; // extern C namespace core { diff --git a/src/core/bytecode_compiler.cc b/src/core/bytecode_compiler.cc index 5dcbfdd202..d4b24b7c3b 100644 --- a/src/core/bytecode_compiler.cc +++ b/src/core/bytecode_compiler.cc @@ -7,6 +7,7 @@ #include // source info stuff #include // also for source info #include +#include // cmp__compile_trampoline #include #include // max diff --git a/src/core/commandLineOptions.cc b/src/core/commandLineOptions.cc index 876896ed89..92274cd0ed 100644 --- a/src/core/commandLineOptions.cc +++ b/src/core/commandLineOptions.cc @@ -107,6 +107,14 @@ const char* help = R"dx(Usage: clasp Seed the random number generator with -w, --wait Print the PID and wait for the user to hit a key + --ext:[=] + Reserved namespace for extension command-line options. The kernel + collects these verbatim and passes them to extension code via + core:extension-command-line-arguments. See ext:register-command-line-option. + Unknown - [value...] + Any unrecognized flag (anything starting with '-' that the kernel + doesn't know) is also collected for extensions, along with any + following non-flag tokens. If no extension claims it, startup errors. 
-- * Trailing not processed and are added to core:*command-line-arguments* @@ -394,6 +402,14 @@ void process_clasp_arguments(CommandLineOptions* options) { options->_LoadEvalList.push_back(pair(std::make_pair(cloScript, *++arg))); } else if (*arg == "-S" || *arg == "--seed") { options->_RandomNumberSeed = atoi((*++arg).c_str()); + } else if (arg->compare(0, 6, "--ext:") == 0) { + options->_ExtensionArguments.push_back(*arg); + } else if (!arg->empty() && (*arg)[0] == '-') { + options->_ExtensionArguments.push_back(*arg); + while (arg + 1 != end && !(arg + 1)->empty() && (*(arg + 1))[0] != '-') { + ++arg; + options->_ExtensionArguments.push_back(*arg); + } } else { fmt::print(std::cerr, "{}: unrecognized option '{}'\n", gctools::program_name(), *arg); exit(1); @@ -490,6 +506,15 @@ CL_DEFUN List_sp core__command_line_load_eval_sequence() { return cl__nreverse(loadEvals); } +DOCGROUP(clasp); +CL_DEFUN List_sp core__extension_command_line_arguments() { + List_sp result = nil(); + for (auto it = global_options->_ExtensionArguments.rbegin(); it != global_options->_ExtensionArguments.rend(); ++it) { + result = Cons_O::create(SimpleBaseString_O::make(*it), result); + } + return result; +} + void maybeHandleAddressesOption(CommandLineOptions* options) { if (options->_AddressesP) { FILE* fout = fopen(options->_AddressesFileName.c_str(), "w"); diff --git a/src/core/compiler.cc b/src/core/compiler.cc index 73a51179c0..ec109d61be 100644 --- a/src/core/compiler.cc +++ b/src/core/compiler.cc @@ -58,6 +58,7 @@ THE SOFTWARE. #include #include #include +#include #include #include #include @@ -354,43 +355,29 @@ CL_DEFUN T_sp core__startup_image_pathname(bool extension) { return pn; }; -int global_jit_pid = -1; -FILE* global_jit_log_stream = NULL; bool global_jit_log_symbols = false; +// Route LLVM-ORC per-symbol callbacks through the shared perf-PID.map +// writer in trampoline_arena.cc so arena trampoline entries and native- +// compiled symbols coexist in a single file. Without the shared writer, +// each side would fopen("w") and truncate the other's entries. void jit_register_symbol(const std::string& name, size_t size, void* address) { - WITH_READ_WRITE_LOCK(globals_->_JITLogMutex); - int gpid = getpid(); - if (global_jit_log_stream && (global_jit_pid != gpid)) { - fclose(global_jit_log_stream); - global_jit_log_stream = NULL; - global_jit_pid = -1; - } - if (global_jit_pid == -1) { - global_jit_pid = gpid; - stringstream filename; - filename << "/tmp/perf-" << gpid << ".map"; - global_jit_log_stream = fopen(filename.str().c_str(), "w"); - } - if (global_jit_log_stream) { - char nameBuffer[1024]; - char* namecur = nameBuffer; - char prevchar = ' '; - for (int i = 0; i < name.size() && i < 1023; i++) { - if (name[i] == '\r') - continue; - if (name[i] == '\n') - continue; - if (name[i] < 32 && name[i] == prevchar) - continue; - *namecur = name[i]; - prevchar = name[i]; - namecur++; - } - *namecur = '\0'; - fprintf(global_jit_log_stream, "%0lx %lx %s\n", (uintptr_t)address, size, nameBuffer); - fflush(global_jit_log_stream); + // Strip CR/LF and runs of other control chars from the LLVM symbol name + // before logging — some mangled names contain embedded whitespace/control + // characters that break the perf-PID.map line-oriented format. 
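// Worked example (hypothetical values): a JIT symbol reported as
// "core::foo\nbar" at 0x7f2a00001000 with size 0x4f is cleaned and appended
// via the shared writer as one well-formed map line,
//
//     7f2a00001000 4f core::foobar
//
// i.e. hex start, hex size, then the name to end of line: the same shape
// load_perf_map() in sampling_profiler.cc parses back out.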
+ char nameBuffer[1024]; + char* namecur = nameBuffer; + char prevchar = ' '; + for (int i = 0; i < (int)name.size() && (namecur - nameBuffer) < 1023; i++) { + if (name[i] == '\r') continue; + if (name[i] == '\n') continue; + if (name[i] < 32 && name[i] == prevchar) continue; + *namecur = name[i]; + prevchar = name[i]; + namecur++; } + *namecur = '\0'; + llvmo::perf_map_append((uint8_t*)address, size, std::string(nameBuffer)); } CL_DEFUN void core__jit_register_symbol(const std::string& name, size_t size, void* address) { diff --git a/src/core/cscript.lisp b/src/core/cscript.lisp index fc957543ed..be5435fe64 100644 --- a/src/core/cscript.lisp +++ b/src/core/cscript.lisp @@ -1,5 +1,6 @@ (k:sources :libclasp #~"dummy.cc" + #~"sampling_profiler.cc" #~"mpPackage.cc" #~"nativeVector.cc" #~"evaluator.cc" diff --git a/src/core/debugger.cc b/src/core/debugger.cc index b27e21f3e6..2fbee3463d 100644 --- a/src/core/debugger.cc +++ b/src/core/debugger.cc @@ -70,6 +70,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -299,6 +300,16 @@ bool library_with_name(const std::string& name, bool isExecutable, std::string& bool lookup_address(uintptr_t address, const char*& symbol, uintptr_t& start, uintptr_t& end) { void* ip = (void*)address; + // Arena trampoline slot: side table has the name and exact range. The + // returned `symbol` pointer aliases the side-table entry's std::string, + // which is stable for the process lifetime (the table is append-only and + // never deallocates entries). + if (const llvmo::TrampolineEntry* e = llvmo::arena_lookup_by_pc(address)) { + symbol = e->name.c_str(); + start = (uintptr_t)e->code_start; + end = start + e->code_size; + return true; + } T_sp of = llvmo::only_object_file_for_instruction_pointer(ip); if (!gc::IsA(of)) return false; // no ofi found diff --git a/src/core/funcallableInstance.cc b/src/core/funcallableInstance.cc index 9ae94c2e3d..112623bd3f 100644 --- a/src/core/funcallableInstance.cc +++ b/src/core/funcallableInstance.cc @@ -44,6 +44,11 @@ THE SOFTWARE. #include #include #include +#include +#include // cmp__compile_gf_trampoline +#include +#include +#include #include #include #include @@ -482,6 +487,16 @@ struct GFBytecodeEntryPoint { } }; +// Exported pointer to GFBytecodeEntryPoint::entry_point_n so llvmoPackage.cc +// can embed this address in the generic-function arena trampoline template +// (the struct itself is local to this translation unit). Static-initialized +// before any GF dispatcher is created, so the value is stable by the time +// the arena init reads it. 
+extern "C" { +gctools::return_type (*g_gf_dispatch_entry_point_n)(T_O*, size_t, T_O**) = + &GFBytecodeEntryPoint::entry_point_n; +} + GFBytecodeSimpleFun_O::GFBytecodeSimpleFun_O(FunctionDescription_sp fdesc, unsigned int entryPcN, SimpleVector_byte8_t_sp bytecode, SimpleVector_sp literals, Function_sp generic_function, size_t specialized_length) : SimpleFun_O(fdesc, nil(), XepStereotype(specialized_length)), @@ -501,6 +516,21 @@ GFBytecodeSimpleFun_sp GFBytecodeSimpleFun_O::make(Function_sp generic_function) SimpleVector_sp literals = gc::As(mv.third(compiled.number_of_values())); size_t specialized_length = mv.fourth(compiled.number_of_values()).unsafe_fixnum(); auto obj = gctools::GC::allocate(fdesc, 0, bytecode, literals, generic_function, specialized_length); + + // Replace _EntryPoints[0] (initially set by XepStereotype to the static + // entry_point_n, which all GFs would share) with a per-GF arena trampoline + // so flame charts / perf-PID.map show the specific GF name. The trampoline + // tail-calls the same entry_point_n so dispatch semantics are unchanged. + Pointer_sp gf_tramp = llvmo::cmp__compile_gf_trampoline(name); + obj->_EntryPoints._EntryPoints[0] = (ClaspXepAnonymousFunction)gf_tramp->ptr(); + + // Register on the global list so the post-snapshot-load pass can walk + // every GFBytecodeSimpleFun and re-attach an arena trampoline. Atomic + // CAS-push to be safe under concurrent dispatch compilation. + T_sp old = _lisp->_Roots._AllGFBytecodeFuns.load(std::memory_order_relaxed); + Cons_sp newc = Cons_O::create(obj, old); + while (!_lisp->_Roots._AllGFBytecodeFuns.compare_exchange_weak(old, newc, std::memory_order_relaxed)) + newc->setCdr(old); return obj; } @@ -522,6 +552,24 @@ std::string GFBytecodeSimpleFun_O::__repr__() const { } void GFBytecodeSimpleFun_O::fixupInternalsForSnapshotSaveLoad(snapshotSaveLoad::Fixup* fixup) { + // _EntryPoints[0] points at a per-GF arena trampoline (set by make()). + // The arena slot's address won't survive a restart, so on save substitute + // the static GFBytecodeEntryPoint::entry_point_n forwarder address — that's + // a libclasp symbol the encoder can serialize. The post-load regen pass + // (arena_post_load_regenerate_trampolines) walks _AllGFBytecodeFuns and + // re-installs a fresh per-GF arena trampoline. + if (snapshotSaveLoad::operation(fixup) == snapshotSaveLoad::SaveOp + && llvmo::arena_owns_pc((uintptr_t)this->_EntryPoints._EntryPoints[0].load(std::memory_order_relaxed))) { + this->_EntryPoints._EntryPoints[0].store( + (ClaspXepAnonymousFunction)g_gf_dispatch_entry_point_n, std::memory_order_relaxed); + } + // _Trampoline isn't actually invoked at runtime for GFBytecodeSimpleFun + // (dispatch goes through entry_point_n via _EntryPoints[0]), but keep the + // substitution as defense-in-depth in case anything ever reads the field. + if (snapshotSaveLoad::operation(fixup) == snapshotSaveLoad::SaveOp + && llvmo::arena_owns_pc((uintptr_t)this->_Trampoline)) { + this->_Trampoline = (BytecodeTrampolineFunction)bytecode_call; + } this->fixupOneCodePointer(fixup, (void**)&this->_Trampoline); this->Base::fixupInternalsForSnapshotSaveLoad(fixup); } diff --git a/src/core/function.cc b/src/core/function.cc index 9e636128a2..7201d8baaf 100644 --- a/src/core/function.cc +++ b/src/core/function.cc @@ -49,6 +49,7 @@ THE SOFTWARE. 
#include #include #include +#include namespace core { @@ -86,8 +87,13 @@ bool SimpleFun_O::dladdrablep(std::set& uniques) { for (size_t ii = 0; ii < ClaspXepFunction::Entries; ++ii) { void* address = (void*)this->_EntryPoints._EntryPoints[ii].load(std::memory_order_relaxed); if (!uniques.contains(address)) { - Dl_info info; uniques.insert(address); + // Entry points that land in an arena trampoline slot are not + // backed by any loaded image, so dladdr necessarily fails on + // them. The arena's side table keeps their names, which is the + // symbolic-attainability this check is really about — accept. + if (llvmo::arena_owns_pc((uintptr_t)address)) continue; + Dl_info info; if (dladdr(address, &info) == 0) return false; } @@ -153,6 +159,24 @@ void BytecodeSimpleFun_O::fixupInternalsForSnapshotSaveLoad(snapshotSaveLoad::Fi // function pointers installed, and that they're properly relative // to the native ObjectFile. T_sp code = entryPoint()->_Code; + // Arena trampolines live in mmap'd pages whose addresses won't exist after + // a restart, so they can't be encoded as a stable library/code reference. + // Substitute the default bytecode_call (a libclasp symbol the encoder can + // handle) before the standard fixup runs; the post-load pass in llvmo + // (arena_post_load_regenerate_trampolines) re-attaches a fresh arena + // trampoline if the user is running with the arena backend. + if (snapshotSaveLoad::operation(fixup) == snapshotSaveLoad::SaveOp + && llvmo::arena_owns_pc((uintptr_t)this->_Trampoline)) { + static std::atomic s_subbed{0}; + size_t n = s_subbed.fetch_add(1) + 1; + if (n <= 3 || (n % 10000) == 0) { + fprintf(stderr, + "[trampoline-arena] save-side substitute #%zu '%s' was %p -> bytecode_call\n", + n, _rep_(this->functionName()).c_str(), (void*)this->_Trampoline); + fflush(stderr); + } + this->_Trampoline = (BytecodeTrampolineFunction)bytecode_call; + } this->fixupOneCodePointer(fixup, (void**)&this->_Trampoline, code); for (size_t ii = 0; ii < ClaspXepFunction::Entries; ++ii) this->fixupOneCodePointer(fixup, (void**)&this->_EntryPoints._EntryPoints[ii], code); diff --git a/src/core/lisp.cc b/src/core/lisp.cc index b4a1c7c383..f6ccb9936f 100644 --- a/src/core/lisp.cc +++ b/src/core/lisp.cc @@ -191,7 +191,7 @@ struct FindApropos : public KeyValueMapper //, public gctools::StackRoot // Lisp::GCRoots::GCRoots() : _ClaspJIT(nil()), _AllObjectFiles(nil()), _AllCodeBlocks(nil()), _AllLibraries(nil()), - _AllBytecodeModules(nil()), + _AllBytecodeModules(nil()), _AllGFBytecodeFuns(nil()), #ifdef CLASP_THREADS _UnboundCellFunctionEntryPoint(unbound()), _ActiveThreads(nil()), _DefaultSpecialBindings(nil()), #endif diff --git a/src/core/loadltv.cc b/src/core/loadltv.cc index d137f7e1f8..feec4349d5 100644 --- a/src/core/loadltv.cc +++ b/src/core/loadltv.cc @@ -17,7 +17,8 @@ #include // making packages #include // making pathnames #include // cl__truename -#include // cmp__compile_trampoline +#include +#include // cmp__compile_trampoline #include // native module stuff #include #include @@ -725,8 +726,8 @@ struct loadltv { uint16_t nclosed = read_u16(); BytecodeModule_sp module = gc::As(get_ltv(read_index())); FunctionDescription_sp fdesc = makeFunctionDescription(nil(), nil(), nil(), nil(), nil(), -1, -1, -1); - BytecodeSimpleFun_sp fun = core__makeBytecodeSimpleFun(fdesc, module, nlocals, nclosed, entry_point, final_size, - llvmo::cmp__compile_trampoline(nil())); + core::Pointer_sp tramp = llvmo::cmp__compile_trampoline(nil()); + BytecodeSimpleFun_sp fun = 
core__makeBytecodeSimpleFun(fdesc, module, nlocals, nclosed, entry_point, final_size, tramp);
     set_ltv(fun, index);
   }
@@ -812,8 +813,10 @@ struct loadltv {
     if (gc::IsA(named)) {
       Function_sp fun = gc::As_unsafe(named);
       fun->setf_functionName(name);
-      if (gc::IsA(named))
-        gc::As_unsafe(named)->set_trampoline(llvmo::cmp__compile_trampoline(name));
+      if (gc::IsA(named)) {
+        core::Pointer_sp tramp = llvmo::cmp__compile_trampoline(name);
+        gc::As_unsafe(named)->set_trampoline(tramp);
+      }
     }
   }
diff --git a/src/core/mpPackage.cc b/src/core/mpPackage.cc
index 51270c71a2..c00123adc7 100644
--- a/src/core/mpPackage.cc
+++ b/src/core/mpPackage.cc
@@ -44,6 +44,7 @@ THE SOFTWARE.
 #include
 #include
 #include
+#include

 extern "C" {
 void mutex_lock_enter(char* nameword) { (void)0; };
@@ -180,6 +181,11 @@ void Process_O::run(void* cold_end_of_stack) {
   //  my_thread->create_sigaltstack();
   _ThreadInfo = my_thread;

+  // Register with the sampling profiler so SIGPROF samples on this
+  // thread produce full frame-walked stacks rather than leaf-only.
+  // Safe no-op if the profiler is not running.
+  core::sampling_profiler_register_current_thread();
+
   // We're ready to run Lisp
   runInner(core::cl__reverse(_InitialSpecialBindings));
diff --git a/src/core/sampling_profiler.cc b/src/core/sampling_profiler.cc
new file mode 100644
index 0000000000..0fc3af3fae
--- /dev/null
+++ b/src/core/sampling_profiler.cc
@@ -0,0 +1,718 @@
+/*
+ * sampling_profiler.cc — CPU-time sampling profiler.
+ *
+ * Built in phases, all of which are now present in this file:
+ * - Phase 1: a single process-wide ring buffer (bump pointer,
+ *   drop-on-full), an async-signal-safe SIGPROF handler, and ITIMER_PROF
+ *   driving it at the requested rate. Portable between Linux and
+ *   macOS — both support setitimer(ITIMER_PROF). Start / stop / reset /
+ *   save / diagnostics are exposed to Lisp via CL_DEFUN.
+ * - Phase 2: frame-pointer walking in the handler, bounded by per-thread
+ *   registered stack bounds (leaf-only samples for unregistered threads).
+ * - Phase 4: post-mortem symbolication via the trampoline-arena side
+ *   table, the perf-PID.map index, and dladdr.
+ * - Phase 5: collapsed-stacks aggregation, so save emits
+ *   flamegraph.pl-ready output rather than hex-only records.
+ *
+ * The existing clasp signal infrastructure (src/gctools/interrupt.cc)
+ * installs its own SIGPROF handler. We save it at profile-start and
+ * restore it at profile-stop; when not profiling clasp's original
+ * dispatch is untouched.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _TARGET_OS_DARWIN
+#define _XOPEN_SOURCE 600
+#include
+#else
+#include
+#endif
+#if defined(__linux__)
+# include
+#endif
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include // arena_lookup_by_pc
+
+namespace core {
+
+namespace {
+
+// ---------------------------------------------------------------------------
+// State (file-static). The profiler is process-wide, so a single set of
+// globals is correct. Load/store discipline must be async-signal-safe for
+// anything the SIGPROF handler touches.
+// ---------------------------------------------------------------------------
+
+std::atomic g_running{false};      // handler fast-exits if false
+uint8_t* g_buffer = nullptr;       // bump region (mmap'd)
+size_t g_buffer_bytes = 0;
+std::atomic g_write_offset{0};     // next free byte in g_buffer
+unsigned g_max_depth = 8192;
+std::atomic g_samples_recorded{0};
+std::atomic g_samples_dropped{0};
+
+struct sigaction g_prev_sigaction; // clasp's original SIGPROF handler
+bool g_prev_sigaction_saved = false;
+
+std::mutex g_lifecycle_lock;       // serializes start/stop/save/reset
+
+// Read CLOCK_MONOTONIC in nanoseconds. clock_gettime is async-signal-safe
+// on Linux and macOS for CLOCK_MONOTONIC.
+static inline uint64_t now_ns_signal_safe() {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
+}
+
+// Read the interrupted instruction pointer out of the context structure.
+// x86_64 only for now — portable to arm64 when we need it.
+static inline uintptr_t ucontext_rip(void* ucptr) {
+#if defined(__x86_64__)
+# if defined(__linux__)
+  ucontext_t* uc = (ucontext_t*)ucptr;
+  return (uintptr_t)uc->uc_mcontext.gregs[REG_RIP];
+# elif defined(__APPLE__)
+  ucontext_t* uc = (ucontext_t*)ucptr;
+  return (uintptr_t)uc->uc_mcontext->__ss.__rip;
+# else
+  (void)ucptr; return 0;
+# endif
+#else
+  (void)ucptr; return 0; // TODO: arm64 x29+x30
+#endif
+}
+
+// Read the frame-base-pointer register (rbp) out of the ucontext.
+static inline uintptr_t ucontext_rbp(void* ucptr) {
+#if defined(__x86_64__)
+# if defined(__linux__)
+  ucontext_t* uc = (ucontext_t*)ucptr;
+  return (uintptr_t)uc->uc_mcontext.gregs[REG_RBP];
+# elif defined(__APPLE__)
+  ucontext_t* uc = (ucontext_t*)ucptr;
+  return (uintptr_t)uc->uc_mcontext->__ss.__rbp;
+# else
+  (void)ucptr; return 0;
+# endif
+#else
+  (void)ucptr; return 0; // TODO: arm64 x29
+#endif
+}
+
+// ---------------------------------------------------------------------------
+// Per-thread stack-bounds cache. Used by the walker to bound frame-pointer
+// chasing: any rbp outside [stack_lo, stack_hi) is treated as end-of-stack.
+// The sources are pthread_getattr_np + pthread_attr_getstack on Linux and
+// pthread_get_stackaddr_np + pthread_get_stacksize_np on macOS. Neither
+// path is guaranteed async-signal-safe (glibc's pthread_getattr_np can
+// malloc), so the cache is populated only from safe, non-handler contexts:
+// sampling_profiler_start does it for the calling thread, and every other
+// Lisp thread opts in once via sampling_profiler_register_current_thread
+// (ext:profile-register-thread). The handler itself never populates the
+// cache; if it fires on a thread with no bounds it records a leaf-only
+// sample instead (see the warning at the top of sigprof_handler).
+// ---------------------------------------------------------------------------
+
+struct ThreadStackBounds {
+  uintptr_t lo;
+  uintptr_t hi;
+  bool populated;
+};
+
+thread_local ThreadStackBounds t_stack_bounds{0, 0, false};
+
+static void populate_stack_bounds_for_this_thread() {
+  if (t_stack_bounds.populated) return;
+#if defined(__linux__)
+  pthread_attr_t attr;
+  if (pthread_getattr_np(pthread_self(), &attr) != 0) return;
+  void* addr = nullptr;
+  size_t size = 0;
+  pthread_attr_getstack(&attr, &addr, &size);
+  pthread_attr_destroy(&attr);
+  t_stack_bounds.lo = (uintptr_t)addr;
+  t_stack_bounds.hi = (uintptr_t)addr + size;
+#elif defined(__APPLE__)
+  void* top = pthread_get_stackaddr_np(pthread_self()); // top (high addr)
+  size_t size = pthread_get_stacksize_np(pthread_self());
+  t_stack_bounds.hi = (uintptr_t)top;
+  t_stack_bounds.lo = (uintptr_t)top - size;
+#else
+  return;
+#endif
+  t_stack_bounds.populated = true;
+}
+
+// Validate an rbp candidate: word-aligned and inside the current thread's
+// stack range. The walker terminates as soon as this check fails.
+static inline bool plausible_rbp(uintptr_t rbp) {
+  if ((rbp & 7) != 0) return false;
+  if (!t_stack_bounds.populated) return false;
+  return rbp >= t_stack_bounds.lo && rbp + 16 <= t_stack_bounds.hi;
+}
+
+// Walk the frame-pointer chain starting at (rip, rbp) and fill `out` with
+// up to `max_depth` native PCs. Returns the number of frames recorded.
+// Terminates on: out-of-stack-range rbp, null saved rbp, zero saved rip,
+// non-advancing rbp, or max_depth.
+//
+// Safety: only register reads, a bounded pointer walk with plausibility
+// checks, and writes to the caller-supplied out buffer. No libc calls,
+// no allocation, no locks.
+static uint32_t walk_fp(uintptr_t rip_top, uintptr_t rbp_top,
+                        uint64_t* out, uint32_t max_depth) {
+  if (max_depth == 0) return 0;
+  uint32_t d = 0;
+  out[d++] = (uint64_t)rip_top;
+  uintptr_t rbp = rbp_top;
+  while (d < max_depth && plausible_rbp(rbp)) {
+    uintptr_t saved_rbp = *((uintptr_t*)rbp);
+    uintptr_t saved_rip = *((uintptr_t*)(rbp + 8));
+    if (saved_rip == 0) break;
+    out[d++] = (uint64_t)saved_rip;
+    // In stack-grows-down SysV ABI the caller's rbp lives at a higher
+    // address than the callee's. A non-advancing (or going-down) saved_rbp
+    // means the chain is broken or we hit a leaf without a frame.
+    if (saved_rbp <= rbp) break;
+    rbp = saved_rbp;
+  }
+  return d;
+}
+
+// Reserve `bytes` from the bump buffer. Returns nullptr when the buffer is
+// full — the caller increments the drop counter. Async-signal-safe: single
+// CAS loop on a plain atomic counter, no allocation, no libc.
+static inline uint8_t* ring_reserve(size_t bytes) {
+  size_t cur = g_write_offset.load(std::memory_order_relaxed);
+  for (;;) {
+    size_t next = cur + bytes;
+    if (next > g_buffer_bytes) return nullptr;
+    if (g_write_offset.compare_exchange_weak(cur, next,
+                                             std::memory_order_acq_rel,
+                                             std::memory_order_relaxed)) {
+      return g_buffer + cur;
+    }
+    // cur was updated by CAS failure; retry.
+  }
+}
+
+// SIGPROF handler. Runs on an arbitrary Lisp thread at signal-delivery time.
+// Must be async-signal-safe end-to-end.
+//
+// Strategy: walk the frame-pointer chain into a small on-stack buffer with
+// plausibility checks, then reserve exactly the right number of bytes in
+// the ring and copy the result in. This avoids over-reserving or needing
+// a two-step reserve/commit protocol.
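// Net effect on the ring: a dense run of variable-length records
// (values hypothetical):
//
//     [SampleHeader ts tid=7 depth=3][pc0][pc1][pc2]
//     [SampleHeader ts tid=7 depth=1][pc0]
//     ...
//
// The reader in sampling_profiler_symbolicated_samples() later re-walks
// exactly this layout with a sizeof(SampleHeader) + depth * 8 stride.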
+static void sigprof_handler(int /*sig*/, siginfo_t* /*info*/, void* ucptr) { + if (!g_running.load(std::memory_order_acquire)) return; + + // NEVER call pthread_getattr_np (or anything that can malloc) from a + // signal handler. On glibc pthread_getattr_np calls malloc, and if the + // interrupted thread was already inside malloc holding the glibc arena + // lock, the handler's malloc call deadlocks waiting for the same lock. + // Observed concretely under SLIME+compile: __cxa_allocate_exception + // from sjlj_unwind holds the arena lock when SIGPROF fires. + // + // Stack bounds must be populated before the thread ever receives a + // sample: in sampling_profiler_start for the calling thread, and via + // ext:profile-register-thread for any other Lisp thread. If bounds + // are not populated we fall back to leaf-only sampling, which is a + // usable data point and never deadlocks. + + uintptr_t rip = ucontext_rip(ucptr); + uintptr_t rbp = ucontext_rbp(ucptr); + + // Walk into a stack-local buffer. 8K worst case = 64 KiB, on a typical + // 8 MiB stack that's fine; samples with shorter stacks don't waste the + // ring buffer because we use the actual depth when reserving. + uint64_t pcs[8192]; + uint32_t cap = g_max_depth; + if (cap > 8192) cap = 8192; + uint32_t depth; + if (t_stack_bounds.populated) { + depth = walk_fp(rip, rbp, pcs, cap); + } else { + pcs[0] = (uint64_t)rip; + depth = 1; + } + + const size_t record_bytes = sizeof(SampleHeader) + depth * sizeof(uint64_t); + uint8_t* slot = ring_reserve(record_bytes); + if (!slot) { + g_samples_dropped.fetch_add(1, std::memory_order_relaxed); + return; + } + + SampleHeader* h = (SampleHeader*)slot; + h->timestamp_ns = now_ns_signal_safe(); + h->vm_pc = 0; // TODO: Phase 2b — capture my_thread->_VM._pc if in bytecode_vm +#if defined(__linux__) + h->thread_id = (uint32_t)syscall(SYS_gettid); +#else + h->thread_id = 0; // TODO: pthread_mach_thread_np on macOS +#endif + h->depth = depth; + std::memcpy(slot + sizeof(SampleHeader), pcs, depth * sizeof(uint64_t)); + + g_samples_recorded.fetch_add(1, std::memory_order_relaxed); +} + +// --------------------------------------------------------------------------- +// Timer control. +// --------------------------------------------------------------------------- + +static bool install_sigaction() { + struct sigaction sa; + std::memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = &sigprof_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO | SA_RESTART; + if (sigaction(SIGPROF, &sa, &g_prev_sigaction) != 0) { + fprintf(stderr, "[sampling-profiler] sigaction failed: %s\n", strerror(errno)); + return false; + } + g_prev_sigaction_saved = true; + return true; +} + +static void restore_sigaction() { + if (!g_prev_sigaction_saved) return; + sigaction(SIGPROF, &g_prev_sigaction, nullptr); + g_prev_sigaction_saved = false; +} + +static bool arm_timer(unsigned rate_hz) { + struct itimerval it; + // Interval chosen so the first tick arrives promptly rather than after + // a full period (value = it_interval = 1/rate). 
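// Worked numbers for the default: rate_hz = 97 (the Lisp-side default)
// gives 1000000 / 97 = 10309 us per tick, about 10.31 ms; a deliberately
// off-100Hz rate, the usual trick to avoid sampling in lockstep with
// 10 ms-periodic work.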
+ long usec = 1000000L / (long)rate_hz; + if (usec < 1) usec = 1; + it.it_interval.tv_sec = 0; + it.it_interval.tv_usec = usec; + it.it_value = it.it_interval; + if (setitimer(ITIMER_PROF, &it, nullptr) != 0) { + fprintf(stderr, "[sampling-profiler] setitimer failed: %s\n", strerror(errno)); + return false; + } + return true; +} + +static void disarm_timer() { + struct itimerval it; + std::memset(&it, 0, sizeof(it)); + setitimer(ITIMER_PROF, &it, nullptr); +} + +} // anonymous namespace + +// --------------------------------------------------------------------------- +// Public API. +// --------------------------------------------------------------------------- + +bool sampling_profiler_running() { + return g_running.load(std::memory_order_acquire); +} + +bool sampling_profiler_start(unsigned rate_hz, unsigned max_depth, size_t buffer_bytes) { + std::lock_guard g(g_lifecycle_lock); + if (g_running.load(std::memory_order_acquire)) { + fprintf(stderr, "[sampling-profiler] start: already running\n"); + return false; + } + + if (rate_hz < 1) rate_hz = 1; + if (rate_hz > 10000) rate_hz = 10000; + if (max_depth < 1) max_depth = 1; + if (max_depth > 8192) max_depth = 8192; + if (buffer_bytes == 0) buffer_bytes = 256ull * 1024ull * 1024ull; // 256 MiB + + // (Re-)allocate the ring buffer if size changed or first use. + if (!g_buffer || g_buffer_bytes != buffer_bytes) { + if (g_buffer) { + munmap(g_buffer, g_buffer_bytes); + g_buffer = nullptr; + g_buffer_bytes = 0; + } + void* p = mmap(nullptr, buffer_bytes, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + fprintf(stderr, "[sampling-profiler] mmap(%zu) failed: %s\n", + buffer_bytes, strerror(errno)); + return false; + } + g_buffer = (uint8_t*)p; + g_buffer_bytes = buffer_bytes; + } + g_write_offset.store(0, std::memory_order_release); + g_samples_recorded.store(0, std::memory_order_release); + g_samples_dropped.store(0, std::memory_order_release); + g_max_depth = max_depth; + + if (!install_sigaction()) return false; + // Populate this thread's stack bounds now, from a safe context, before + // any sample can fire. pthread_getattr_np is not async-signal-safe. + populate_stack_bounds_for_this_thread(); + // Publish running=true BEFORE arming the timer so the first tick sees + // it. Release ordering pairs with the handler's acquire load. + g_running.store(true, std::memory_order_release); + if (!arm_timer(rate_hz)) { + g_running.store(false, std::memory_order_release); + restore_sigaction(); + return false; + } + +#if 0 + fprintf(stderr, + "[sampling-profiler] started: rate=%u Hz max_depth=%u buffer=%zu MiB\n", + rate_hz, max_depth, buffer_bytes / (1024 * 1024)); + fflush(stderr); +#endif + return true; +} + +void sampling_profiler_stop() { + std::lock_guard g(g_lifecycle_lock); + if (!g_running.load(std::memory_order_acquire)) return; + disarm_timer(); + // Flip running=false BEFORE restoring the handler — any in-flight + // handler invocation will see the flag and fast-exit; the + // setitimer-disarm above prevents new signals. 
+ g_running.store(false, std::memory_order_release); + restore_sigaction(); +#if 0 + fprintf(stderr, + "[sampling-profiler] stopped: %lu samples recorded, %lu dropped, %zu/%zu bytes used\n", + (unsigned long)g_samples_recorded.load(), + (unsigned long)g_samples_dropped.load(), + g_write_offset.load(), g_buffer_bytes); + fflush(stderr); +#endif +} + +void sampling_profiler_reset() { + std::lock_guard g(g_lifecycle_lock); + g_write_offset.store(0, std::memory_order_release); + g_samples_recorded.store(0, std::memory_order_release); + g_samples_dropped.store(0, std::memory_order_release); +} + +// --------------------------------------------------------------------------- +// Symbolication (Phase 4) + collapsed-stacks aggregation (Phase 5). +// --------------------------------------------------------------------------- + +namespace { + +// Sanitize a symbol name for collapsed-stacks output. flamegraph.pl uses +// ';' as the frame separator and treats the trailing token as a numeric +// count — so any ';' or whitespace in a Lisp symbol name (e.g. +// "FOO; BAR" or "(SETF X)") would corrupt the line. Replace them with '_'. +static std::string sanitize_frame(const std::string& s) { + std::string out; out.reserve(s.size()); + for (char c : s) { + if (c == ';' || c == ' ' || c == '\t' || c == '\n' || c == '\r') + out += '_'; + else + out += c; + } + return out; +} + +// Per-process JIT-symbol index built from /tmp/perf-.map. Clasp's +// trampoline arena and the LLVM-ORC post-link callback both write +// lines to this file as code is generated; +// we read the current state at symbolication time. Sorted by start +// address so lookup is O(log N). +struct PerfMapEntry { + uint64_t start; + uint64_t size; // 0 bumped to 1 so single-byte stubs cover their own PC + std::string name; +}; + +static std::vector load_perf_map() { + std::vector out; + char path[64]; + snprintf(path, sizeof path, "/tmp/perf-%d.map", getpid()); + FILE* fp = fopen(path, "r"); + if (!fp) return out; + char line[2048]; + while (fgets(line, sizeof line, fp)) { + uint64_t addr = 0, size = 0; + char name[1024] = {0}; + if (sscanf(line, "%lx %lx %1023[^\n]", &addr, &size, name) >= 3) { + out.push_back({addr, size ? size : 1, std::string(name)}); + } + } + fclose(fp); + std::sort(out.begin(), out.end(), + [](const PerfMapEntry& a, const PerfMapEntry& b) { + return a.start < b.start; + }); + return out; +} + +static const PerfMapEntry* +perf_map_lookup(const std::vector& idx, uint64_t pc) { + auto it = std::upper_bound(idx.begin(), idx.end(), pc, + [](uint64_t p, const PerfMapEntry& e) { + return p < e.start; + }); + if (it == idx.begin()) return nullptr; + --it; + if (pc >= it->start && pc < it->start + it->size) return &*it; + return nullptr; +} + +// Resolve a PC to a human-readable frame name. Lookup order: +// 1. Trampoline arenas (bytecode + GF) — O(log N) or O(N) side-table scan. +// 2. perf-map — Clasp-JIT'd native code (both bytecode trampolines and +// ORC-JIT-linked ObjectFile symbols end up here). +// 3. dladdr — covers libclasp, libc, libLLVM, other shared objects. +// 4. Hex fallback. +// +// Cache results: the same PC reappears in many samples (especially the +// bytecode VM's inner-loop return address) and dladdr isn't free. 
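// A worked example of the upper_bound lookup below (hypothetical entries,
// already sorted by start):
//
//     idx = [ {0x1000, 0x80, "A"}, {0x2000, 0x40, "B"} ]
//
//     pc 0x1010: upper_bound yields "B", step back to "A",
//                0x1010 < 0x1000 + 0x80             -> "A"
//     pc 0x1f00: step back to "A", 0x1f00 >= 0x1080 -> nullptr (a gap)
//     pc 0x2000: exactly "B"'s start                -> "B" (end exclusive)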
+static std::string symbolicate_one(uint64_t pc, + std::unordered_map& cache, + const std::vector& perf_map) { + auto it = cache.find(pc); + if (it != cache.end()) return it->second; + + std::string name; + if (const llvmo::TrampolineEntry* e = llvmo::arena_lookup_by_pc((uintptr_t)pc)) { + name = e->name; + } else if (const PerfMapEntry* p = perf_map_lookup(perf_map, pc)) { + name = p->name; + } else { + Dl_info info; + if (dladdr((void*)(uintptr_t)pc, &info) && info.dli_sname && info.dli_sname[0]) { + name = info.dli_sname; + } else { + char buf[32]; + snprintf(buf, sizeof buf, "0x%lx", (unsigned long)pc); + name = buf; + } + } + name = sanitize_frame(name); + cache.emplace(pc, name); + return name; +} + +} // anonymous namespace + +std::vector sampling_profiler_symbolicated_samples() { + std::lock_guard g(g_lifecycle_lock); + std::vector out; + if (g_running.load(std::memory_order_acquire)) { + fprintf(stderr, "[sampling-profiler] symbolicated-samples: stop the profiler first\n"); + return out; + } + if (!g_buffer || g_write_offset.load() == 0) return out; + + std::unordered_map sym_cache; + std::vector perf_map = load_perf_map(); + // Dedup by (thread_id, joined frames). Value is an index into `out`. + std::unordered_map group_index; + + size_t end = g_write_offset.load(); + size_t off = 0; + while (off + sizeof(SampleHeader) <= end) { + SampleHeader* h = (SampleHeader*)(g_buffer + off); + size_t record_bytes = sizeof(SampleHeader) + h->depth * sizeof(uint64_t); + if (off + record_bytes > end) break; + uint64_t* pcs = (uint64_t*)(g_buffer + off + sizeof(SampleHeader)); + + std::vector frames; + frames.reserve(h->depth); + for (uint32_t i = h->depth; i-- > 0; ) { + frames.push_back(symbolicate_one(pcs[i], sym_cache, perf_map)); + } + // Build dedup key: "tid|f0;f1;...;fN". + std::string key; + { + char tidbuf[16]; + int n = snprintf(tidbuf, sizeof tidbuf, "%u|", (unsigned)h->thread_id); + key.append(tidbuf, n); + } + for (const auto& f : frames) { key += ';'; key += f; } + + auto it = group_index.find(key); + if (it == group_index.end()) { + SymbolicatedSample s; + s.thread_id = h->thread_id; + s.sample_count = 1; + s.frames = std::move(frames); + group_index.emplace(std::move(key), out.size()); + out.push_back(std::move(s)); + } else { + out[it->second].sample_count++; + } + off += record_bytes; + } + return out; +} + +bool sampling_profiler_save(const char* path) { + auto groups = sampling_profiler_symbolicated_samples(); + if (groups.empty()) { + fprintf(stderr, "[sampling-profiler] save: no samples available\n"); + return false; + } + + FILE* fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "[sampling-profiler] save: fopen(%s) failed: %s\n", path, strerror(errno)); + return false; + } + + // Collapsed-stacks output for flamegraph.pl: + // frame_root;frame_mid;...;frame_leaf \n + // The flamegraph format has no thread dimension, so we collapse + // same-frames groups across threads by summing sample_count. 
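// Concretely (hypothetical frames): 17 samples sharing the stack
// main -> core::bytecode_vm -> MY-FN collapse to the single output line
//
//     main;core::bytecode_vm;MY-FN 17
//
// sanitize_frame() already replaced ';' and whitespace inside each frame
// name, so the separators and the trailing count parse unambiguously.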
+  std::unordered_map<std::string, size_t> counts;
+  size_t total_samples = 0;
+  for (const auto& g : groups) {
+    std::string key;
+    size_t est = 0;
+    for (const auto& f : g.frames) est += f.size() + 1;
+    key.reserve(est);
+    for (const auto& f : g.frames) {
+      if (!key.empty()) key += ';';
+      key += f;
+    }
+    counts[key] += g.sample_count;
+    total_samples += g.sample_count;
+  }
+
+  for (const auto& kv : counts) {
+    fprintf(fp, "%s %zu\n", kv.first.c_str(), kv.second);
+  }
+  fclose(fp);
+  fprintf(stderr,
+          "[sampling-profiler] wrote %zu samples (%zu unique stacks) to %s\n",
+          total_samples, counts.size(), path);
+  return true;
+}
+
+size_t sampling_profiler_samples_recorded() { return g_samples_recorded.load(); }
+size_t sampling_profiler_samples_dropped() { return g_samples_dropped.load(); }
+size_t sampling_profiler_bytes_used() { return g_write_offset.load(); }
+
+void sampling_profiler_register_current_thread() {
+  populate_stack_bounds_for_this_thread();
+}
+
+core::T_sp SymbolicatedSample::encode() {
+  core::SimpleVector_sp sample = core::SimpleVector_O::make(3);
+  (*sample)[0] = clasp_make_fixnum(this->thread_id);
+  (*sample)[1] = clasp_make_fixnum(this->sample_count);
+  core::SimpleVector_sp frames = core::SimpleVector_O::make(this->frames.size());
+  size_t idx = 0;
+  for ( auto& fr : this->frames ) {
+    (*frames)[idx++] = core::SimpleBaseString_O::make(fr);
+  }
+  (*sample)[2] = frames;
+  return sample;
+}
+
+
+// ---------------------------------------------------------------------------
+// Lisp bindings.
+// ---------------------------------------------------------------------------
+
+CL_DOCSTRING(R"dx(Start the sampling profiler.
+Args:
+  rate         : sampling rate in Hz (default 97). Clamped to [1, 10000].
+  max-depth    : per-sample stack cap (default 8192). Clamped to [1, 8192].
+  buffer-bytes : ring size in bytes, 0 = 256 MiB default.
+Returns T on success, NIL if already running or setup failed.)dx");
+CL_LAMBDA(&key (rate 97) (max-depth 8192) (buffer-bytes 0));
+DOCGROUP(clasp);
+CL_DEFUN bool ext__profile_start(uint rate, uint max_depth, size_t buffer_bytes) {
+  return sampling_profiler_start(rate, max_depth, buffer_bytes);
+}
+
+CL_DOCSTRING(R"dx(Stop the sampling profiler. Samples remain in the buffer
+until profile-save or profile-reset is called.)dx");
+DOCGROUP(clasp);
+CL_DEFUN void ext__profile_stop() { sampling_profiler_stop(); }
+
+CL_DOCSTRING(R"dx(True while the sampling profiler is running.)dx");
+DOCGROUP(clasp);
+CL_DEFUN bool ext__profile_running_p() { return sampling_profiler_running(); }
+
+CL_DOCSTRING(R"dx(Discard all recorded samples and reset counters.)dx");
+DOCGROUP(clasp);
+CL_DEFUN void ext__profile_reset() { sampling_profiler_reset(); }
+
+CL_DOCSTRING(R"dx(Return the symbolicated samples as a vector of symbolicated-sample instances)dx");
+DOCGROUP(clasp);
+CL_DEFUN core::T_sp ext__profile_symbolicated_samples() {
+  std::vector<SymbolicatedSample> res = sampling_profiler_symbolicated_samples();
+  core::ComplexVector_T_sp vec = core::ComplexVector_T_O::make(16384,nil<core::T_O>(),clasp_make_fixnum(0));
+  for ( auto& one : res ) {
+    core::T_sp obj = one.encode();
+    vec->vectorPushExtend(obj);
+  }
+  return vec;
+}
+
+CL_DOCSTRING(R"dx(Write the captured samples to PATH in collapsed-stacks
+format: one semicolon-separated stack per line, followed by a space and the
+sample count. Ready to feed to flamegraph.pl or ext:flamegraph.)dx");
+DOCGROUP(clasp);
+CL_DEFUN bool ext__profile_save(core::String_sp path) {
+  return sampling_profiler_save(path->get_std_string().c_str());
+}
+
+CL_DOCSTRING(R"dx(Return the number of samples recorded so far.)dx");
+DOCGROUP(clasp);
+CL_DEFUN size_t ext__profile_samples_recorded() {
+  return sampling_profiler_samples_recorded();
+}
+
+CL_DOCSTRING(R"dx(Return the number of samples dropped because the buffer was full.)dx");
+DOCGROUP(clasp);
+CL_DEFUN size_t ext__profile_samples_dropped() {
+  return sampling_profiler_samples_dropped();
+}
+
+CL_DOCSTRING(R"dx(Return the bytes used in the ring buffer so far.)dx");
+DOCGROUP(clasp);
+CL_DEFUN size_t ext__profile_bytes_used() {
+  return sampling_profiler_bytes_used();
+}
+
+CL_DOCSTRING(R"dx(Return the bytes available in the ring buffer.)dx");
+DOCGROUP(clasp);
+CL_DEFUN size_t ext__profile_bytes_available() {
+  return g_buffer_bytes;
+}
+
+CL_DOCSTRING(R"dx(Populate the current thread's stack bounds so that samples
+taken on this thread include full frame-pointer-walked stacks rather than
+leaf-only PCs. Call once per Lisp thread that should be fully profiled,
+from a safe context (not a signal handler).)dx");
+DOCGROUP(clasp);
+CL_DEFUN void ext__profile_register_thread() {
+  sampling_profiler_register_current_thread();
+}
+
+} // namespace core
diff --git a/src/core/trampoline/trampoline.cc b/src/core/trampoline/trampoline.cc
index eecc9285b9..988aef7f5a 100644
--- a/src/core/trampoline/trampoline.cc
+++ b/src/core/trampoline/trampoline.cc
@@ -21,7 +21,14 @@ void *CLASP_LITERALS(trampoline)[0];
 // Use asm and nodebug to add declaration for the LLVM intrinsic.
 __attribute__((nodebug)) void LLVM_EXPERIMENTAL_STACKMAP(uint64_t type, uint32_t dummy, ...) asm("llvm.experimental.stackmap");
-return_type bytecode_call(uint64_t pc, void *closure, uint64_t nargs, void **args);
+// Indirect through a global function pointer rather than calling bytecode_call
+// directly. Each compiled trampoline's call topology is now identical
+// (load-from-global + indirect call), with the per-instance difference
+// confined to a single RIP-relative offset field in the load instruction.
+// The global is defined in bytecode.cc and initialized to &bytecode_call
+// at static-initialization time.
+typedef return_type (*bytecode_call_fn_t)(uint64_t pc, void *closure, uint64_t nargs, void **args);
+extern bytecode_call_fn_t g_bytecode_call_ptr;
 
 // The wrapper name may have a scope resolution operator which would require quotes so we use asm with colon to ensure the name is
 // quoted in the bitcode.
@@ -33,6 +40,15 @@ return_type WRAPPER_NAME(uint64_t pc, void *closure, uint64_t nargs, void **args
   trampoline_save_args[0] = (uintptr_t)closure;
   trampoline_save_args[1] = (uintptr_t)nargs;
   trampoline_save_args[2] = (uintptr_t)args;
-  return bytecode_call(pc, closure, nargs, args);
+  return g_bytecode_call_ptr(pc, closure, nargs, args);
 }
+
+// End marker placed directly after the trampoline body so we can compute
+// the trampoline's compiled size at runtime: size = &end_marker - &wrapper.
+// __attribute__((used)) prevents the optimizer from dropping it; the asm
+// name participates in the same "wrapper:name" -> unique-name substitution
+// (the suffix "_end" survives unchanged), so each trampoline gets its own
+// matching end marker symbol.
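+//
+// For example (hypothetical addresses): if the compiled wrapper for
+// "FOO_bct7" starts at 0x1000 and the "FOO_bct7_end" marker lands at
+// 0x1080, the trampoline's copyable body is 0x80 bytes.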
+__attribute__((used, noinline)) void WRAPPER_END_MARKER() asm("wrapper:name_end"); +__attribute__((used, noinline)) void WRAPPER_END_MARKER() {} }; diff --git a/src/gctools/snapshotSaveLoad.cc b/src/gctools/snapshotSaveLoad.cc index 8b32e4c90a..70da083989 100644 --- a/src/gctools/snapshotSaveLoad.cc +++ b/src/gctools/snapshotSaveLoad.cc @@ -37,6 +37,7 @@ #include #include #include +#include #include // wait_for_user_signal #include #include @@ -570,6 +571,12 @@ void encodeEntryPoint(Fixup* fixup, uintptr_t* ptrptr, core::T_sp codebase, core #endif } else if (gc::IsA(codebase)) { encodeEntryPointInLibrary(fixup, ptrptr,"BytecodeModule"); + } else if (codebase.nilp()) { + // No codebase object attached. Used by GFBytecodeSimpleFun where _Code is + // nil: the entry-point pointers are libclasp symbols (entry_point_n and + // its fixed-arity variants) resolvable by dladdr. Encode as a library + // reference so the loader looks them up by name. + encodeEntryPointInLibrary(fixup, ptrptr, "nilCodebase"); } else { ISL_ERROR("The codebase must be a Code_sp or a Library_sp it is %s", _rep_(codebase).c_str()); } @@ -587,6 +594,8 @@ void decodeEntryPoint(Fixup* fixup, uintptr_t* ptrptr, core::T_sp codebase) { decodeEntryPointInLibrary(fixup, ptrptr); } else if (gc::IsA(codebase)) { decodeEntryPointInLibrary(fixup, ptrptr); + } else if (codebase.nilp()) { + decodeEntryPointInLibrary(fixup, ptrptr); } else { SIMPLE_ERROR("The codebase must be a Code_sp or a Library_sp it is {}", _rep_(codebase)); } @@ -1069,11 +1078,20 @@ struct calculate_size_t { gctools::clasp_ptr_t client = HEADER_PTR_TO_GENERAL_PTR(header); size_t objectSize; if (header->_badge_stamp_wtag_mtag._value == DO_SHIFT_STAMP(gctools::STAMPWTAG_llvmo__ObjectFile_O)) { + llvmo::ObjectFile_O* code = (llvmo::ObjectFile_O*)client; + // Transient scaffolding (arena init's shared trampoline / stub + // template) must never appear in the snapshot; these ObjectFiles lack + // the standard __clasp_literals_ / function-vector- symbols + // and would fail force_materialize at load time. Defense-in-depth — + // these are normally unreachable because they skip _AllObjectFiles + // registration, so we shouldn't hit this branch. + if (code->_TransientSkipSnapshot) { + return; + } this->_CodeCount++; // // Calculate the size of a Code_O object keeping only the literals vector // - llvmo::ObjectFile_O* code = (llvmo::ObjectFile_O*)client; size_t saveCodeSize = llvmo::ObjectFile_O::sizeofInState(code, llvmo::SaveState); this->_TotalSize += sizeof(ISLGeneralHeader_s) + saveCodeSize; this->_ObjectFileCount++; @@ -1150,6 +1168,11 @@ struct copy_objects_t { if (header->_badge_stamp_wtag_mtag._value == DO_SHIFT_STAMP(gctools::STAMPWTAG_llvmo__ObjectFile_O)) { llvmo::ObjectFile_O* objectFile = (llvmo::ObjectFile_O*)clientStart; llvmo::ObjectFile_O* code = (llvmo::ObjectFile_O*)clientStart; + // Defense-in-depth: transient arena-init scaffolding must never make + // it into the snapshot. Skip — matching the calculate_size_t pass. 
+      if (code->_TransientSkipSnapshot) {
+        return;
+      }
       ISLGeneralHeader_s islheader(code->frontSize() + code->literalsSize(), (gctools::Header_s*)header, false);
       char* islh = this->_objects->write_buffer((char*)&islheader, sizeof(ISLGeneralHeader_s));
       char* new_client = this->_objects->write_buffer((char*)clientStart, code->frontSize());
@@ -1945,6 +1968,35 @@ void snapshot_save(SaveLispAndDie& data) {
 
   core::lisp_write(fmt::format("Finished invoking cmp:invoke-save-hooks\n"));
 
+  //
+  // Unlink transient-scaffolding ObjectFiles (arena-init shared trampoline and
+  // stub template) from _AllObjectFiles so they are unreachable before the
+  // walker runs. Must happen BEFORE call_with_stopped_world since we
+  // allocate cons cells to rebuild the list. Leaving them reachable causes
+  // the pass-2 forwarding-pointer walk to dereference cons cells that point
+  // at ObjectFiles we're skipping in the walker.
+  {
+    core::T_sp cur = _lisp->_Roots._AllObjectFiles.load(std::memory_order_relaxed);
+    core::T_sp kept = nil<core::T_O>();
+    size_t dropped = 0;
+    while (cur.consp()) {
+      core::T_sp car = CONS_CAR(gc::As_unsafe<core::Cons_sp>(cur));
+      if (gc::IsA<llvmo::ObjectFile_sp>(car)
+          && gc::As_unsafe<llvmo::ObjectFile_sp>(car)->_TransientSkipSnapshot) {
+        ++dropped;
+      } else {
+        kept = core::Cons_O::create(car, kept);
+      }
+      cur = CONS_CDR(gc::As_unsafe<core::Cons_sp>(cur));
+    }
+    _lisp->_Roots._AllObjectFiles.store(kept, std::memory_order_relaxed);
+    if (dropped) {
+      fprintf(stderr, "[trampoline-arena] snapshot save dropped %zu transient ObjectFile(s) from _AllObjectFiles\n", dropped);
+      fflush(stderr);
+    }
+  }
+  core::lisp_write(fmt::format("Finished removing transient object-files\n"));
+
   gctools::call_with_stopped_world(snapshot_save_impl, &data);
 }
@@ -2362,8 +2414,10 @@ void snapshot_load(void* maybeStartOfSnapshot, void* maybeEndOfSnapshot, const s
   // Link all the code objects
   MaybeTimeStartup time5("Object file linking");
   using TP = thread_pool;
-#if 0
+#if 1
   // Create a pool with one thread for debugging threading issues
+  printf("%s:%d:%s Snapshot-load limited to one linker thread\n",
+         __FILE__, __LINE__, __FUNCTION__ );
   TP pool(1);
 #else
   // Create a pool of multiple threads
@@ -2461,7 +2515,7 @@
         // has a race condition. TODO: Fix race condition
         std::lock_guard lk(g_materialize_lock);
         if (!obj_claspJIT->force_materialize(jitdylib, objectId))
-          ISL_ERROR("Failed to materialize JITDylib");
+          ISL_ERROR("Failed to materialize JITDylib objectId=%lu", objectId );
       }
     });
 }
@@ -2682,6 +2736,12 @@
   // Setup the pathname info for wherever the executable was loaded
   //
   core::getcwd(true); // set *default-pathname-defaults*
+
+  // Reattach arena trampolines to bytecode functions whose _Trampoline was
+  // substituted with bytecode_call during save (the arena slot addresses
+  // didn't survive the restart). No-op when the arena backend isn't active.
+  llvmo::arena_post_load_regenerate_trampolines();
+
   {
     char* pause_startup = getenv("CLASP_PAUSE_INIT");
     if (pause_startup) {
diff --git a/src/gctools/threadlocal.cc b/src/gctools/threadlocal.cc
index 6b3d0df2de..4107a2ef26 100644
--- a/src/gctools/threadlocal.cc
+++ b/src/gctools/threadlocal.cc
@@ -144,6 +144,16 @@ void VirtualMachine::startup() {
   this->enable_guards();
   this->_stackPointer = this->_stackBottom;
   (*this->_stackPointer) = NULL;
+
+  // Allocate the dynenv record stack as roots. Size in T_O* slots — each
+  // VMDynRecord occupies sizeof(VMDynRecord)/sizeof(T_O*) slots. Conservative
+  // GC scanning of this block keeps slot0/slot1 alive while a record is live.
+  static_assert(sizeof(VMDynRecord) % sizeof(core::T_O*) == 0,
+                "VMDynRecord must be a multiple of T_O* in size");
+  size_t dynRecordSlots = VirtualMachine::MaxDynRecords * (sizeof(VMDynRecord) / sizeof(core::T_O*));
+  this->_dynRecordBottom = (VMDynRecord*)gctools::GC::allocateRootsAndZero(dynRecordSlots);
+  this->_dynRecordLimit = this->_dynRecordBottom + VirtualMachine::MaxDynRecords;
+  this->_dynRecordTop = this->_dynRecordBottom;
 }
 
 void VirtualMachine::enable_guards() {
@@ -175,6 +185,8 @@ VirtualMachine::~VirtualMachine() {
   // We have nothing to free and _stackBottom is just its initial NULL.
   if (this->_stackBottom)
     gctools::GC::freeRoots(this->_stackBottom);
+  if (this->_dynRecordBottom)
+    gctools::GC::freeRoots((T_O**)this->_dynRecordBottom);
 }
 
 // For main thread initialization - it happens too early and _Nil is undefined
diff --git a/src/lisp/cscript.lisp b/src/lisp/cscript.lisp
index 3a935738d0..40a8a1b343 100644
--- a/src/lisp/cscript.lisp
+++ b/src/lisp/cscript.lisp
@@ -34,6 +34,7 @@
   #~"kernel/lsp/assert.lisp"
   #~"kernel/clos/package.lisp"
   #~"kernel/lsp/ext-package.lisp"
+  #~"kernel/lsp/command-line-extensions.lisp"
   #~"kernel/lsp/arraylib.lisp"
   #~"kernel/lsp/numlib.lisp"
   #~"kernel/lsp/predlib.lisp"
@@ -171,6 +172,7 @@
   #~"kernel/lsp/macroexpand-all.lisp"
   #~"kernel/cmp/external-clang.lisp"
   #~"kernel/cmp/disltv.lisp"
+  #~"kernel/lsp/flamegraph.lisp"
   #~"kernel/cleavir/auto-compile.lisp"))
 
 (defun add-extension-sources (target)
diff --git a/src/lisp/kernel/lsp/command-line-extensions.lisp b/src/lisp/kernel/lsp/command-line-extensions.lisp
new file mode 100644
index 0000000000..c1843180f8
--- /dev/null
+++ b/src/lisp/kernel/lsp/command-line-extensions.lisp
@@ -0,0 +1,60 @@
+(in-package #:ext)
+
+(defvar *command-line-extensions* (make-hash-table :test 'equal)
+  "Maps option-string (e.g. \"--swank\" or \"--ext:swank\") to a plist
+of (:takes-arg BOOL :handler FUNCTION :help STRING).")
+
+(defun register-command-line-option (option &key takes-arg handler help)
+  "Register an extension command-line option.
+OPTION is the literal flag string, e.g. \"--swank\" or \"--ext:swank-port\".
+If TAKES-ARG is true, HANDLER is called with the next token as a string.
+Otherwise HANDLER is called with no arguments.
+HELP is a one-line description shown by ext:print-extension-command-line-help."
+  (check-type option string)
+  (check-type handler (or symbol function))
+  (setf (gethash option *command-line-extensions*)
+        (list :takes-arg takes-arg :handler handler :help help))
+  option)
+
+(defun unregister-command-line-option (option)
+  (remhash option *command-line-extensions*))
+
+(defun print-extension-command-line-help (&optional (stream *standard-output*))
+  (maphash (lambda (option spec)
+             (format stream "  ~a~:[~; <arg>~]~%      ~a~%"
+                     option (getf spec :takes-arg) (or (getf spec :help) "")))
+           *command-line-extensions*))
+
+(defun %parse-ext-equals-form (arg)
+  "If ARG is of the form --ext:NAME=VALUE, return (values NAME VALUE).
+Otherwise return NIL."
+  (when (and (>= (length arg) 7)
+             (string= arg "--ext:" :end1 6))
+    (let ((eq (position #\= arg :start 6)))
+      (when eq
+        (values (subseq arg 0 eq) (subseq arg (1+ eq)))))))
+
+(defun process-extension-command-line-arguments ()
+  "Walk core:extension-command-line-arguments, dispatching each registered
+option to its handler. Errors on any unclaimed token."
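+  ;; For example (hypothetical option and handler), an extension that
+  ;; registered
+  ;;   (ext:register-command-line-option "--ext:swank-port"
+  ;;     :takes-arg t :help "Start swank on this port."
+  ;;     :handler (lambda (port) (start-swank (parse-integer port))))
+  ;; gets its handler called with "4005" for either
+  ;; "--ext:swank-port 4005" or "--ext:swank-port=4005".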
+  (let ((args (core:extension-command-line-arguments)))
+    (loop while args
+          do (let ((arg (pop args)))
+               (multiple-value-bind (eq-name eq-value) (%parse-ext-equals-form arg)
+                 (let* ((lookup (or eq-name arg))
+                        (spec (gethash lookup *command-line-extensions*)))
+                   (cond
+                     ((null spec)
+                      (error "Unrecognized command-line option: ~a" arg))
+                     (eq-name
+                      (funcall (getf spec :handler) eq-value))
+                     ((getf spec :takes-arg)
+                      (unless args
+                        (error "Command-line option ~a requires an argument" arg))
+                      (let ((value (pop args)))
+                        (when (gethash value *command-line-extensions*)
+                          (error "Command-line option ~a expected a value but got another option ~a"
+                                 arg value))
+                        (funcall (getf spec :handler) value)))
+                     (t
+                      (funcall (getf spec :handler))))))))))
diff --git a/src/lisp/kernel/lsp/debug.lisp b/src/lisp/kernel/lsp/debug.lisp
index 1c8b0184f1..73ae1fe81d 100644
--- a/src/lisp/kernel/lsp/debug.lisp
+++ b/src/lisp/kernel/lsp/debug.lisp
@@ -293,8 +293,21 @@ The delimiters and visibility may be ignored by using the lower level FRAME-UP,
   `(call-with-stack (lambda (,stack) (declare (core:lambda-name with-stack-lambda)) ,@body)
                     ,@kwargs))
 
+(defun unnamed-bytecode-call-frame-p (frame)
+  "True for the C++ bytecode_call frame's fallback DebuggerFrame_O — the
+one whose function name is the symbol :BYTECODE because we couldn't (or
+no longer try to) recover the bytecode-VM frame's pc/fp metadata. The
+arena trampoline frame immediately above it already carries the Lisp
+function name, so this frame is pure noise. Returns NIL for the named
+bytecode frames (e.g., the innermost one resolved via
+bytecode_function_for_pc), which we keep because they carry per-frame
+bindings/locals."
+  (and (eq (frame-language frame) :bytecode)
+       (eq (core:debugger-frame-fname frame) :bytecode)))
+
 (defparameter *frame-filters* (list 'c++-frame-p 'redundant-xep-p
+                                    'unnamed-bytecode-call-frame-p
                                     'package-hider 'fname-hider)
   "A list of function designators. Any CLASP-DEBUG:FRAME for which any of the functions returns
 true will be considered invisible by the mid level CLASP-DEBUG interface (e.g. UP, DOWN)")
diff --git a/src/lisp/kernel/lsp/ext-package.lisp b/src/lisp/kernel/lsp/ext-package.lisp
index e0b3559979..54d6fe4f8e 100644
--- a/src/lisp/kernel/lsp/ext-package.lisp
+++ b/src/lisp/kernel/lsp/ext-package.lisp
@@ -131,5 +131,9 @@
            ;; C++ iterators
            do-c++-iterator map-c++-iterator
            ;; Misc
-           printing-char-p))
+           printing-char-p
+           register-command-line-option
+           unregister-command-line-option
+           print-extension-command-line-help
+           process-extension-command-line-arguments))
 ) ; eval-when
diff --git a/src/lisp/kernel/lsp/flamegraph.lisp b/src/lisp/kernel/lsp/flamegraph.lisp
new file mode 100644
index 0000000000..30fd41a051
--- /dev/null
+++ b/src/lisp/kernel/lsp/flamegraph.lisp
@@ -0,0 +1,1129 @@
+;;; flamegraph.lisp — Common Lisp port of Brendan Gregg's flamegraph.pl.
+;;;
+;;; Derived from https://github.com/brendangregg/FlameGraph
+;;; Original copyright: 2011 Joyent, 2011 Brendan Gregg, 2016 Netflix.
+;;; License: CDDL 1.0 (same as the Perl original).
+;;; +;;; Updated by Christian Schafmeister (April 2026) to +;;; support clasp colors and frame filters +;;; +;;; Usage from a shell (SBCL): +;;; sbcl --script flamegraph.lisp [options] input.txt > graph.svg +;;; From a Lisp REPL: +;;; (load "flamegraph.lisp") +;;; (flamegraph:main '("--title" "My Profile" "/tmp/prof.collapsed")) +;;; +;;; Input format: one line per sample, semicolon-separated frames then a +;;; space and an integer count: +;;; +;;; main;foo;bar 42 +;;; main;foo;baz 17 +;;; +;;; Output: SVG with interactive zoom/search JavaScript on stdout. +;;; +;;; Scope differences vs the Perl original: +;;; * Integer counts only. Fractional counts in the input are dropped. +;;; * No differential flame graphs (no second samples column). +;;; * No --nameattr (per-function attribute file). +;;; * No --cp / --pal (consistent palette persistence). +;;; * No "chain" waker annotation. +;;; Everything else (palettes, --reverse, --inverted, --hash, --negate, +;;; --title, --subtitle, --minwidth, zoom/search JS) ported faithfully. +;;; +;;; Only uses ANSI Common Lisp. No external libraries. + +(defpackage #:flamegraph + (:use #:cl) + (:export #:main #:flamegraph)) + +(in-package #:flamegraph) + +;;; --------------------------------------------------------------------------- +;;; Options +;;; --------------------------------------------------------------------------- + +(defstruct opts + (title "") + (subtitle "") + (image-width 1200) + (frame-height 16) + (font-type "Verdana") + (font-size 12) + (font-width 0.59) + (min-width 0.1) + (name-type "Function:") + (count-name "samples") + (colors "hot") + (bg-color1 "#eeeeee") + (bg-color2 "#eeeeb0") + (total nil) + (factor 1.0) + (hash nil) + (reverse-stack nil) + (inverted t) + (negate nil) + (notes "") + (encoding nil) + (search-color "rgb(230,0,230)") + (input-file nil)) + +;;; --------------------------------------------------------------------------- +;;; Command-line +;;; --------------------------------------------------------------------------- + +(defun cmdline-args () + #+sbcl (cdr sb-ext:*posix-argv*) + #+ccl (cdr ccl:*command-line-argument-list*) + #+ecl (cdr (si:command-args)) + #+clisp ext:*args* + #+clasp (cdr core:*command-line-arguments*) + #+abcl (cdr ext:*command-line-argument-list*) + #-(or sbcl ccl ecl clisp clasp abcl) nil) + +(defun usage () + (format *error-output* "~%USAGE: flamegraph.lisp [options] [input.txt] > out.svg~%~%~ + --title TEXT change title text~%~ + --subtitle TEXT second-level title~%~ + --width NUM image width (default 1200)~%~ + --height NUM frame height (default 16)~%~ + --minwidth NUM omit smaller functions (default 0.1 px)~%~ + --fonttype FONT font type (default Verdana)~%~ + --fontsize NUM font size (default 12)~%~ + --fontwidth NUM avg width relative to fontsize (default 0.59)~%~ + --countname TEXT count label (default samples)~%~ + --nametype TEXT name label (default Function:)~%~ + --colors NAME palette: hot mem io wakeup chain java js perl clasp~%~ + red green blue aqua yellow purple orange~%~ + --hash color by function-name hash (stable across runs)~%~ + --reverse reverse each stack before merging~%~ + --inverted icicle (root at top) — the default~%~ + --no-inverted classic flame graph (root at bottom)~%~ + --negate switch differential hues~%~ + --notes TEXT embed a note in the SVG~%~ + --total NUM override total (max) count~%~ + --factor NUM scale counts~%~ + --encoding ENC SVG XML encoding attribute~%~ + --help this message~%")) + +(defun parse-args (args) + (let ((o (make-opts)) + (remaining 
(copy-list args)))
+    (loop while remaining
+          do (let ((a (pop remaining)))
+               (cond
+                 ((string= a "--title") (setf (opts-title o) (pop remaining)))
+                 ((string= a "--subtitle") (setf (opts-subtitle o) (pop remaining)))
+                 ((string= a "--width") (setf (opts-image-width o) (parse-integer (pop remaining))))
+                 ((string= a "--height") (setf (opts-frame-height o) (parse-integer (pop remaining))))
+                 ((string= a "--minwidth") (setf (opts-min-width o) (read-from-string (pop remaining))))
+                 ((string= a "--fonttype") (setf (opts-font-type o) (pop remaining)))
+                 ((string= a "--fontsize") (setf (opts-font-size o) (read-from-string (pop remaining))))
+                 ((string= a "--fontwidth") (setf (opts-font-width o) (read-from-string (pop remaining))))
+                 ((string= a "--countname") (setf (opts-count-name o) (pop remaining)))
+                 ((string= a "--nametype") (setf (opts-name-type o) (pop remaining)))
+                 ((string= a "--colors") (setf (opts-colors o) (pop remaining)))
+                 ((string= a "--total") (setf (opts-total o) (parse-integer (pop remaining))))
+                 ((string= a "--factor") (setf (opts-factor o) (read-from-string (pop remaining))))
+                 ((string= a "--notes") (setf (opts-notes o) (pop remaining)))
+                 ((string= a "--encoding") (setf (opts-encoding o) (pop remaining)))
+                 ((string= a "--hash") (setf (opts-hash o) t))
+                 ((string= a "--reverse") (setf (opts-reverse-stack o) t))
+                 ((string= a "--inverted") (setf (opts-inverted o) t))
+                 ((string= a "--no-inverted") (setf (opts-inverted o) nil))
+                 ((string= a "--negate") (setf (opts-negate o) t))
+                 ((or (string= a "--help") (string= a "-h"))
+                  (usage)
+                  (return-from parse-args nil))
+                 ((and (plusp (length a)) (char= (schar a 0) #\-))
+                  (format *error-output* "Unknown option: ~A~%" a)
+                  (usage)
+                  (return-from parse-args nil))
+                 (t
+                  (setf (opts-input-file o) a)))))
+    ;; Background color selection driven by palette.
+    (cond
+      ((or (string= (opts-colors o) "mem") (string= (opts-colors o) "chain"))
+       (setf (opts-bg-color1 o) "#eeeeee"
+             (opts-bg-color2 o) "#e0e0ff"))
+      ((member (opts-colors o)
+               '("io" "wakeup" "red" "green" "blue" "aqua" "yellow" "purple" "orange")
+               :test #'string=)
+       (setf (opts-bg-color1 o) "#f8f8f8"
+             (opts-bg-color2 o) "#e8e8e8")))
+    (when (and (plusp (length (opts-notes o)))
+               (or (find #\< (opts-notes o)) (find #\> (opts-notes o))))
+      (error "Notes string can't contain < or >"))
+    (when (zerop (length (opts-title o)))
+      (setf (opts-title o)
+            (if (opts-inverted o) "Icicle Graph" "Flame Graph")))
+    o))
+
+;;; ---------------------------------------------------------------------------
+;;; Utilities
+;;; ---------------------------------------------------------------------------
+
+(defun xml-escape (s)
+  (with-output-to-string (out)
+    (loop for c across s
+          do (case c
+               (#\& (write-string "&amp;" out))
+               (#\< (write-string "&lt;" out))
+               (#\> (write-string "&gt;" out))
+               (#\" (write-string "&quot;" out))
+               (t (write-char c out))))))
+
+(defun split-string (s sep)
+  "Split S at every occurrence of SEP (a character). Returns a list of
+substrings — empty substrings are preserved."
+  (loop with start = 0
+        with len = (length s)
+        for pos = (position sep s :start start)
+        if pos
+          collect (subseq s start pos) into parts
+          and do (setf start (1+ pos))
+        else
+          return (nconc parts (list (subseq s start len)))))
+
+(defun join (list sep)
+  (with-output-to-string (out)
+    (loop for cell on list
+          do (write-string (car cell) out)
+             (when (cdr cell) (write-char sep out)))))
+
+(defun list-string< (a b)
+  "Lexicographic comparison of two lists of strings."
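+  ;; e.g. (list-string< '("a" "b") '("a" "c")) => T, and a proper prefix
+  ;; sorts first: (list-string< '("a") '("a" "b")) => T.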
+ (loop + (cond ((null a) (return (not (null b)))) + ((null b) (return nil)) + ((string< (car a) (car b)) (return t)) + ((string< (car b) (car a)) (return nil)) + (t (pop a) (pop b))))) + +(defun format-with-commas (n) + "Format integer N with thousands-separator commas." + (let* ((s (princ-to-string n)) + (neg (and (plusp (length s)) (char= (schar s 0) #\-))) + (digits (if neg (subseq s 1) s)) + (dlen (length digits)) + (first-group (or (mod dlen 3) 0))) + (when (zerop first-group) (setf first-group 3)) + (with-output-to-string (out) + (when neg (write-char #\- out)) + (loop for i below dlen + do (when (and (plusp i) (zerop (mod (- i first-group) 3))) + (write-char #\, out)) + (write-char (schar digits i) out))))) + +(defun suffix-p (s suffix) + (and (>= (length s) (length suffix)) + (string= s suffix :start1 (- (length s) (length suffix))))) + +(defun starts-with-p (s prefix) + (and (>= (length s) (length prefix)) + (string= s prefix :end1 (length prefix)))) + +(defun strip-frame-annotation (name) + "Strip trailing _[k], _[w], _[i], _[j] annotation if present." + (let ((len (length name))) + (if (and (>= len 4) + (char= (schar name (- len 4)) #\_) + (char= (schar name (- len 3)) #\[) + (find (schar name (- len 2)) "kwij") + (char= (schar name (- len 1)) #\])) + (subseq name 0 (- len 4)) + name))) + +;;; --------------------------------------------------------------------------- +;;; Color palettes +;;; --------------------------------------------------------------------------- + +(defun namehash (name) + "Produce a [0,1) float from NAME, weighting early characters heavier +so semantically-related names get similar colors." + (let* ((tick (position #\` name)) + (name (if tick (subseq name (1+ tick)) name)) + (vector 0.0) + (weight 1.0) + (maxw 1.0) + (modulus 10)) + (loop for c across name + while (<= modulus 12) + do (let ((i (mod (char-code c) modulus))) + (incf vector (* (/ (float i) (float (1- (incf modulus)))) weight)) + (incf maxw weight) + (setf weight (* weight 0.7)))) + (- 1.0 (/ vector maxw)))) + +(defun rand-float () (random 1.0)) + +(defun rgb (r g b) (format nil "rgb(~D,~D,~D)" r g b)) + +(defun color (type hash-p name) + (multiple-value-bind (v1 v2 v3) + (if hash-p + (let ((h1 (namehash name)) + (h2 (namehash (reverse name)))) + (values h1 h2 h2)) + (values (rand-float) (rand-float) (rand-float))) + ;; "Multi" palettes remap by inspecting the frame name, then fall + ;; through to the single-color palettes. 
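+    ;; For example, with HASH-P true the color depends only on the frame
+    ;; name: (color "hot" t "FIB") is identical across runs, while with
+    ;; HASH-P false each call draws fresh values from *random-state*.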
+ (setf type (remap-type type name)) + (cond + ((equal type "hot") (rgb (+ 205 (floor (* 50 v3))) + (floor (* 230 v1)) + (floor (* 55 v2)))) + ((equal type "mem") (rgb 0 + (+ 190 (floor (* 50 v2))) + (floor (* 210 v1)))) + ((equal type "io") (let ((r (+ 80 (floor (* 60 v1))))) + (rgb r r (+ 190 (floor (* 55 v2)))))) + ((equal type "red") (let ((r (+ 200 (floor (* 55 v1)))) + (x (+ 50 (floor (* 80 v1))))) + (rgb r x x))) + ((equal type "green") (let ((g (+ 200 (floor (* 55 v1)))) + (x (+ 50 (floor (* 60 v1))))) + (rgb x g x))) + ((equal type "blue") (let ((b (+ 205 (floor (* 50 v1)))) + (x (+ 80 (floor (* 60 v1))))) + (rgb x x b))) + ((equal type "yellow") (let ((x (+ 175 (floor (* 55 v1)))) + (b (+ 50 (floor (* 20 v1))))) + (rgb x x b))) + ((equal type "purple") (let ((x (+ 190 (floor (* 65 v1)))) + (g (+ 80 (floor (* 60 v1))))) + (rgb x g x))) + ((equal type "aqua") (rgb (+ 50 (floor (* 60 v1))) + (+ 165 (floor (* 55 v1))) + (+ 165 (floor (* 55 v1))))) + ((equal type "orange") (rgb (+ 190 (floor (* 65 v1))) + (+ 90 (floor (* 65 v1))) + 0)) + (t "rgb(0,0,0)")))) + +(defun remap-type (type name) + (cond + ((equal type "clasp") + (cond + ((search "_bct" name) "blue") + ((starts-with-p name "bytecode_call") "blue") + ((search "::" name) "green") + ((search "bytecode_vm" name) "green") + ((starts-with-p name "`GC_") "red") + ((starts-with-p name "GC_") "red") + ((suffix-p name "^^") "green") + ((find #\^ name) "green") + ((suffix-p name "_[k]") "orange") + (t "yellow"))) + ((equal type "java") + (cond ((suffix-p name "_[j]") "green") + ((suffix-p name "_[i]") "aqua") + ((some (lambda (p) (starts-with-p name p)) + '("java/" "org/" "com/" "io/" "sun/" + "Ljava/" "Lorg/" "Lcom/" "Lio/" "Lsun/")) + "green") + ((suffix-p name "_[k]") "orange") + ((search "::" name) "yellow") + (t "red"))) + ((equal type "perl") + (cond ((search "::" name) "yellow") + ((or (search "Perl" name) (search ".pl" name)) "green") + ((suffix-p name "_[k]") "orange") + (t "red"))) + ((equal type "js") + (cond ((suffix-p name "_[j]") + (if (find #\/ name) "green" "aqua")) + ((search "::" name) "yellow") + ((and (find #\/ name) (search ".js" name)) "green") + ((find #\: name) "aqua") + ((string= name " ") "green") + ((search "_[k]" name) "orange") + (t "red"))) + ((equal type "wakeup") "aqua") + ((equal type "chain") + (if (search "_[w]" name) "aqua" "blue")) + (t type))) + +(defun color-scale (value maxv negate-p) + "Differential color: blue for negative, red for positive, scaled by MAXV." + (let ((r 255) (g 255) (b 255) + (v (if negate-p (- value) value))) + (cond ((> v 0) (setf g (floor (* 210 (/ (- maxv v) (float maxv)))) + b g)) + ((< v 0) (setf r (floor (* 210 (/ (+ maxv v) (float maxv)))) + g r))) + (rgb r g b))) + +;;; --------------------------------------------------------------------------- +;;; Input +;;; --------------------------------------------------------------------------- + +(defun parse-folded-line (line) + "Return (values FRAMES COUNT) from a collapsed-stacks line, where +FRAMES is a list of frame-name strings (outermost-first), or +(values NIL NIL) if the line can't be parsed." 
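+  ;; e.g. (parse-folded-line "main;foo;bar 42") returns the two values
+  ;; ("main" "foo" "bar") and 42.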
+ (let ((line (string-right-trim '(#\Space #\Tab #\Return) line))) + (when (plusp (length line)) + (let ((sp (position #\Space line :from-end t))) + (when sp + (let ((num-str (subseq line (1+ sp))) + (stack (string-right-trim '(#\Space #\Tab) + (subseq line 0 sp)))) + (when (and (plusp (length num-str)) + (every #'digit-char-p num-str)) + (values (split-string stack #\;) (parse-integer num-str))))))))) + +(defun read-input (stream) + "Return a list of (STACK . COUNT) pairs from STREAM." + (let ((out '()) + (ignored 0)) + (loop for line = (read-line stream nil nil) + while line + do (multiple-value-bind (stack count) (parse-folded-line line) + (if (and stack count) + (push (cons stack count) out) + (incf ignored)))) + (when (plusp ignored) + (format *error-output* "Ignored ~D lines with invalid format~%" ignored)) + (nreverse out))) + +(defun transform-sample (sym-sample) + ;; sym-sample layout: #(thread-id sample-count #(frame ...)) + (values (coerce (aref sym-sample 2) 'list) + (aref sym-sample 1))) + +(defun transform-symbolicated-samples (sym-samples) + (let ((out nil)) + (loop for sym-sample across sym-samples + do (multiple-value-bind (stack count) (transform-sample sym-sample) + (push (cons stack count) out))) + (nreverse out))) + +(defun reverse-stack-frames (stack) + (reverse stack)) + +;;; --------------------------------------------------------------------------- +;;; Flow / merge +;;; --------------------------------------------------------------------------- + +(defstruct node + func + depth + stime + etime + delta) + +(defun flow (last this v nodes tmp delta) + "Merge stack vectors LAST and THIS at time V. Closes frames from LAST +that differ from THIS (recording them in NODES keyed by func;depth;etime) +and opens new frames from THIS in TMP. Returns THIS." + (let* ((len-a (1- (length last))) + (len-b (1- (length this))) + (len-same 0)) + (loop for i from 0 + while (and (<= i len-a) (<= i len-b) + (string= (aref last i) (aref this i))) + do (incf len-same)) + (loop for i from len-a downto len-same + do (let* ((func (aref last i)) + (k (format nil "~A;~D" func i)) + (partial (gethash k tmp))) + (when partial + (setf (gethash (format nil "~A;~D" k v) nodes) + (make-node :func func :depth i + :stime (node-stime partial) + :etime v + :delta (node-delta partial))) + (remhash k tmp)))) + (loop for i from len-same to len-b + do (let* ((func (aref this i)) + (k (format nil "~A;~D" func i))) + (setf (gethash k tmp) + (make-node :func func :depth i :stime v + :delta (when delta + (if (= i len-b) delta 0)))))) + this)) + +;;; --------------------------------------------------------------------------- +;;; Embedded JavaScript (zoom/search) — ported verbatim from flamegraph.pl +;;; with placeholders for $xpad, $fontsize, $fontwidth, $inverted, $nametype, +;;; $searchcolor substituted at emission time. 
+;;; ---------------------------------------------------------------------------
+
+(defun embedded-js (opts)
+  (let ((xpad 10)
+        (fontsize (opts-font-size opts))
+        (fontwidth (opts-font-width opts))
+        (inverted (if (opts-inverted opts) 1 0))
+        (nametype (opts-name-type opts))
+        (searchcolor (opts-search-color opts)))
+    (format nil "~
+
+" nametype fontsize fontwidth xpad xpad xpad xpad xpad xpad inverted searchcolor)))
+
+;;; ---------------------------------------------------------------------------
+;;; SVG emission primitives
+;;; ---------------------------------------------------------------------------
+
+(defun emit-svg-header (out width height opts)
+  (format out "<?xml version=\"1.0\"~@[ encoding=\"~A\"~] standalone=\"no\"?>
+<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">
+<svg version=\"1.1\" width=\"~D\" height=\"~D\" onload=\"init(evt)\" viewBox=\"0 0 ~D ~D\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">
+<!-- NOTES: ~A -->
+"
+          (opts-encoding opts) width height width height (opts-notes opts)))
+
+(defun emit-rect (out x1 y1 x2 y2 fill &optional (extra ""))
+  (format out "<rect x=\"~,1F\" y=\"~,1F\" width=\"~,1F\" height=\"~,1F\" fill=\"~A\" ~A/>~%"
+          (float x1) (float y1) (float (- x2 x1)) (float (- y2 y1)) fill extra))
+
+(defun emit-text (out color font size x y str &key (anchor "left") (extra ""))
+  (format out "<text text-anchor=\"~A\" x=\"~,2F\" y=\"~A\" font-size=\"~A\" font-family=\"~A\" fill=\"~A\" ~A>~A</text>~%"
+          anchor (float x) y size font color extra str))
+
+;;; ---------------------------------------------------------------------------
+;;; Top-level drawing
+;;; ---------------------------------------------------------------------------
+
+(defun default-frame-filter (name)
+  (cond
+    ((search "bytecode_vm" name) nil)
+    ((search "bytecode_call" name) nil)
+    ((search "entry_point_" name) nil)
+    ((search "default_bytecode_" name) nil)
+    ((search "apply_inner_" name) nil)
+    (t t)))
+
+(defun flamegraph (&key (data (ext:profile-symbolicated-samples))
+                        (output (make-string-output-stream))
+                        (title "") (subtitle "")
+                        (image-width 1200) (frame-height 16)
+                        (font-type "Verdana") (font-size 12) (font-width 0.59)
+                        (min-width 0.1)
+                        (name-type "Function:") (count-name "samples")
+                        (colors "clasp")
+                        (bg-color1 "#eeeeee" bg-color1-p)
+                        (bg-color2 "#eeeeb0" bg-color2-p)
+                        total (factor 1.0)
+                        hash reverse-stack (inverted t) negate
+                        (notes "") encoding
+                        (search-color "rgb(230,0,230)")
+                        (frame-filter #'default-frame-filter))
+  "Render a flamegraph from DATA to OUTPUT (a stream).
+DATA is a list of (FRAMES . COUNT) pairs where FRAMES is a list of
+frame-name strings, outermost-first (index 0 = root, last = leaf).
+Keyword arguments correspond to fields of the `opts' struct.
+
+Certain :colors palettes (mem, chain, io, wakeup, red, green, blue, aqua,
+yellow, purple, orange) imply specific :bg-color1 / :bg-color2 defaults.
+If you pass an explicit :bg-color1 or :bg-color2 alongside one of those
+palettes, your value is honored and a warning is printed to
+*error-output* noting which palette default was skipped."
+  (let ((o (make-opts :title title :subtitle subtitle
+                      :image-width image-width :frame-height frame-height
+                      :font-type font-type :font-size font-size
+                      :font-width font-width :min-width min-width
+                      :name-type name-type :count-name count-name
+                      :colors colors
+                      :bg-color1 bg-color1 :bg-color2 bg-color2
+                      :total total :factor factor
+                      :hash hash :reverse-stack reverse-stack
+                      :inverted inverted :negate negate
+                      :notes notes :encoding encoding
+                      :search-color search-color)))
+    ;; Accept either the pre-built (frames . count) list or the raw
+    ;; vector returned by ext:profile-symbolicated-samples.
+    (when (vectorp data)
+      (setf data (transform-symbolicated-samples data)))
+    ;; Drop frames the caller doesn't want. Samples that become empty
+    ;; (all frames filtered out) are removed entirely.
+    (when frame-filter
+      (setf data
+            (loop for (frames . count) in data
+                  for kept = (remove-if-not frame-filter frames)
+                  when kept
+                    collect (cons kept count))))
+    ;; Palette-driven bg defaults — applied only when the caller didn't
+    ;; supply the corresponding keyword.
+    (multiple-value-bind (palette-bg1 palette-bg2)
+        (cond ((or (string= colors "mem") (string= colors "chain"))
+               (values "#eeeeee" "#e0e0ff"))
+              ((member colors
+                       '("io" "wakeup" "red" "green" "blue"
+                         "aqua" "yellow" "purple" "orange")
+                       :test #'string=)
+               (values "#f8f8f8" "#e8e8e8"))
+              (t (values nil nil)))
+      (when (and palette-bg1 (or bg-color1-p bg-color2-p))
+        (format *error-output*
+                "flamegraph: :colors ~S has palette-default backgrounds ~
+                 ~A / ~A but you passed explicit~:[~; :bg-color1~]~
+                 ~:[~; :bg-color2~]; keeping your values.~%"
+                colors palette-bg1 palette-bg2 bg-color1-p bg-color2-p))
+      (unless bg-color1-p
+        (when palette-bg1 (setf (opts-bg-color1 o) palette-bg1)))
+      (unless bg-color2-p
+        (when palette-bg2 (setf (opts-bg-color2 o) palette-bg2))))
+    (when (zerop (length (opts-title o)))
+      (setf (opts-title o)
+            (if (opts-inverted o) "Icicle Graph" "Flame Graph")))
+    (do-flamegraph o data output)
+    output))
+
+(defun do-flamegraph (opts data output)
+  (when (opts-reverse-stack opts)
+    (setf data (mapcar (lambda (p)
+                         (cons (reverse-stack-frames (car p)) (cdr p)))
+                       data)))
+  (setf data (sort data #'list-string< :key #'car))
+
+  (let* ((nodes (make-hash-table :test 'equal))
+         (tmp (make-hash-table :test 'equal))
+         (last (vector ""))
+         (time 0))
+    (dolist (pair data)
+      (destructuring-bind (stack . samples) pair
+        (let ((frames (coerce (cons "" stack) 'vector)))
+          (setf last (flow last frames time nodes tmp nil))
+          (incf time samples))))
+    (flow last (vector) time nodes tmp nil)
+
+    (when (zerop time)
+      (format *error-output* "ERROR: No stack counts found~%")
+      (let* ((iw (opts-image-width opts))
+             (ih (* (opts-font-size opts) 5)))
+        (emit-svg-header output iw ih opts)
+        (emit-text output "rgb(0,0,0)" (opts-font-type opts)
+                   (+ (opts-font-size opts) 2)
+                   (floor iw 2) (* (opts-font-size opts) 2)
+                   "ERROR: No valid input provided to flamegraph.lisp."
+                   :anchor "middle")
+        (format output "</svg>~%"))
+      (return-from do-flamegraph nil))
+
+    (let* ((timemax (let ((tm (opts-total opts)))
+                      (cond ((and tm (< tm time))
+                             (format *error-output* "Specified --total ~D < actual ~D, ignoring~%" tm time)
+                             time)
+                            (tm tm)
+                            (t time))))
+           (imagewidth (opts-image-width opts))
+           (frameheight (opts-frame-height opts))
+           (fontsize (opts-font-size opts))
+           (ypad1 (* fontsize 3))
+           (ypad2 (+ (* fontsize 2) 10))
+           (ypad3 (* fontsize 2))
+           (xpad 10)
+           (framepad 1)
+           (widthpertime (/ (- imagewidth (* 2 xpad)) (float timemax)))
+           (minwidth-time (/ (opts-min-width opts) widthpertime))
+           (depthmax 0)
+           (has-subtitle (plusp (length (opts-subtitle opts)))))
+
+      ;; Prune narrow nodes and record depthmax.
+      (let ((to-delete '()))
+        (maphash (lambda (id n)
+                   (if (< (- (node-etime n) (node-stime n)) minwidth-time)
+                       (push id to-delete)
+                       (when (> (node-depth n) depthmax)
+                         (setf depthmax (node-depth n)))))
+                 nodes)
+        (dolist (id to-delete) (remhash id nodes)))
+
+      (let ((imageheight (+ (* (1+ depthmax) frameheight) ypad1 ypad2
+                            (if has-subtitle ypad3 0))))
+        (emit-svg-header output imagewidth imageheight opts)
+
+        ;; <defs> for background gradient + the interactive JS.
+        (format output "<defs>
+  <linearGradient id=\"background\" y1=\"0\" y2=\"1\" x1=\"0\" x2=\"0\">
+    <stop stop-color=\"~A\" offset=\"5%\"/>
+    <stop stop-color=\"~A\" offset=\"95%\"/>
+  </linearGradient>
+</defs>
+"
+                (opts-bg-color1 opts) (opts-bg-color2 opts))
+        (write-string (embedded-js opts) output)
+
+        ;; Background.
+        (emit-rect output 0 0 imagewidth imageheight "url(#background)")
+
+        ;; Title + subtitle.
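+        ;; For example, with the default options the line below emits
+        ;; roughly (hypothetical title):
+        ;;   <text text-anchor="middle" x="600" y="24" ...>My Profile</text>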
+        (emit-text output "rgb(0,0,0)" (opts-font-type opts) (+ fontsize 5)
+                   (floor imagewidth 2) (* fontsize 2)
+                   (xml-escape (opts-title opts))
+                   :anchor "middle")
+        (when has-subtitle
+          (emit-text output "rgb(160,160,160)" (opts-font-type opts) fontsize
+                     (floor imagewidth 2) (* fontsize 4)
+                     (xml-escape (opts-subtitle opts))
+                     :anchor "middle"))
+
+        ;; Details / Reset Zoom / Search / Matched labels.
+        (emit-text output "rgb(0,0,0)" (opts-font-type opts) fontsize
+                   xpad (- imageheight (floor ypad2 2))
+                   " " :extra "id=\"details\"")
+        (emit-text output "rgb(0,0,0)" (opts-font-type opts) fontsize
+                   xpad (* fontsize 2)
+                   "Reset Zoom"
+                   :extra "id=\"unzoom\" tabindex=\"-1\" onclick=\"unzoom()\" style=\"opacity:0.0;cursor:pointer\"")
+        (emit-text output "rgb(0,0,0)" (opts-font-type opts) fontsize
+                   (- imagewidth xpad 100) (* fontsize 2)
+                   "Search"
+                   :extra "id=\"search\" tabindex=\"-1\" onmouseover=\"searchover()\" onmouseout=\"searchout()\" onclick=\"search_prompt()\" style=\"opacity:0.1;cursor:pointer\"")
+        (emit-text output "rgb(0,0,0)" (opts-font-type opts) fontsize
+                   (- imagewidth xpad 100) (- imageheight (floor ypad2 2))
+                   " " :extra "id=\"matched\"")
+
+        ;; Each merged node → a <g> with a tooltip, <rect> and
+        ;; <text>.
+        (maphash
+         (lambda (id n)
+           (declare (ignore id))
+           (let* ((func (node-func n))
+                  (depth (node-depth n))
+                  (stime (node-stime n))
+                  (etime (if (and (string= func "") (zerop depth))
+                             timemax
+                             (node-etime n)))
+                  (x1 (+ xpad (* stime widthpertime)))
+                  (x2 (+ xpad (* etime widthpertime)))
+                  y1 y2)
+             (if (opts-inverted opts)
+                 (setf y1 (+ ypad1 (* depth frameheight))
+                       y2 (+ ypad1 (* (1+ depth) frameheight) (- framepad)))
+                 (setf y1 (- imageheight ypad2 (* (1+ depth) frameheight) (- framepad))
+                       y2 (- imageheight ypad2 (* depth frameheight))))
+             (let* ((samples (round (* (- etime stime) (opts-factor opts))))
+                    (samples-txt (format-with-commas samples))
+                    (info
+                      (if (and (string= func "") (zerop depth))
+                          (format nil "all (~A ~A, 100%)"
+                                  samples-txt (opts-count-name opts))
+                          (let* ((pct (* 100.0 (/ samples
+                                                  (* timemax (opts-factor opts)))))
+                                 (esc (xml-escape (strip-frame-annotation func))))
+                            (format nil "~A (~A ~A, ~,2F%)"
+                                    esc samples-txt (opts-count-name opts) pct))))
+                    (fill (cond
+                            ((string= func "--") "rgb(160,160,160)")
+                            ((string= func "-") "rgb(200,200,200)")
+                            ((node-delta n)
+                             (color-scale (node-delta n) 1
+                                          (opts-negate opts)))
+                            (t
+                             (color (opts-colors opts)
+                                    (opts-hash opts) func)))))
+               ;; <g>
+               (format output "<g class=\"func_g\" tabindex=\"-1\" onmouseover=\"s(this)\" onmouseout=\"c()\" onclick=\"zoom(this)\">~%")
+               (format output "<title>~A</title>~%" info)
+               (emit-rect output x1 y1 x2 y2 fill "rx=\"2\" ry=\"2\"")
+               (let* ((chars (floor (/ (- x2 x1)
+                                       (* fontsize (opts-font-width opts)))))
+                      (text (if (< chars 3)
+                                ""
+                                (let ((stripped (strip-frame-annotation func)))
+                                  (if (< chars (length stripped))
+                                      (concatenate 'string
+                                                   (subseq stripped 0 (max 0 (- chars 2)))
+                                                   "..")
+                                      stripped)))))
+                 (emit-text output "rgb(0,0,0)" (opts-font-type opts) fontsize
+                            (+ x1 3) (+ 3 (floor (+ y1 y2) 2))
+                            (xml-escape text)))
+               (format output "</g>~%"))))
+         nodes)
+
+        (format output "</svg>~%")))))
+
+(defun main (&optional (args (cmdline-args)))
+  (let ((opts (parse-args args)))
+    (unless opts (return-from main nil))
+    (let* ((in-path (opts-input-file opts))
+           (input (if in-path
+                      (open in-path :direction :input :if-does-not-exist :error)
+                      *standard-input*)))
+      (let ((data (read-input input)))
+        (unwind-protect (do-flamegraph opts data *standard-output*)
+          (when in-path (close input)))
+        t))))
+
+;;; Auto-invoke MAIN when loaded as an SBCL script
+;;; (`sbcl --script flamegraph.lisp ARGS`). SBCL consumes the script path
+;;; from argv, leaving *posix-argv* = ("sbcl" "arg1" "arg2" ...), which
+;;; CMDLINE-ARGS reads with (cdr). There is no reliable way to distinguish
+;;; a --script invocation from an interactive LOAD, so we take the simplest
+;;; approach: the final top-level form of this file calls MAIN under
+;;; #+sbcl. Under --script that renders the SVG and exits; on other
+;;; implementations the form is skipped, and the user calls
+;;; (flamegraph:main '(...args...)) explicitly after loading.
+;;; ---------------------------------------------------------------------------
+;;; SIGUSR2 snapshot
+;;;
+;;; When CLASP_FLAME_PROFILE is set at load time, sending SIGUSR2 to the
+;;; process starts a sampling profile on a background thread and writes
+;;; the flame graph SVG to the configured path. Useful for capturing a
+;;; profile of a long-running process without a REPL.
+;;;
+;;; Syntax of CLASP_FLAME_PROFILE:
+;;;   unset / "" / 0 / off / no / false  -> disabled
+;;;   1 / on / yes / true                -> enabled with defaults
+;;;                                         (path=/tmp/clasp-PID.svg,
+;;;                                          duration=10s, rate=97Hz)
+;;;   key=value:key=value:...            -> enabled with overrides
+;;;
+;;; Known keys: path, duration (seconds), rate (Hz). Unknown keys emit
+;;; a warning but do not disable the profiler.
+;;;
+;;; Examples:
+;;;   CLASP_FLAME_PROFILE=1 cando ...
+;;;   CLASP_FLAME_PROFILE=path=/tmp/foo.svg:duration=5:rate=499 cando ...
+;;;   kill -USR2 <pid>
+;;; ---------------------------------------------------------------------------
+
+#+clasp
+(progn
+  (defparameter *snapshot-duration* 10
+    "Seconds of sampling per SIGUSR2 snapshot.")
+  (defparameter *snapshot-rate* 97
+    "Sampling rate (Hz) for SIGUSR2 snapshots.")
+  (defparameter *snapshot-path* nil
+    "SIGUSR2 writes a flame graph SVG to this pathname.")
+
+  (defun parse-flame-profile-env (value)
+    "Parse the value of CLASP_FLAME_PROFILE. Returns three values:
+ENABLED-P, a plist of (:path :duration :rate) overrides, and a list
+of error-message strings describing malformed fields.
+
+Syntax:
+  unset / \"\" / 0 / off / no / false  -> disabled
+  1 / on / yes / true                  -> enabled, all defaults
+  key=value:key=value:...              -> enabled with overrides
+Known keys: path, duration, rate."
+ (when (or (null value) (zerop (length value))) + (return-from parse-flame-profile-env (values nil nil nil))) + (let ((trimmed (string-trim '(#\Space #\Tab) value))) + (cond + ((zerop (length trimmed)) (values nil nil nil)) + ((member trimmed '("0" "off" "no" "false") :test #'string-equal) + (values nil nil nil)) + ((member trimmed '("1" "on" "yes" "true") :test #'string-equal) + (values t nil nil)) + (t + (let ((plist '()) + (errors '())) + (dolist (pair (split-string trimmed #\:)) + (let* ((p (string-trim '(#\Space #\Tab) pair)) + (eq (position #\= p))) + (cond + ((zerop (length p))) + ((null eq) + (push (format nil "expected key=value, got ~S" p) errors)) + (t + (let ((key (string-trim '(#\Space #\Tab) (subseq p 0 eq))) + (val (string-trim '(#\Space #\Tab) (subseq p (1+ eq))))) + (cond + ((string-equal key "path") + (if (zerop (length val)) + (push "path= must not be empty" errors) + (setf (getf plist :path) val))) + ((string-equal key "duration") + (multiple-value-bind (n pos) + (parse-integer val :junk-allowed t) + (if (and n (= pos (length val)) (plusp n)) + (setf (getf plist :duration) n) + (push (format nil "duration must be a positive integer, got ~S" val) + errors)))) + ((string-equal key "rate") + (multiple-value-bind (n pos) + (parse-integer val :junk-allowed t) + (if (and n (= pos (length val)) (plusp n)) + (setf (getf plist :rate) n) + (push (format nil "rate must be a positive integer, got ~S" val) + errors)))) + (t + (push (format nil "unknown key ~S (known: path, duration, rate)" key) + errors)))))))) + (values t plist (nreverse errors))))))) + + (defun snapshot (path &key (duration *snapshot-duration*)) + "Run DURATION seconds of sampling profiling on a background thread +and write the flame graph SVG to PATH. Returns T if a snapshot was +spawned, NIL if the profiler was already running." + (when (ext:profile-running-p) + (format *error-output* "flamegraph: profiler already running~%") + (return-from snapshot nil)) + (mp:process-run-function + "flamegraph-snapshot" + (lambda () + (handler-case + (if (ext:profile-start :rate *snapshot-rate*) + (unwind-protect + (progn + (sleep duration) + (ext:profile-stop) + (with-open-file (out path :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (flamegraph :data (ext:profile-symbolicated-samples) + :output out + :title (format nil "clasp ~A (~Ds)" + (core:getpid) duration))) + (core:chmod path #o664) + (ext:profile-reset) + (format *error-output* "flamegraph: wrote ~A~%" path)) + ;; Make sure the profiler is stopped even on non-local exit. + (when (ext:profile-running-p) (ext:profile-stop))) + (format *error-output* + "flamegraph: profile-start failed (already running?)~%")) + (error (c) + (format *error-output* "flamegraph: snapshot failed: ~A~%" c))))) + t) + + ;; CLASP_FLAME_PROFILE controls SIGUSR2 snapshots. See parse-flame-profile-env + ;; for the full syntax. 
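+  ;; For example:
+  ;;   (parse-flame-profile-env "path=/tmp/run.svg:rate=499")
+  ;;     => T, a plist carrying :path "/tmp/run.svg" and :rate 499, no errors
+  ;;   (parse-flame-profile-env "0") => NIL, NIL, NIL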
+  (multiple-value-bind (enabled overrides errors)
+      (parse-flame-profile-env (ext:getenv "CLASP_FLAME_PROFILE"))
+    (dolist (e errors)
+      (format *error-output* "flamegraph: CLASP_FLAME_PROFILE: ~A~%" e))
+    (when enabled
+      (setf *snapshot-path*
+            (or (getf overrides :path)
+                (format nil "/tmp/clasp-~d.svg" (core:getpid))))
+      (let ((d (getf overrides :duration))) (when d (setf *snapshot-duration* d)))
+      (let ((r (getf overrides :rate))) (when r (setf *snapshot-rate* r)))
+      (let ((dir (directory-namestring *snapshot-path*)))
+        (unless (probe-file dir)
+          (format *error-output*
+                  "flamegraph: warning: output directory ~A does not exist~%" dir)))
+      (defmethod mp:service-interrupt ((i core:sigusr2))
+        (snapshot *snapshot-path*))
+      (format *error-output*
+              "flamegraph: SIGUSR2 will write flame graph to ~A (~Ds @ ~DHz)~%"
+              *snapshot-path* *snapshot-duration* *snapshot-rate*))))
+
+#+sbcl (flamegraph:main)
diff --git a/src/lisp/kernel/lsp/top-hook.lisp b/src/lisp/kernel/lsp/top-hook.lisp
index f9fda6743e..094768e521 100644
--- a/src/lisp/kernel/lsp/top-hook.lisp
+++ b/src/lisp/kernel/lsp/top-hook.lisp
@@ -61,6 +61,7 @@
      (call-initialize-hooks)
      (unwind-protect
           (progn
+            (ext:process-extension-command-line-arguments)
             (process-command-line-load-eval-sequence)
             (if (is-interactive-lisp)
                 (top-level)
diff --git a/src/llvmo/code.cc b/src/llvmo/code.cc
index dacc85f89c..ee8be4a7f0 100644
--- a/src/llvmo/code.cc
+++ b/src/llvmo/code.cc
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 
 namespace llvmo {
 
@@ -529,15 +530,12 @@ CL_DEFUN core::T_sp jit_code_entries() { return ll.result(); }
-CL_DOCSTRING(R"dx(Generate a list of JITted symbols to /tmp/perf-<pid>.map)dx");
+CL_DOCSTRING(R"dx(Append JITted symbols from the GDB JIT descriptor to /tmp/perf-<pid>.map.
+Shares the same file as arena-trampoline entries and the per-compile callback
+from jit_register_symbol — all writes go through perf_map_append.)dx");
 DOCGROUP(clasp);
 CL_DEFUN void ext__generate_perf_map() {
-  stringstream ss;
-  ss << "/tmp/perf-" << getpid() << ".map";
-  core::clasp_write_string(fmt::format("Writing to {}\n", ss.str()));
-  FILE* fout = fopen(ss.str().c_str(), "w");
   jit_code_entry* jce = __jit_debug_descriptor.first_entry;
-  ql::list ll;
   while (jce) {
     const char* of_start = jce->symfile_addr;
     size_t of_length = jce->symfile_size;
@@ -551,11 +549,12 @@
     llvm::Expected<std::unique_ptr<llvm::object::ObjectFile>> obj = llvm::object::ObjectFile::createObjectFile(memr);
     for (auto sym : (*obj)->symbols()) {
       if ((*sym.getAddress()) != 0)
-        fprintf(fout, "%lX %lX %s\n", (uintptr_t)(*sym.getAddress()), (size_t)sym.getCommonSize(), (sym).getName()->str().c_str());
+        perf_map_append((uint8_t*)(uintptr_t)(*sym.getAddress()),
+                        (size_t)sym.getCommonSize(),
+                        (sym).getName()->str());
     }
     jce = jce->next_entry;
   }
-  fclose(fout);
 }
 
 CL_LISPIFY_NAME(describe_code);
diff --git a/src/llvmo/cscript.lisp b/src/llvmo/cscript.lisp
index ebcf9abee5..f819f450f1 100644
--- a/src/llvmo/cscript.lisp
+++ b/src/llvmo/cscript.lisp
@@ -10,5 +10,7 @@
   #~"llvmoExpose.cc"
   #~"code.cc"
   #~"llvmoPackage.cc"
+  #~"trampoline_arena.cc"
+  #~"trampolineWork.cc"
   #~"runtimeJit.cc"
   #~"clbindLlvmExpose.cc")
diff --git a/src/llvmo/llvmoPackage.cc b/src/llvmo/llvmoPackage.cc
index 589d77da60..30fb17a938 100644
--- a/src/llvmo/llvmoPackage.cc
+++ b/src/llvmo/llvmoPackage.cc
@@ -33,6 +33,7 @@ THE SOFTWARE.
#include #include "llvm/Support/InitLLVM.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/MemoryBuffer.h" #if 0 #undef NDEBUG #include "llvm/Support/Debug.h" @@ -57,7 +58,6 @@ THE SOFTWARE. #include #include #include -#include #include #include #include @@ -93,16 +93,6 @@ SYMBOL_SHADOW_EXPORT_SC_(LlvmoPkg, or); void redirect_llvm_interface_addSymbol() { // llvm_interface::addSymbol = &addSymbolAsGlobal; } - -JITDylib_sp loadModule(llvmo::Module_sp module, size_t startupID, const std::string& libname) { - ClaspJIT_sp jit = llvm_sys__clasp_jit(); - JITDylib_sp jitDylib = jit->createAndRegisterJITDylib(libname); - // printf("%s:%d:%s jit = %p jitDylib = %p\n", __FILE__, __LINE__, __FUNCTION__, jit.raw_(), jitDylib.raw_() ); - ThreadSafeContext_sp tsc = gc::As(comp::_sym_STARthread_safe_contextSTAR->symbolValue()); - jit->addIRModule(jitDylib, module, tsc, startupID); - return jitDylib; -} - DOCGROUP(clasp); CL_DEFUN core::SimpleBaseString_sp llvm_sys__mangleSymbolName(core::String_sp name) { ASSERT(cl__stringp(name)); @@ -552,149 +542,9 @@ void LlvmoExposer_O::expose(core::LispPtr lisp, core::Exposer_O::WhatToExpose wh } } -/*! - Install a trampoline that spills registers onto the stack - - The bytecode trampoline passes a PC. -*/ - -}; // namespace llvmo - -extern "C" { -NEVER_OPTIMIZE -gctools::return_type default_bytecode_trampoline(unsigned char* pc, core::T_O* closure, uint64_t nargs, core::T_O** args) { - return bytecode_call(pc, closure, nargs, args); -} - -NEVER_OPTIMIZE -gctools::return_type unknown_bytecode_trampoline(unsigned char* pc, core::T_O* closure, uint64_t nargs, core::T_O** args) { - return bytecode_call(pc, closure, nargs, args); -} - -NEVER_OPTIMIZE -gctools::return_type lambda_nil(unsigned char* pc, core::T_O* closure, uint64_t nargs, core::T_O** args) { - return bytecode_call(pc, closure, nargs, args); -} -}; - -namespace llvmo { -#include - -std::atomic global_trampoline_counter; -#ifdef CLASP_THREADS -mp::Mutex* global_trampoline_mutex = NULL; -#endif - -string escapeNameForLlvm(const string& inp) { - stringstream sout; - stringstream sin(inp); - char c; - while (1) { - sin.get(c); - if (!sin.good()) - break; - switch (c) { - case '"': - sout << "_"; - break; - default: - sout << c; - } - }; - return sout.str(); -} - -CL_DEFUN core::Pointer_mv cmp__compile_trampoline(core::T_sp tname) { - if (!global_options->_GenerateTrampolines - && !getenv("CLASP_ENABLE_TRAMPOLINES")) { - // If trampolines aren't enabled, don't compile one. 
- return Values(Pointer_O::create((void*)bytecode_call), SimpleBaseString_O::make("bytecode_call")); - } - - // FIXME: race - if (global_trampoline_mutex == NULL) { - global_trampoline_mutex = new mp::Mutex(DISSASSM_NAMEWORD); - } - WITH_READ_WRITE_LOCK(*global_trampoline_mutex); - ClaspJIT_sp jit = llvm_sys__clasp_jit(); - if (jit.nilp()) { - // If the JIT isn't ready then use the default trampoline - return Values(Pointer_O::create((void*)default_bytecode_trampoline), SimpleBaseString_O::make("default_bytecode_trampoline")); - } - if (tname.consp() && CONS_CAR(tname) == ::cl::_sym_lambda && CONS_CDR(tname).consp() && CONS_CAR(CONS_CDR(tname)).nilp()) { - return Values(Pointer_O::create((void*)lambda_nil), SimpleBaseString_O::make("lambda_nil")); - } - - std::string name; - if (gc::IsA(tname)) { - name = gc::As_unsafe(tname)->fullName(); - } else { - name = _rep_(tname); - // printf("%s:%d:%s trampoline name = |%s|\n", __FILE__, __LINE__, __FUNCTION__, name.c_str()); - // fflush(); - if (name[0] == '"' && name[name.size() - 1] == '"') { - if (name.size() < 3) { // matches "" - return Values(Pointer_O::create((void*)unknown_bytecode_trampoline), - SimpleBaseString_O::make("unknown_bytecode_trampoline")); - } - } - name = name.substr(1, name.size() - 2); // Strip double quotes - } - name = escapeNameForLlvm(name) + "_bct" + std::to_string(global_trampoline_counter++); -#if LLVM_VERSION_MAJOR < 21 - LLVMContext_sp context = llvm_sys__thread_local_llvm_context(); - std::string trampoline = core::searchAndReplaceString(global_trampoline, "wrapper:name", name); - Module_sp module = llvm_sys__parseIRString(trampoline, context, "backtrace_trampoline"); - JITDylib_sp jitDylib = loadModule(module, 0, "trampoline"); - core::Pointer_sp bytecode_ptr = jit->lookup(jitDylib, name); - return Values(bytecode_ptr, SimpleBaseString_O::make(name)); -#else - llvm::orc::ThreadSafeContext* tsc = ((llvm::orc::ThreadSafeContext*)gc::As(comp::_sym_STARthread_safe_contextSTAR->symbolValue())->externalObject()); - return tsc->withContextDo([&](llvm::LLVMContext *lc) { - auto context = gctools::GC::allocate(); - context->_ptr = lc; - std::string trampoline = core::searchAndReplaceString(global_trampoline, "wrapper:name", name); - Module_sp module = llvm_sys__parseIRString(trampoline, context, "backtrace_trampoline"); - JITDylib_sp jitDylib = loadModule(module, 0, "trampoline"); - core::Pointer_sp bytecode_ptr = jit->lookup(jitDylib, name); - return Values(bytecode_ptr, SimpleBaseString_O::make(name)); - }); -#endif -} }; // namespace llvmo -namespace llvmo { - -// Compile callbacks for FFI. -CL_DEFUN JITDylib_sp jit_module_to_dylib(Module_sp module, const std::string& libname) { return loadModule(module, 0, libname); } - -CL_DEFUN core::Pointer_sp jit_lookup(JITDylib_sp dylib, const std::string& name) { - return llvm_sys__clasp_jit()->lookup(dylib, name); -} -// Access a T_sp in a variable. 
-CL_DEFUN core::T_sp jit_lookup_t(JITDylib_sp dylib, const std::string& name) { - void* ptr; - bool found = llvm_sys__clasp_jit()->do_lookup(dylib, name, ptr); - if (!found) - SIMPLE_ERROR("Could not find pointer for name |{}|", name); - core::T_O** tptr = (core::T_O**)ptr; - T_sp ret((gctools::Tagged)(*tptr)); - return ret; -} - -CL_LISPIFY_NAME("llvmo:jit-lookup-t"); -CL_DEFUN_SETF core::T_sp setf_jit_lookup_t(core::T_sp value, JITDylib_sp dylib, const std::string& name) { - void* ptr; - bool found = llvm_sys__clasp_jit()->do_lookup(dylib, name, ptr); - if (!found) - SIMPLE_ERROR("Could not find pointer for name |{}|", name); - core::T_O** tptr = (core::T_O**)ptr; - *tptr = value.raw_(); - return value; -} - -}; // namespace llvmo #ifdef USE_PRECISE_GC // diff --git a/src/llvmo/runtimeJit.cc b/src/llvmo/runtimeJit.cc index 5b5f538f4c..c27076de59 100644 --- a/src/llvmo/runtimeJit.cc +++ b/src/llvmo/runtimeJit.cc @@ -723,9 +723,21 @@ CL_DEFMETHOD core::T_sp ClaspJIT_O::lookup_all_dylibs(const std::string& name) { return nil(); } +// Set by arena-init code in llvmoPackage.cc around the shared-trampoline and +// stub-template captures. When true, prepareObjectFileForMaterialization marks +// the new ObjectFile as transient so the snapshot walker skips it. The +// ObjectFile is still registered in _AllObjectFiles normally — LLVM's link +// layer plugin looks up the ObjectFile by name via lookupObjectFile() during +// link, so it must remain reachable/findable at runtime; only the snapshot +// save path needs to ignore it. +thread_local bool t_mark_transient_snapshot = false; + ObjectFile_sp prepareObjectFileForMaterialization(JITDylib_sp dylib, const std::string& uniqueObjectFileName, size_t objectId) { core::SimpleBaseString_sp sbs = core::SimpleBaseString_O::make(uniqueObjectFileName); ObjectFile_sp codeObject = gc::GC::allocate(sbs, unbound(), dylib, objectId); + if (t_mark_transient_snapshot) { + codeObject->_TransientSkipSnapshot = true; + } registerObjectFile(codeObject); return codeObject; }; diff --git a/src/llvmo/trampolineWork.cc b/src/llvmo/trampolineWork.cc new file mode 100644 index 0000000000..81e94272ae --- /dev/null +++ b/src/llvmo/trampolineWork.cc @@ -0,0 +1,866 @@ +/* + * trampolineWork.cc — trampoline code + * + */ + +#ifdef __APPLE__ +#pragma message("trampolineWork.cc: trampoline arena is not ported to macOS — " \ + "ensure_trampoline_arena_initialized will fail at runtime and " \ + "callers will fall back to default_bytecode_trampoline. See the " \ + "port-plan comment above ensure_trampoline_arena_initialized.") +#endif + +#include +#include +#include +#include +#include +#include +#include +#include // bytecode_call (call target of every trampoline) +#include // Values() multi-value ctor +#include // _lisp root, used by the post-load regen pass +#include // comp::_sym_STARthread_safe_contextSTAR +#include +#include "llvm/IR/DebugInfo.h" // StripDebugInfo (trampoline shrink) +#include "llvm/Object/ObjectFile.h" // for CFI extraction +#include "llvm/Object/SymbolSize.h" // computeSymbolSizes (trampoline sizing) +#include // arena-mode trampolines +#include // ClaspJIT_O full def (member access on ClaspJIT_sp) +#include // Module_sp, LLVMContext_sp +#include // ObjectFile_O, lookupObjectFileFromEntryPoint +#include "clasp/llvmo/trampolineWork.h" + +// Bring core:: (Pointer_O, SimpleBaseString_O, T_sp, etc.) into scope for +// unqualified use inside `namespace llvmo`. 
+// The pre-refactor llvmoPackage.cc
+// did the same — most of the trampoline code was written against core:: names
+// without qualification.
+using namespace core;
+
+extern "C" {
+NEVER_OPTIMIZE
+gctools::return_type default_bytecode_trampoline(unsigned char* pc, core::T_O* closure, uint64_t nargs, core::T_O** args) {
+  return bytecode_call(pc, closure, nargs, args);
+}
+
+NEVER_OPTIMIZE
+gctools::return_type unknown_bytecode_trampoline(unsigned char* pc, core::T_O* closure, uint64_t nargs, core::T_O** args) {
+  return bytecode_call(pc, closure, nargs, args);
+}
+
+NEVER_OPTIMIZE
+gctools::return_type lambda_nil(unsigned char* pc, core::T_O* closure, uint64_t nargs, core::T_O** args) {
+  return bytecode_call(pc, closure, nargs, args);
+}
+};
+
+namespace llvmo {
+#include
+
+std::atomic<size_t> global_trampoline_counter;
+
+JITDylib_sp loadModule(llvmo::Module_sp module, size_t startupID, const std::string& libname) {
+  ClaspJIT_sp jit = llvm_sys__clasp_jit();
+  JITDylib_sp jitDylib = jit->createAndRegisterJITDylib(libname);
+  // printf("%s:%d:%s jit = %p jitDylib = %p\n", __FILE__, __LINE__, __FUNCTION__, jit.raw_(), jitDylib.raw_() );
+  ThreadSafeContext_sp tsc = gc::As<ThreadSafeContext_sp>(comp::_sym_STARthread_safe_contextSTAR->symbolValue());
+  jit->addIRModule(jitDylib, module, tsc, startupID);
+  return jitDylib;
+}
+
+// Shared JITDylib that owns the arena-init trampoline template module.
+// Created lazily on the first trampoline compile. This dylib is not
+// registered with _lisp->_Roots._JITDylibs (it holds only the template,
+// whose ObjectFile is marked _TransientSkipSnapshot and isn't serialized).
+JITDylib_sp global_trampoline_dylib;
+
+// Add an IR module to an already-existing JITDylib. Mirrors loadModule()
+// but skips the per-call createAndRegisterJITDylib (which would push a
+// new entry onto _Roots._JITDylibs every time).
+static void addModuleToDylib(JITDylib_sp jitDylib, llvmo::Module_sp module, size_t startupID) {
+  ClaspJIT_sp jit = llvm_sys__clasp_jit();
+  ThreadSafeContext_sp tsc = gc::As<ThreadSafeContext_sp>(comp::_sym_STARthread_safe_contextSTAR->symbolValue());
+  jit->addIRModule(jitDylib, module, tsc, startupID);
+}
+
+string escapeNameForLlvm(const string& inp) {
+  stringstream sout;
+  stringstream sin(inp);
+  char c;
+  while (1) {
+    sin.get(c);
+    if (!sin.good())
+      break;
+    switch (c) {
+    case '"':
+      sout << "_";
+      break;
+    default:
+      sout << c;
+    }
+  };
+  return sout.str();
+}
+
+// Trampoline arena initialization.
+//
+// At first use we build a small IR module for the trampoline template: a
+// function that calls a pointer-typed value created from an inttoptr of
+// bytecode_call's address (so the address is encoded as an absolute 64-bit
+// literal, not a symbol relocation). Compile via the JIT, look up the
+// resulting bytes, capture them. Hand them to arena_install_trampoline_template().
+//
+// From then on, each cmp__compile_trampoline call routed to the arena
+// backend memcpys the template bytes into a fresh slot — no patching,
+// no per-arch code, no per-slot indirection.
+enum class TrampolineKind;
+static bool ensure_trampoline_arena_initialized(ClaspJIT_sp jit, TrampolineKind kind);
+
+namespace {
+
+// Extract the leading "target datalayout = ..." and "target triple = ..."
+// lines from the existing trampoline IR template, so the stub IR uses the
+// same architecture-correct strings without us having to know them at
+// compile time.
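+// For illustration only — on one x86_64 Linux build the two extracted lines
+// would look like the following (the exact strings vary by LLVM version and
+// build target; these values are examples, not a guarantee):
+//
+//   target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+//   target triple = "x86_64-unknown-linux-gnu"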
+static std::string extract_target_lines(const std::string& source) { + std::string out; + std::string::size_type pos = 0; + for (int line_count = 0; line_count < 50 && pos < source.size(); ++line_count) { + auto eol = source.find('\n', pos); + if (eol == std::string::npos) break; + std::string line = source.substr(pos, eol - pos); + pos = eol + 1; + if (line.find("target datalayout") != std::string::npos + || line.find("target triple") != std::string::npos) { + out += line; + out += '\n'; + } + } + return out; +} + +} // anonymous namespace + +// Declared in runtimeJit.cc. When true, prepareObjectFileForMaterialization +// marks the created ObjectFile_O's _TransientSkipSnapshot flag. We set it +// while compiling the trampoline template so the snapshot save walker +// excludes that scaffolding ObjectFile. It is still registered in +// _AllObjectFiles (LLVM's link layer needs that), but is filtered out at +// snapshot save time. +extern thread_local bool t_mark_transient_snapshot; + +// Defined in src/core/funcallableInstance.cc — pointer to the C++ static +// inline GFBytecodeEntryPoint::entry_point_n (which we can't take the +// address of from this TU since the struct is local to that file). +extern "C" gctools::return_type (*g_gf_dispatch_entry_point_n)(core::T_O*, size_t, core::T_O**); + +// --------------------------------------------------------------------------- +// Unified trampoline-template machinery. The bytecode and GF trampolines have +// the same structure (alloca + volatile arg-save + movabs/call + epilogue) +// and only differ in (a) signature arity (4 args vs 3) and (b) target +// function address (bytecode_call vs entry_point_n). One IR builder + one +// capture + one ensure-initialized parameterized by TrampolineKind. +// --------------------------------------------------------------------------- + +enum class TrampolineKind { Bytecode, GF }; + +namespace { + +static const char* kind_label(TrampolineKind k) { + return k == TrampolineKind::Bytecode ? "bytecode" : "GF"; +} + +static std::string build_trampoline_ir(uint64_t target_addr, + const std::string& tramp_name, + TrampolineKind kind) { + std::string targets = extract_target_lines(global_trampoline); + char addr_buf[32]; + snprintf(addr_buf, sizeof addr_buf, "%lu", (unsigned long)target_addr); + // Per-kind IR fragments. Bytecode: (i64 pc, ptr closure, i64 nargs, ptr args) + // — closure at %1, nargs at %2, args at %3. GF: (ptr closure, i64 nargs, + // ptr args) — closure at %0, nargs at %1, args at %2. + const bool gf = (kind == TrampolineKind::GF); + const char* sig_params = gf ? "ptr %0, i64 %1, ptr %2" + : "i64 %0, ptr %1, i64 %2, ptr %3"; + const char* sig_fn_type = gf ? "(ptr, i64, ptr)" + : "(i64, ptr, i64, ptr)"; + const char* call_args = gf ? "ptr %0, i64 %1, ptr %2" + : "i64 %0, ptr %1, i64 %2, ptr %3"; + const char* closure_param = gf ? "%0" : "%1"; + const char* nargs_param = gf ? "%1" : "%2"; + const char* args_param = gf ? "%2" : "%3"; + + std::string ir; + ir.reserve(2048); + ir += targets; + ir += "\n"; + // Mirror the legacy trampoline IR's literals stub. clasp's + // ObjectFile-on-load machinery expects every JIT'd module to define it. + ir += "@__clasp_literals_trampoline_stub = internal local_unnamed_addr global [0 x ptr] zeroinitializer, align 8\n\n"; + // No personality. The trampoline is a pure passthrough with no landing + // pads; the C++ unwinder only needs CFI to step through this frame, not + // a personality callback. 
Keeping personality off means LLVM emits a + // plain "zR" CIE whose only aug-data byte is the FDE encoding, with no + // pcrel-encoded personality pointer — so the CIE bytes are fully + // position-independent and can be memcpy'd verbatim into every arena + // slot. + // + // Direct inttoptr-in-call: forces LLVM to materialize the address as a + // 64-bit immediate (movabs on x86_64; MOVZ+MOVK*3 on arm64). The bytes + // generated are byte-identical for every compile (the immediate is the + // same value across all slots), so memcpy alone is enough — no per-slot + // patching, no RIP-relative offsets to fix up. + ir += "define { ptr, i64 } @\""; ir += tramp_name; ir += "\"("; + ir += sig_params; + ir += ") #0 {\n"; + // Save (closure, nargs, args) to a stack-allocated 3-slot array so a + // debugger / backtrace can recover them from this frame. volatile prevents + // LLVM from optimizing the stores away. Identical for both kinds — only + // the source registers differ. + ir += " %saved = alloca [3 x i64], align 8\n"; + ir += " %p0 = getelementptr inbounds [3 x i64], ptr %saved, i64 0, i64 0\n"; + ir += " %p1 = getelementptr inbounds [3 x i64], ptr %saved, i64 0, i64 1\n"; + ir += " %p2 = getelementptr inbounds [3 x i64], ptr %saved, i64 0, i64 2\n"; + ir += " %closure_int = ptrtoint ptr "; ir += closure_param; ir += " to i64\n"; + ir += " %args_int = ptrtoint ptr "; ir += args_param; ir += " to i64\n"; + ir += " store volatile i64 %closure_int, ptr %p0, align 8\n"; + ir += " store volatile i64 "; ir += nargs_param; ir += ", ptr %p1, align 8\n"; + ir += " store volatile i64 %args_int, ptr %p2, align 8\n"; + ir += " %fp = inttoptr i64 "; ir += addr_buf; ir += " to ptr\n"; + ir += " %r = call { ptr, i64 } "; ir += sig_fn_type; + ir += " %fp("; ir += call_args; ir += ")\n"; + ir += " ret { ptr, i64 } %r\n"; + ir += "}\n"; + // No _end marker: on x86_64 Linux LLVM's target defaults force unwind + // tables on ALL functions regardless of attribute, so any second + // function in this module would produce an extra .eh_frame FDE and + // break the "exactly one FDE" slot-layout invariant. Instead the + // caller queries the trampoline symbol's size from the linked + // ObjectFile (computeSymbolSizes), which also avoids the fragile + // `end - start` address arithmetic. + // + // uwtable: emit unwind tables so Lisp Unwind exceptions can propagate. + // frame-pointer=all: keep the frame pointer chain intact for backtrace. + ir += "attributes #0 = { uwtable \"frame-pointer\"=\"all\" }\n"; + return ir; +} + +// Walk an x86_64-ELF .eh_frame section and extract the single CIE and the +// single FDE as verbatim byte blobs. Relies on the trampoline IR having +// exactly one uwtable function (the template), with the _end marker +// attributed as no-uwtable — so LLVM emits one CIE + one FDE. Any other +// count (zero, or two+ of either) is a sign that something upstream +// changed and silent copy-of-wrong-bytes is about to happen: we fail loud. +// +// Structural bytes (length, CIE pointer, aug data, PC range, CFI opcodes) +// need no relocations, so what's in the unlinked ObjectFile is already +// correct. The only field that requires a relocation is the FDE's PC +// begin; its pre-relocation value is typically zero, and the caller +// patches it for the slot layout. 
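+// For orientation, the fixed framing the loop below parses (little-endian
+// 32-bit words; everything after the id word differs per entry kind):
+//
+//   offset 0: length — byte count of the rest of the entry; 0 terminates
+//             the section, and 0xffffffff would introduce a 64-bit extended
+//             length (rejected below)
+//   offset 4: id — 0 marks a CIE; any nonzero value marks an FDE (the
+//             value is the FDE's back-reference to its CIE)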
+static bool extract_cie_fde_from_object_file(llvm::object::ObjectFile& obj,
+                                             std::vector<uint8_t>& out_cie,
+                                             std::vector<uint8_t>& out_fde) {
+  for (const llvm::object::SectionRef& sect : obj.sections()) {
+    auto nameOrErr = sect.getName();
+    if (!nameOrErr) { llvm::consumeError(nameOrErr.takeError()); continue; }
+    if (*nameOrErr != ".eh_frame") continue;
+    auto contentsOrErr = sect.getContents();
+    if (!contentsOrErr) { llvm::consumeError(contentsOrErr.takeError()); continue; }
+    llvm::StringRef contents = *contentsOrErr;
+    const uint8_t* p = (const uint8_t*)contents.data();
+    const uint8_t* end = p + contents.size();
+    const uint8_t* cie_entry = nullptr; size_t cie_entry_size = 0;
+    const uint8_t* fde_entry = nullptr; size_t fde_entry_size = 0;
+    size_t cie_count = 0, fde_count = 0;
+    while (p + 4 <= end) {
+      uint32_t length;
+      std::memcpy(&length, p, 4);
+      if (length == 0) break; // CIE/FDE terminator
+      if (length == 0xffffffffu) {
+        fprintf(stderr, "[trampoline-arena] .eh_frame: unexpected 64-bit extended length\n");
+        return false;
+      }
+      const uint8_t* entry = p;
+      size_t entry_size = 4 + length;
+      if (entry + entry_size > end) {
+        fprintf(stderr, "[trampoline-arena] .eh_frame: entry runs past section end\n");
+        return false;
+      }
+      uint32_t id;
+      std::memcpy(&id, p + 4, 4);
+      if (id == 0) {
+        ++cie_count;
+        cie_entry = entry; cie_entry_size = entry_size;
+      } else {
+        ++fde_count;
+        fde_entry = entry; fde_entry_size = entry_size;
+      }
+      p += entry_size;
+    }
+    if (cie_count != 1 || fde_count != 1) {
+      fprintf(stderr,
+              "[trampoline-arena] .eh_frame: expected exactly 1 CIE + 1 FDE, "
+              "got %zu CIE(s) + %zu FDE(s). Trampoline IR may have grown "
+              "extra uwtable functions — re-check build_trampoline_ir.\n",
+              cie_count, fde_count);
+      return false;
+    }
+    out_cie.assign(cie_entry, cie_entry + cie_entry_size);
+    out_fde.assign(fde_entry, fde_entry + fde_entry_size);
+    return true;
+  }
+  fprintf(stderr, "[trampoline-arena] .eh_frame section not found in ObjectFile\n");
+  return false;
+}
+
+// Verify that the CIE we extracted matches our slot-layout assumptions:
+//  - version 1
+//  - augmentation string exactly "zR" (no personality / no LSDA — either
+//    would add pcrel-encoded pointers that break the byte-identical slot
+//    copy)
+//  - FDE encoding byte == DW_EH_PE_pcrel | DW_EH_PE_sdata4 (0x1B), so
+//    the FDE's PC begin field is a 4-byte signed distance (constant in
+//    this layout)
+// Returns true if all invariants hold; prints a diagnostic and returns
+// false otherwise.
+static bool validate_trampoline_cie(const uint8_t* cie, size_t n) {
+  // Minimum for "zR" CIE: length(4)+id(4)+ver(1)+"zR\0"(3)+ca(1)+da(1)
+  // +rr(1)+aug_len(1)+fde_enc(1) = 17 bytes. Anything shorter is broken.
+  if (n < 17) {
+    fprintf(stderr, "[trampoline-arena] CIE too small: %zu bytes\n", n);
+    return false;
+  }
+  if (cie[8] != 1) {
+    fprintf(stderr, "[trampoline-arena] CIE version = %u (expected 1)\n", cie[8]);
+    return false;
+  }
+  // Aug string is null-terminated starting at offset 9.
+  size_t pos = 9;
+  size_t aug_start = pos;
+  while (pos < n && cie[pos] != 0) ++pos;
+  if (pos >= n) {
+    fprintf(stderr, "[trampoline-arena] CIE: aug string not null-terminated\n");
+    return false;
+  }
+  std::string aug((const char*)cie + aug_start, pos - aug_start);
+  ++pos; // skip null terminator
+  if (aug != "zR") {
+    fprintf(stderr,
+            "[trampoline-arena] CIE aug string = \"%s\" (expected \"zR\"). "
+            "Trampoline IR likely added personality/LSDA — byte-identical "
+            "slot copy would embed pcrel-encoded pointers pointing to the "
+            "wrong address. Fix: keep build_trampoline_ir free of "
+            "personality attributes.\n", aug.c_str());
+    return false;
+  }
+  // Skip code_align (ULEB128), data_align (SLEB128), return register (1B).
+  // ULEB128/SLEB128 continuation bit = high bit set.
+  auto skip_leb = [&]() -> bool {
+    while (pos < n && (cie[pos] & 0x80)) ++pos;
+    if (pos >= n) return false;
+    ++pos; // final byte
+    return true;
+  };
+  if (!skip_leb() || !skip_leb()) {
+    fprintf(stderr, "[trampoline-arena] CIE: malformed code/data align\n");
+    return false;
+  }
+  if (pos >= n) return false;
+  ++pos; // return register (1 byte in eh_frame)
+  // Aug length (ULEB128). For "zR", value must be 1 (one FDE-encoding byte).
+  if (pos >= n || (cie[pos] & 0x80)) {
+    fprintf(stderr, "[trampoline-arena] CIE: unexpected multi-byte aug_length\n");
+    return false;
+  }
+  uint8_t aug_len = cie[pos++];
+  if (aug_len != 1) {
+    fprintf(stderr,
+            "[trampoline-arena] CIE aug_length = %u (expected 1 for \"zR\")\n",
+            aug_len);
+    return false;
+  }
+  if (pos >= n) return false;
+  uint8_t fde_enc = cie[pos];
+  // DW_EH_PE_pcrel = 0x10, DW_EH_PE_sdata4 = 0x0B. Combined = 0x1B.
+  constexpr uint8_t kExpectedFdeEncoding = 0x1B;
+  if (fde_enc != kExpectedFdeEncoding) {
+    fprintf(stderr,
+            "[trampoline-arena] CIE FDE encoding = 0x%02x (expected 0x%02x = "
+            "pcrel|sdata4). LLVM may have switched .eh_frame defaults — the "
+            "PC-begin patch in ensure_trampoline_arena_initialized assumes "
+            "sdata4 and would corrupt slot unwind info. Audit LLVM's "
+            "TargetLoweringObjectFile settings for this triple.\n",
+            fde_enc, kExpectedFdeEncoding);
+    return false;
+  }
+  return true;
+}
+
+static bool capture_trampoline_template(ClaspJIT_sp jit,
+                                        uint64_t target_addr,
+                                        TrampolineKind kind,
+                                        std::vector<uint8_t>& out_bytes,
+                                        std::vector<uint8_t>& out_cie,
+                                        std::vector<uint8_t>& out_fde) {
+  std::string tramp_name = (kind == TrampolineKind::GF)
+                               ? "__gf_trampoline_template"
+                               : "__bytecode_trampoline_template";
+  std::string ir = build_trampoline_ir(target_addr, tramp_name, kind);
+  fprintf(stderr, "[trampoline-arena] %s trampoline IR (%zu bytes):\n%s\n",
+          kind_label(kind), ir.size(), ir.c_str());
+  fflush(stderr);
+
+  Module_sp module;
+#if LLVM_VERSION_MAJOR < 21
+  {
+    LLVMContext_sp context = llvm_sys__thread_local_llvm_context();
+    module = llvm_sys__parseIRString(ir, context, tramp_name.c_str());
+  }
+#else
+  llvm::orc::ThreadSafeContext* tsc =
+      ((llvm::orc::ThreadSafeContext*)gc::As<ThreadSafeContext_sp>(
+           comp::_sym_STARthread_safe_contextSTAR->symbolValue())->externalObject());
+  tsc->withContextDo([&](llvm::LLVMContext* lc) {
+    auto context = gctools::GC<LLVMContext_O>::allocate();
+    context->_ptr = lc;
+    module = llvm_sys__parseIRString(ir, context, tramp_name.c_str());
+  });
+#endif
+  if (module.nilp() || !module->wrappedPtr()) {
+    fprintf(stderr, "[trampoline-arena] failed to parse %s trampoline IR\n", kind_label(kind));
+    return false;
+  }
+  llvm::StripDebugInfo(*module->wrappedPtr());
+
+  if (!global_trampoline_dylib) {
+    global_trampoline_dylib = jit->createAndRegisterJITDylib("trampoline");
+  }
+  // Use a large startupID so the ObjectFile codeName won't collide with
+  // user-compiled trampolines (whose IDs come from global_trampoline_counter
+  // starting near zero). Bytecode template uses 1M+, GF template uses 2M+.
+  static std::atomic<size_t> bytecode_id_counter{1000000};
+  static std::atomic<size_t> gf_id_counter{2000000};
+  size_t tramp_id = (kind == TrampolineKind::GF
+                         ? gf_id_counter
+                         : bytecode_id_counter).fetch_add(1);
+  addModuleToDylib(global_trampoline_dylib, module, tramp_id);
+
+  void* start = nullptr;
+  if (!jit->do_lookup(global_trampoline_dylib, tramp_name, start)) {
+    fprintf(stderr, "[trampoline-arena] %s template lookup '%s' failed\n",
+            kind_label(kind), tramp_name.c_str());
+    return false;
+  }
+
+  // Find the ObjectFile that owns this trampoline's code, and extract the
+  // CIE and FDE bytes from its .eh_frame section. The arena slot's
+  // [code | CIE | FDE | terminator] layout copies these bytes verbatim.
+  ObjectFile_sp of;
+  if (!lookupObjectFileFromEntryPoint((uintptr_t)start, of)) {
+    fprintf(stderr, "[trampoline-arena] %s: ObjectFile lookup for %p failed\n",
+            kind_label(kind), start);
+    return false;
+  }
+  auto expected = of->getObjectFile();
+  if (!expected) {
+    llvm::consumeError(expected.takeError());
+    fprintf(stderr, "[trampoline-arena] %s: getObjectFile() failed\n", kind_label(kind));
+    return false;
+  }
+  std::unique_ptr<llvm::object::ObjectFile> obj = std::move(*expected);
+
+  // Resolve the trampoline's size from the ELF symbol table instead of
+  // using an `_end` marker. LLVM's x86_64-Linux target forces unwind
+  // tables on every function regardless of attribute, so a second
+  // function would inevitably produce a second FDE and break the 1-FDE
+  // invariant. The module has exactly one defined function (the
+  // trampoline) plus a few data-type symbols (the literals-stub global),
+  // so picking the unique ST_Function with nonzero size is unambiguous
+  // and robust to name mangling or linker-private prefixes.
+  size_t sz = 0;
+  auto sizes = llvm::object::computeSymbolSizes(*obj);
+  for (auto& p : sizes) {
+    auto typeOrErr = p.first.getType();
+    if (!typeOrErr) { llvm::consumeError(typeOrErr.takeError()); continue; }
+    if (*typeOrErr != llvm::object::SymbolRef::ST_Function) continue;
+    if (p.second == 0) continue;
+    if (sz != 0) {
+      fprintf(stderr,
+              "[trampoline-arena] %s: more than one defined function in "
+              "module — symbol-size disambiguation is unsafe\n",
+              kind_label(kind));
+      return false;
+    }
+    sz = (size_t)p.second;
+  }
+  if (sz == 0) {
+    fprintf(stderr,
+            "[trampoline-arena] %s: no function symbol with nonzero size in module. "
+            "Symbols visible to computeSymbolSizes:\n", kind_label(kind));
+    for (auto& p : sizes) {
+      auto nameOrErr = p.first.getName();
+      std::string n = "";
+      if (nameOrErr) n = nameOrErr->str();
+      else llvm::consumeError(nameOrErr.takeError());
+      auto typeOrErr = p.first.getType();
+      int t = -1;
+      if (typeOrErr) t = (int)*typeOrErr;
+      else llvm::consumeError(typeOrErr.takeError());
+      fprintf(stderr, "  name=%s type=%d size=%llu\n",
+              n.c_str(), t, (unsigned long long)p.second);
+    }
+    return false;
+  }
+  out_bytes.assign((uint8_t*)start, (uint8_t*)start + sz);
+
+  if (!extract_cie_fde_from_object_file(*obj, out_cie, out_fde)) {
+    fprintf(stderr, "[trampoline-arena] %s: .eh_frame CIE/FDE extraction failed\n",
+            kind_label(kind));
+    return false;
+  }
+  return true;
+}
+
+} // anonymous namespace
+
+// ===========================================================================
+// macOS port plan
+// ===========================================================================
+//
+// The interleaved-slot trampoline arena is currently x86_64-Linux-ELF only.
+// On macOS, `ensure_trampoline_arena_initialized` short-circuits to the
+// "init failed" path below; callers fall back to default_bytecode_trampoline
+// (and g_gf_dispatch_entry_point_n for GF dispatch). Flame charts and
+// perf-PID.map on macOS therefore show generic entry-point names instead of
+// per-Lisp-function names until this port is completed.
+//
+// What needs to change, in increasing order of work:
+//
+// (1) SECTION NAME — extract_cie_fde_from_object_file searches for
+//     ".eh_frame". MachO stores the same data in segment __TEXT section
+//     __eh_frame. Try both names (".eh_frame" on ELF, "__eh_frame" on MachO).
+//     ~10 LoC.
+//
+// (2) libgcc → libunwind — Darwin uses LLVM's libunwind, whose
+//     __register_frame takes a *single FDE* pointer, not an eh_frame range.
+//     Register `slot + tramp_size + cie_size` (FDE start) instead of
+//     `slot + tramp_size` (CIE start); the 4-byte zero terminator at the
+//     end of each slot becomes unused padding but is otherwise harmless.
+//     ~20 LoC, gated on __APPLE__.
+//
+// (3) FDE ENCODING — validate_trampoline_cie asserts pcrel|sdata4 (0x1B).
+//     LLVM's Darwin x86_64 default is still 0x1B; verify on actual builds.
+//     If it's different, validate fires with an actionable diagnostic.
+//
+// (4) W^X ENFORCEMENT — macOS arm64 forbids PROT_READ|WRITE|EXEC pages.
+//     Two implementation options:
+//     (a) MAP_JIT + pthread_jit_write_protect_np(false) before the slot
+//         memcpy, (true) after. Requires the
+//         com.apple.security.cs.allow-jit entitlement on the binary.
+//         Every thread that calls arena_compile_trampoline must be a
+//         JIT-writer thread at startup. Unmarked threads SIGBUS silently.
+//         ~30 LoC + entitlement plumbing.
+//     (b) shm_open dual mapping: one RW view, one RX view of the same
+//         physical memory. ~80 LoC, doubles VA usage, no thread protocol.
+//
+// (5) compact_unwind (__unwind_info) — macOS arm64 libunwind consults the
+//     compact-unwind lookup table first and falls back to eh_frame only
+//     if nothing matches. Our JIT slots aren't in any compact table, so
+//     libunwind's fallback path handles them. That path is less
+//     battle-tested than the compact lookup; budget debugging time.
+//     Plan-B if the fallback is unreliable: emit a 32-bit compact_unwind
+//     encoding per slot, maintain a PC-range lookup we register with
+//     libunwind (no clean API — probably ~200 LoC of private helper).
+//
+// (6) arm64 IR LOWERING — the `inttoptr i64 ADDR to ptr` + indirect-call
+//     pattern becomes `movz + movk*3 + blr` (≈20 bytes) on arm64. Slot
+//     stride grows; otherwise unchanged. arm64 prologue CFI (stp x29,x30
+//     + mov x29,sp) is extracted from LLVM automatically — the byte-
+//     identical memcpy still works because pcrel-sdata4 distances remain
+//     slot-layout constants.
+//
+// (7) SIGCHECK — add an install-time assert that the compiled ObjectFile's
+//     __compact_unwind section is either absent or empty for the template.
+//     Catches LLVM emitting a compact entry that libunwind might prefer
+//     over our eh_frame FDE, turning subtle unwind failures into loud
+//     init-time errors.
+//
+// Total new code is modest (100–300 LoC). The real cost is verification:
+// Lisp Unwind exceptions must propagate through an arena frame without
+// std::terminate; Instruments and perf must resolve arena PCs to names;
+// both arches need end-to-end testing. Budget 1 day for x86_64 macOS,
+// 5–10 days for arm64 macOS dominated by libunwind debugging.
+// ===========================================================================
+
+// One-time arena initialization for either kind. Each kind has independent
+// state; calling for one kind doesn't initialize the other.
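+// Sketch of the intended call pattern (this mirrors cmp__compile_trampoline
+// below — probe once per call, fall back to the static trampoline when the
+// arena cannot come up):
+//
+//   if (!ensure_trampoline_arena_initialized(jit, TrampolineKind::Bytecode))
+//     return Values(Pointer_O::create((void*)default_bytecode_trampoline),
+//                   SimpleBaseString_O::make("default_bytecode_trampoline"));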
+static bool ensure_trampoline_arena_initialized(ClaspJIT_sp jit, TrampolineKind kind) {
+  static std::atomic<int> bytecode_state{0}; // 0=uninit, 1=ready, 2=failed
+  static std::atomic<int> gf_state{0};
+  std::atomic<int>& state = (kind == TrampolineKind::GF) ? gf_state : bytecode_state;
+  int s = state.load(std::memory_order_acquire);
+  if (s == 1) return true;
+  if (s == 2) return false;
+
+#ifdef __APPLE__
+  // Trampoline arena is not yet ported to macOS — see the long comment just
+  // above for the port plan. Fail init so callers fall back to
+  // default_bytecode_trampoline / g_gf_dispatch_entry_point_n. Log once per
+  // kind at first touch so the fallback is visible.
+  static std::atomic<bool> warned_bytecode{false};
+  static std::atomic<bool> warned_gf{false};
+  std::atomic<bool>& warned = (kind == TrampolineKind::GF) ? warned_gf : warned_bytecode;
+  bool expected = false;
+  if (warned.compare_exchange_strong(expected, true)) {
+    fprintf(stderr,
+            "[trampoline-arena] DISABLED on macOS (%s) — falling back to "
+            "%s. See trampolineWork.cc above ensure_trampoline_arena_initialized "
+            "for the port plan. Consequence: flame charts and perf-PID.map "
+            "show generic entry-point names instead of per-function names.\n",
+            kind_label(kind),
+            kind == TrampolineKind::GF
+                ? "g_gf_dispatch_entry_point_n"
+                : "default_bytecode_trampoline");
+    fflush(stderr);
+  }
+  state.store(2, std::memory_order_release);
+  return false;
+#endif
+
+  static std::mutex bytecode_init_lock;
+  static std::mutex gf_init_lock;
+  std::mutex& init_lock = (kind == TrampolineKind::GF) ? gf_init_lock : bytecode_init_lock;
+  std::lock_guard<std::mutex> g(init_lock);
+  s = state.load(std::memory_order_acquire);
+  if (s == 1) return true;
+  if (s == 2) return false;
+
+  // The trampoline template ObjectFile is scaffolding — mark it so the
+  // snapshot walker skips it.
+  struct MarkTransientGuard {
+    MarkTransientGuard() { t_mark_transient_snapshot = true; }
+    ~MarkTransientGuard() { t_mark_transient_snapshot = false; }
+  } mark_transient_guard;
+
+  uint64_t target_addr = (kind == TrampolineKind::GF)
+                             ? (uint64_t)g_gf_dispatch_entry_point_n
+                             : (uint64_t)&bytecode_call;
+  std::vector<uint8_t> tramp_bytes, cie_bytes, fde_bytes;
+  if (!capture_trampoline_template(jit, target_addr, kind,
+                                   tramp_bytes, cie_bytes, fde_bytes)) {
+    state.store(2, std::memory_order_release);
+    return false;
+  }
+
+  // Loud-failure invariant checks. The slot-layout design relies on the
+  // CIE being position-independent ("zR" augmentation, no personality /
+  // LSDA) and the FDE PC encoding being pcrel|sdata4. If either drifts
+  // out from under us, the byte-identical memcpy silently corrupts unwind
+  // info. Fail at install time instead.
+  if (!validate_trampoline_cie(cie_bytes.data(), cie_bytes.size())) {
+    state.store(2, std::memory_order_release);
+    return false;
+  }
+
+  // Patch the FDE's PC begin field for the slot layout
+  // [code | CIE | FDE | terminator]. The CIE uses DW_EH_PE_pcrel|sdata4
+  // for FDE PC encoding (LLVM's default on x86_64 ELF), so PC begin is
+  // a signed 4-byte offset from the field's own address to the function
+  // start. In our layout the PC begin field is at
+  //   slot + tramp_size + cie_size + 8
+  // and the code is at slot + 0, so the offset is
+  //   -(tramp_size + cie_size + 8).
+  // Both quantities are slot-layout constants, so every slot's bytes match.
+  //
+  // Other FDE fields do NOT need patching:
+  //  - Length: unchanged.
+  //  - CIE pointer (offset 4): self-relative distance from the field to
+  //    the preceding CIE.
+  //    In LLVM's .eh_frame the CIE is at section offset 0 and the FDE
+  //    starts at cie_size, so the stored value is cie_size + 4 — which is
+  //    the SAME value we'd need for our slot layout (CIE immediately
+  //    precedes FDE).
+  //  - PC range (offset 12): the function length, already correct.
+  //  - Aug data + CFI: no relocations.
+  if (fde_bytes.size() < 16) {
+    fprintf(stderr, "[trampoline-arena] %s: FDE too small (%zu bytes)\n",
+            kind_label(kind), fde_bytes.size());
+    state.store(2, std::memory_order_release);
+    return false;
+  }
+  int32_t pc_begin = -(int32_t)(tramp_bytes.size() + cie_bytes.size() + 8);
+  std::memcpy(fde_bytes.data() + 8, &pc_begin, 4);
+
+  bool installed = (kind == TrampolineKind::GF)
+                       ? gf_arena_install_trampoline_template(
+                             tramp_bytes.data(), tramp_bytes.size(),
+                             cie_bytes.data(), cie_bytes.size(),
+                             fde_bytes.data(), fde_bytes.size())
+                       : arena_install_trampoline_template(
+                             tramp_bytes.data(), tramp_bytes.size(),
+                             cie_bytes.data(), cie_bytes.size(),
+                             fde_bytes.data(), fde_bytes.size());
+  if (!installed) {
+    fprintf(stderr, "[trampoline-arena] %s install failed\n", kind_label(kind));
+    state.store(2, std::memory_order_release);
+    return false;
+  }
+  fprintf(stderr, "[trampoline-arena] %s template %zu bytes, target %p (CIE %zu, FDE %zu)\n",
+          kind_label(kind), tramp_bytes.size(), (void*)target_addr,
+          cie_bytes.size(), fde_bytes.size());
+  state.store(1, std::memory_order_release);
+  return true;
+}
+
+// Compile a GF trampoline (per-generic-function), called from
+// GFBytecodeSimpleFun_O::make and from the post-snapshot-load regen pass.
+// Returns the address of the per-GF arena slot, or the static
+// entry_point_n forwarder pointer if the JIT/arena isn't ready.
+core::Pointer_sp cmp__compile_gf_trampoline(core::T_sp tname) {
+  ClaspJIT_sp jit = llvm_sys__clasp_jit();
+  if (jit.nilp() || !ensure_trampoline_arena_initialized(jit, TrampolineKind::GF)) {
+    return Pointer_O::create((void*)g_gf_dispatch_entry_point_n);
+  }
+  std::string aname;
+  if (gc::IsA<core::Symbol_sp>(tname)) {
+    aname = gc::As_unsafe<core::Symbol_sp>(tname)->fullName();
+  } else if (tname.notnilp()) {
+    aname = _rep_(tname);
+    if (aname.size() >= 2 && aname[0] == '"' && aname.back() == '"')
+      aname = aname.substr(1, aname.size() - 2);
+  } else {
+    aname = "anonymous_gf";
+  }
+  static std::atomic<size_t> gf_counter{0};
+  size_t id = gf_counter++;
+  std::string mangled = escapeNameForLlvm(aname) + "_gft" + std::to_string(id);
+  return gf_arena_compile_trampoline(mangled);
+}
+
+CL_DEFUN core::Pointer_mv cmp__compile_trampoline(core::T_sp tname) {
+  // Special cases: functions that share a single libclasp fallback symbol.
+  // These aren't worth giving individual arena slots.
+  //  - (lambda nil ...) — anonymous lambdas with no arglist that Lisp code
+  //    distinguishes by source-file position rather than name.
+  //  - "" — empty-string names (makeBytecodeSimpleFun before set_trampoline
+  //    fires with the real name).
+  if (tname.consp() && CONS_CAR(tname) == ::cl::_sym_lambda
+      && CONS_CDR(tname).consp() && CONS_CAR(CONS_CDR(tname)).nilp()) {
+    return Values(Pointer_O::create((void*)lambda_nil),
+                  SimpleBaseString_O::make("lambda_nil"));
+  }
+
+  // If the JIT isn't up yet (very early startup), return the default
+  // trampoline pointer. It's a plain passthrough to bytecode_call and gets
+  // replaced later when the function is named (loadltv calls set_trampoline).
+  ClaspJIT_sp jit = llvm_sys__clasp_jit();
+  if (jit.nilp()) {
+    return Values(Pointer_O::create((void*)default_bytecode_trampoline),
+                  SimpleBaseString_O::make("default_bytecode_trampoline"));
+  }
+
+  // Derive a printable name.
+  std::string aname;
+  if (gc::IsA<core::Symbol_sp>(tname)) {
+    aname = gc::As_unsafe<core::Symbol_sp>(tname)->fullName();
+  } else {
+    aname = _rep_(tname);
+    if (aname.size() == 2 && aname[0] == '"' && aname.back() == '"') {
+      return Values(Pointer_O::create((void*)unknown_bytecode_trampoline),
+                    SimpleBaseString_O::make("unknown_bytecode_trampoline"));
+    }
+    if (aname.size() >= 2 && aname[0] == '"' && aname.back() == '"')
+      aname = aname.substr(1, aname.size() - 2);
+  }
+
+  // Initialize the arena on first call. Init fails only if the JIT can't
+  // compile the template; fall back to bytecode_call in that case so the
+  // system degrades gracefully.
+  if (!ensure_trampoline_arena_initialized(jit, TrampolineKind::Bytecode)) {
+    return Values(Pointer_O::create((void*)default_bytecode_trampoline),
+                  SimpleBaseString_O::make("default_bytecode_trampoline"));
+  }
+
+  size_t arenaId = global_trampoline_counter++;
+  std::string mangled = escapeNameForLlvm(aname) + "_bct" + std::to_string(arenaId);
+  core::Pointer_sp p = arena_compile_trampoline(mangled);
+  return Values(p, SimpleBaseString_O::make(mangled));
+}
+
+// Compile callbacks for FFI.
+CL_DEFUN JITDylib_sp jit_module_to_dylib(Module_sp module, const std::string& libname) { return loadModule(module, 0, libname); }
+
+CL_DEFUN core::Pointer_sp jit_lookup(JITDylib_sp dylib, const std::string& name) {
+  return llvm_sys__clasp_jit()->lookup(dylib, name);
+}
+
+// Access a T_sp in a variable.
+CL_DEFUN core::T_sp jit_lookup_t(JITDylib_sp dylib, const std::string& name) {
+  void* ptr;
+  bool found = llvm_sys__clasp_jit()->do_lookup(dylib, name, ptr);
+  if (!found)
+    SIMPLE_ERROR("Could not find pointer for name |{}|", name);
+  core::T_O** tptr = (core::T_O**)ptr;
+  T_sp ret((gctools::Tagged)(*tptr));
+  return ret;
+}
+
+CL_LISPIFY_NAME("llvmo:jit-lookup-t");
+CL_DEFUN_SETF core::T_sp setf_jit_lookup_t(core::T_sp value, JITDylib_sp dylib, const std::string& name) {
+  void* ptr;
+  bool found = llvm_sys__clasp_jit()->do_lookup(dylib, name, ptr);
+  if (!found)
+    SIMPLE_ERROR("Could not find pointer for name |{}|", name);
+  core::T_O** tptr = (core::T_O**)ptr;
+  *tptr = value.raw_();
+  return value;
+}
+
+// Re-attach an arena trampoline to every BytecodeSimpleFun reachable from the
+// snapshot. Called after snapshot_load completes its fixup pass. The save side
+// substituted bytecode_call for any arena-owned _Trampoline (since the slot
+// address is not stable across a restart); this restores wrapped trampolines
+// so backtraces and the perf-PID.map continue to identify Lisp frames.
+//
+// No-op when the env var requesting the arena backend is unset — in that case
+// the saved bytecode_call value is the right runtime trampoline anyway.
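+// Hypothetical call site, for orientation — the actual hook lives in the
+// snapshot loader, not in this file:
+//
+//   snapshot_load(...);                         // fixups done, slots stale
+//   arena_post_load_regenerate_trampolines();   // re-attach arena slots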
+void arena_post_load_regenerate_trampolines() {
+  size_t n_regen = 0;
+  core::List_sp modules = _lisp->_Roots._AllBytecodeModules.load(std::memory_order_relaxed);
+  for (auto mods : modules) {
+    core::BytecodeModule_sp module = gc::As_assert<core::BytecodeModule_sp>(oCar(mods));
+    core::T_sp debuginfo = module->debugInfo();
+    if (debuginfo.nilp()) continue;
+    for (auto const& info : debuginfo.as_assert()) {
+      if (gc::IsA<core::BytecodeSimpleFun_sp>(info)) {
+        core::BytecodeSimpleFun_sp fun = gc::As_unsafe<core::BytecodeSimpleFun_sp>(info);
+        core::Pointer_sp tramp = cmp__compile_trampoline(fun->functionName());
+        fun->set_trampoline(tramp);
+        ++n_regen;
+      }
+    }
+  }
+  // Generic-function dispatch trampolines (GFBytecodeSimpleFun). Install a
+  // fresh per-GF trampoline as _EntryPoints[0] so flame charts and backtrace
+  // see per-GF names instead of one shared entry_point_n.
+  size_t n_gf = 0;
+  core::List_sp gfs = _lisp->_Roots._AllGFBytecodeFuns.load(std::memory_order_relaxed);
+  for (auto gf_cons : gfs) {
+    core::GFBytecodeSimpleFun_sp gf = gc::As_assert<core::GFBytecodeSimpleFun_sp>(oCar(gf_cons));
+    core::Pointer_sp tramp = cmp__compile_gf_trampoline(gf->functionName());
+    gf->_EntryPoints._EntryPoints[0] = (core::ClaspXepAnonymousFunction)tramp->ptr();
+    ++n_gf;
+  }
+  fprintf(stderr, "[trampoline-arena] post-load regenerated %zu bytecode + %zu gf trampolines\n",
+          n_regen, n_gf);
+  fflush(stderr);
+}
+
+}; // namespace llvmo
diff --git a/src/llvmo/trampoline_arena.cc b/src/llvmo/trampoline_arena.cc
new file mode 100644
index 0000000000..f62cef79dc
--- /dev/null
+++ b/src/llvmo/trampoline_arena.cc
@@ -0,0 +1,295 @@
+/*
+ * trampoline_arena.cc — interleaved-slot arena.
+ *
+ * Each slot is a byte-identical blob of [code | CIE | FDE | 4B terminator],
+ * padded to a 16-byte stride. The caller supplies pre-composed tramp/CIE/FDE
+ * byte arrays, already patched so that:
+ *   - the FDE's PC begin uses pcrel|sdata4 encoding whose value equals the
+ *     compile-time-constant distance back to the code (= -(code_size +
+ *     cie_size + 8)),
+ *   - the FDE's CIE pointer equals the compile-time-constant distance back
+ *     to the preceding CIE (= cie_size + 4),
+ *   - the FDE's PC range = code_size.
+ * Because every slot has the same layout, the bytes are identical across
+ * slots and a single memcpy is all that's required.
+ *
+ * For unwinding, each slot registers its own CIE+FDE with libgcc via
+ * __register_frame(slot + code_size). The trailing 4-byte zero terminator
+ * in the slot stops libgcc's walk after that one FDE.
+ *
+ * Memory: each page is mmap'd PROT_READ|PROT_WRITE|PROT_EXEC and never
+ * reprotected. Works on Linux (default policy allows RWX). On macOS arm64
+ * (W^X enforced) this needs MAP_JIT + per-thread pthread_jit_write_protect_np
+ * toggling, OR a dual-mapped scheme via shm_open.
+ *
+ * Concurrency:
+ *   - allocate(): mutex-locked. Page growth happens under the same mutex.
+ *   - owns() reads the page list using an atomic-published size; lock-free.
+ *   - The side table is append-only with atomic-published size; lookups are
+ *     lock-free and signal-safe.
+ *   - Slot bytes are memcpy'd and the FDE registered before side-table
+ *     publish, so a concurrent reader either sees no entry yet, or sees one
+ *     whose memory is fully populated and executable.
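+ *
+ * Worked example (hypothetical sizes: code = 48, CIE = 24, FDE = 32):
+ *   slot+0   .. slot+47    code (byte-identical template copy)
+ *   slot+48  .. slot+71    CIE
+ *   slot+72  .. slot+103   FDE; its PC-begin field sits at slot+80 and
+ *                          holds -(48 + 24 + 8) = -80, i.e. pcrel back to
+ *                          slot+0
+ *   slot+104 .. slot+107   4-byte zero terminator
+ *   slot+108 .. slot+111   padding up to the 16-byte stride (112)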
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace llvmo { + +// ============================================================================ +// TrampolineSideTable +// ============================================================================ + +TrampolineSideTable::TrampolineSideTable() { + // Reserve a generous max capacity up front so push_back never reallocates, + // keeping pointers to existing entries stable for lock-free readers. + _entries.reserve(1u << 20); +} + +void TrampolineSideTable::append(TrampolineEntry e) { + std::lock_guard g(_write_lock); + _entries.push_back(std::move(e)); + _published.store(_entries.size(), std::memory_order_release); +} + +const TrampolineEntry* TrampolineSideTable::find(uintptr_t pc) const { + // Linear scan. mmap places arena pages at non-monotonic addresses, so a + // binary search on a flat vector won't work. With ~100K entries this scan + // is a few hundred microseconds — fine for backtrace frequency. + size_t n = _published.load(std::memory_order_acquire); + for (size_t i = 0; i < n; ++i) { + const TrampolineEntry& e = _entries[i]; + uintptr_t s = (uintptr_t)e.code_start; + if (s <= pc && pc < s + e.code_size) return &e; + } + return nullptr; +} + +// ============================================================================ +// ExecutableArena +// ============================================================================ + +// libgcc unwind-info registration. The pointer passed to __register_frame +// must remain valid for the registration's lifetime — each arena slot holds +// its own CIE+FDE inline, so that's automatic (slots are never freed). +extern "C" void __register_frame(const void* begin); + +ExecutableArena::ExecutableArena(const uint8_t* tramp_bytes, size_t tramp_size, + const uint8_t* cie_bytes, size_t cie_len, + const uint8_t* fde_bytes, size_t fde_len) + : _tramp_size(tramp_size), + _cie_size(cie_len), + _fde_size(fde_len) { + // Slot stride: code + CIE + FDE + 4B terminator, padded to 16-byte + // alignment so each new slot's code starts on a boundary the CPU likes. + size_t payload = tramp_size + cie_len + fde_len + 4; + _slot_stride = (payload + 15u) & ~size_t(15); + _slot_template.assign(_slot_stride, 0); + std::memcpy(_slot_template.data(), tramp_bytes, tramp_size); + std::memcpy(_slot_template.data() + tramp_size, cie_bytes, cie_len); + std::memcpy(_slot_template.data() + tramp_size + cie_len, fde_bytes, fde_len); + // Last 4 bytes of the payload are the zero terminator — already zero from + // assign(_slot_stride, 0). Remaining padding bytes are also zero, harmless. + + // 64 KB chunks. Bigger than the OS page (4 KB on Linux, 16 KB on macOS arm64) + // so each mmap holds many slots. 
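+  // With the hypothetical 112-byte stride from the file-header example, one
+  // 64 KB chunk holds 65536 / 112 = 585 slots; rounding `desired` up to a
+  // whole number of OS pages keeps the mmap length page-aligned on both
+  // page sizes.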
+ size_t os_page = (size_t)sysconf(_SC_PAGESIZE); + size_t desired = 64u * 1024u; + _page_size = ((desired + os_page - 1) / os_page) * os_page; + _pages.reserve(1u << 14); +} + +uint8_t* ExecutableArena::allocate() { + std::lock_guard g(_lock); + + if (!_current_page || _current_offset + _slot_stride > _page_size) { + void* p = mmap(nullptr, _page_size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + perror("[trampoline-arena] mmap"); + abort(); + } + _current_page = (uint8_t*)p; + _current_offset = 0; + _pages.push_back({(uintptr_t)p, (uintptr_t)p + _page_size}); + _pages_published.store(_pages.size(), std::memory_order_release); + } + + uint8_t* slot = _current_page + _current_offset; + std::memcpy(slot, _slot_template.data(), _slot_stride); + // Register this slot's CIE+FDE with libgcc. The CIE starts at + // slot + _tramp_size; the FDE follows; the 4B zero terminator after the + // FDE stops libgcc's classify walk. + __register_frame(slot + _tramp_size); + _current_offset += _slot_stride; + return slot; +} + +bool ExecutableArena::owns(uintptr_t pc) const { + size_t n = _pages_published.load(std::memory_order_acquire); + for (size_t i = 0; i < n; ++i) { + const PageRange& r = _pages[i]; + if (r.start <= pc && pc < r.end) return true; + } + return false; +} + +// ============================================================================ +// Top-level arena state and APIs +// ============================================================================ + +namespace { + +class TrampolineArenaInstance { +public: + TrampolineArenaInstance(const char* label) : _label(label) {} + + bool install_template(const uint8_t* tramp_bytes, size_t tramp_size, + const uint8_t* cie_bytes, size_t cie_len, + const uint8_t* fde_bytes, size_t fde_len) { + std::lock_guard g(_init_lock); + if (_initialized.load(std::memory_order_relaxed)) return true; + if (!tramp_bytes || tramp_size == 0) { + fprintf(stderr, "[trampoline-arena] %s install_template: invalid tramp args\n", _label); + return false; + } + if (!cie_bytes || cie_len == 0 || !fde_bytes || fde_len == 0) { + fprintf(stderr, "[trampoline-arena] %s install_template: invalid CIE/FDE args\n", _label); + return false; + } + _tramp_size = tramp_size; + _arena = new ExecutableArena(tramp_bytes, tramp_size, + cie_bytes, cie_len, + fde_bytes, fde_len); + _side_table = new TrampolineSideTable(); + fprintf(stderr, + "[trampoline-arena] installed %s template: code=%zu CIE=%zu FDE=%zu\n", + _label, tramp_size, cie_len, fde_len); + fprintf(stderr, "[trampoline-arena] %s code:", _label); + for (size_t i = 0; i < tramp_size; ++i) fprintf(stderr, " %02x", tramp_bytes[i]); + fprintf(stderr, "\n[trampoline-arena] %s cie:", _label); + for (size_t i = 0; i < cie_len; ++i) fprintf(stderr, " %02x", cie_bytes[i]); + fprintf(stderr, "\n[trampoline-arena] %s fde:", _label); + for (size_t i = 0; i < fde_len; ++i) fprintf(stderr, " %02x", fde_bytes[i]); + fprintf(stderr, "\n"); + fflush(stderr); + _initialized.store(true, std::memory_order_release); + return true; + } + + bool is_initialized() const { + return _initialized.load(std::memory_order_acquire); + } + + core::Pointer_sp compile(const std::string& name) { + if (!is_initialized()) { + fprintf(stderr, "[trampoline-arena] %s compile before init\n", _label); + abort(); + } + uint8_t* slot = _arena->allocate(); + // No per-slot patching: the slot template is byte-identical across + // all slots (the call target is embedded as an absolute immediate in + // the template 
+    // by LLVM at capture time; FDE fields use pcrel-sdata4
+    // distances that are constant within the slot layout).
+    _side_table->append(TrampolineEntry{slot, (uint32_t)_tramp_size, name});
+    perf_map_append(slot, _tramp_size, name);
+    int n = _debug_count.fetch_add(1);
+    if (n < 3) {
+      fprintf(stderr, "[trampoline-arena] %s compile #%d '%s' -> %p\n",
+              _label, n, name.c_str(), slot);
+      fflush(stderr);
+    }
+    return core::Pointer_O::create((void*)slot);
+  }
+
+  const TrampolineEntry* lookup_if_owned(uintptr_t pc) const {
+    if (!is_initialized()) return nullptr;
+    if (!_arena->owns(pc)) return nullptr;
+    return _side_table->find(pc);
+  }
+
+  bool owns(uintptr_t pc) const {
+    return is_initialized() && _arena->owns(pc);
+  }
+
+private:
+  const char* _label;
+  std::atomic<bool> _initialized{false};
+  std::mutex _init_lock;
+  size_t _tramp_size = 0;
+  ExecutableArena* _arena = nullptr;
+  TrampolineSideTable* _side_table = nullptr;
+  std::atomic<int> _debug_count{0};
+};
+
+TrampolineArenaInstance g_bytecode("bytecode trampoline");
+TrampolineArenaInstance g_gf("GF trampoline");
+
+FILE* g_perf_map = nullptr;
+std::mutex g_perf_map_lock;
+} // anonymous namespace
+
+// perf-PID.map writer — opens lazily, one append per registered entry.
+// Shared across callers (both arena instances and the LLVM-ORC link-plugin
+// per-symbol callback); the first caller truncates any stale file, subsequent
+// callers append.
+void perf_map_append(uint8_t* addr, size_t size, const std::string& name) {
+  std::lock_guard g(g_perf_map_lock);
+  if (!g_perf_map) {
+    char path[64];
+    snprintf(path, sizeof path, "/tmp/perf-%d.map", getpid());
+    g_perf_map = fopen(path, "w");
+    if (!g_perf_map) return;
+  }
+  fprintf(g_perf_map, "%lx %zx %s\n", (uintptr_t)addr, size, name.c_str());
+  fflush(g_perf_map);
+}
+
+// -------------------------------------------------------------------------
+// Public C-level wrappers — delegate to the appropriate arena instance.
+// -------------------------------------------------------------------------
+
+bool arena_install_trampoline_template(const uint8_t* tramp_bytes, size_t tramp_size,
+                                       const uint8_t* cie_bytes, size_t cie_len,
+                                       const uint8_t* fde_bytes, size_t fde_len) {
+  return g_bytecode.install_template(tramp_bytes, tramp_size,
+                                     cie_bytes, cie_len,
+                                     fde_bytes, fde_len);
+}
+bool arena_is_initialized() { return g_bytecode.is_initialized(); }
+core::Pointer_sp arena_compile_trampoline(const std::string& name) {
+  return g_bytecode.compile(name);
+}
+
+bool gf_arena_install_trampoline_template(const uint8_t* tramp_bytes, size_t tramp_size,
+                                          const uint8_t* cie_bytes, size_t cie_len,
+                                          const uint8_t* fde_bytes, size_t fde_len) {
+  return g_gf.install_template(tramp_bytes, tramp_size,
+                               cie_bytes, cie_len,
+                               fde_bytes, fde_len);
+}
+bool gf_arena_is_initialized() { return g_gf.is_initialized(); }
+core::Pointer_sp gf_arena_compile_trampoline(const std::string& name) {
+  return g_gf.compile(name);
+}
+
+// Unified lookup — checks both arenas so backtrace / debugger code can
+// resolve a PC without caring which kind it is.
+const TrampolineEntry* arena_lookup_by_pc(uintptr_t pc) {
+  if (const TrampolineEntry* e = g_bytecode.lookup_if_owned(pc)) return e;
+  return g_gf.lookup_if_owned(pc);
+}
+
+bool arena_owns_pc(uintptr_t pc) {
+  return g_bytecode.owns(pc) || g_gf.owns(pc);
+}
+
+}; // namespace llvmo
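+
+// Sketch of a (hypothetical) consumer on the backtrace side, relying on the
+// publish ordering documented in the file header; the field name matches the
+// TrampolineEntry aggregate filled in by compile() above:
+//
+//   if (const llvmo::TrampolineEntry* e = llvmo::arena_lookup_by_pc(pc))
+//     frame_name = e->name;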