diff --git a/common/trace.c b/common/trace.c index 8a6b24e91b19..bfea98069ace 100644 --- a/common/trace.c +++ b/common/trace.c @@ -11,12 +11,23 @@ #include #include #include +#include #include #include +#include +#include #include #if HAVE_USDT #include +#else +#define DTRACE_PROBE1(provider, probe, arg1) \ + do { \ + } while (0) +#define DTRACE_PROBE2(provider, probe, arg1, arg2) \ + do { \ + } while (0) +#endif /* The traceperent format is defined in W3C Trace Context RFC[1]. * Its format is defined as @@ -42,6 +53,11 @@ const char *trace_service_name = "lightningd"; static bool disable_trace = false; static FILE *trace_to_file = NULL; +/* UDS datagram socket for sending completed spans. */ +static int trace_socket_fd = -1; +static struct sockaddr_un trace_socket_addr; +static socklen_t trace_socket_addrlen; + #define SPAN_MAX_TAGS 2 struct span_tag { @@ -70,10 +86,7 @@ struct span { bool suspended; }; -static size_t span_keyof(const struct span *span) -{ - return span->key; -} +static size_t span_keyof(const struct span *span) { return span->key; } static size_t span_key_hash(size_t key) { @@ -91,12 +104,11 @@ static struct span fixed_spans[8]; static struct span_htable *spans = NULL; static struct span *current; -static void init_span(struct span *s, - size_t key, - const char *name, +static void init_span(struct span *s, size_t key, const char *name, struct span *parent) { - struct timeabs now = time_now(); /* discouraged: but tracing wants non-dev time */ + struct timeabs now = + time_now(); /* discouraged: but tracing wants non-dev time */ s->key = key; s->id = pseudorand_u64(); @@ -145,9 +157,11 @@ static void trace_inject_traceparent(void) init_span(current, trace_key(&spans), "", NULL); assert(current && !current->parent); - if (!hex_decode(traceparent + 3, 16, &trace_hi, sizeof(trace_hi)) - || !hex_decode(traceparent + 3 + 16, 16, &trace_lo, sizeof(trace_lo)) - || !hex_decode(traceparent + 3 + 16 + 16 + 1, 16, &span, sizeof(span))) { + if (!hex_decode(traceparent + 3, 16, &trace_hi, sizeof(trace_hi)) || + !hex_decode(traceparent + 3 + 16, 16, &trace_lo, + sizeof(trace_lo)) || + !hex_decode(traceparent + 3 + 16 + 16 + 1, 16, &span, + sizeof(span))) { /* We failed to parse the traceparent, abandon. */ fprintf(stderr, "Failed!"); trace_span_clear(current); @@ -159,13 +173,13 @@ static void trace_inject_traceparent(void) } } -static void memleak_scan_spans(struct htable *memtable, struct span_htable *spantable) +static void memleak_scan_spans(struct htable *memtable, + struct span_htable *spantable) { struct span_htable_iter i; const struct span *span; - for (span = span_htable_first(spantable, &i); - span; + for (span = span_htable_first(spantable, &i); span; span = span_htable_next(spantable, &i)) { memleak_ptr(memtable, span); memleak_scan_region(memtable, span, sizeof(*span)); @@ -175,6 +189,7 @@ static void memleak_scan_spans(struct htable *memtable, struct span_htable *span static void trace_init(void) { const char *dev_trace_file; + const char *trace_socket_path; if (spans) return; @@ -188,22 +203,43 @@ static void trace_init(void) current = NULL; dev_trace_file = getenv("CLN_DEV_TRACE_FILE"); if (dev_trace_file) { - const char *fname = tal_fmt(tmpctx, "%s.%u", - dev_trace_file, (unsigned)getpid()); + const char *fname = tal_fmt(tmpctx, "%s.%u", dev_trace_file, + (unsigned)getpid()); trace_to_file = fopen(fname, "a+"); if (!trace_to_file) err(1, "Opening CLN_DEV_TRACE_FILE %s", fname); } - trace_inject_traceparent(); + + /* Open a non-blocking UDS datagram socket for trace emission. */ + trace_socket_path = getenv("CLN_TRACE_SOCKET"); + if (trace_socket_path) { + trace_socket_fd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (trace_socket_fd >= 0) { + fcntl(trace_socket_fd, F_SETFL, O_NONBLOCK); + memset(&trace_socket_addr, 0, + sizeof(trace_socket_addr)); + trace_socket_addr.sun_family = AF_UNIX; + strncpy(trace_socket_addr.sun_path, trace_socket_path, + sizeof(trace_socket_addr.sun_path) - 1); + trace_socket_addrlen = + sizeof(trace_socket_addr.sun_family) + + strlen(trace_socket_addr.sun_path) + 1; + } + } + + /* If no output backend is active, disable tracing to skip + * all the expensive span management work. */ + if (!trace_to_file && trace_socket_fd < 0 && !HAVE_USDT) + disable_trace = true; + + if (!disable_trace) + trace_inject_traceparent(); } /** * Convert the pointer to a context object to a numeric key. */ -static size_t trace_key(const void *key) -{ - return (size_t)key; -} +static size_t trace_key(const void *key) { return (size_t)key; } static struct span *trace_span_find(size_t key) { @@ -233,23 +269,23 @@ static void trace_emit(struct span *s) char buffer[MAX_BUF_SIZE + 1]; size_t len; - snprintf(span_id, sizeof(span_id), "%016"PRIx64, s->id); + snprintf(span_id, sizeof(span_id), "%016" PRIx64, s->id); len = snprintf(buffer, MAX_BUF_SIZE, "[{\"id\":\"%s\",\"name\":\"%s\"," - "\"timestamp\":%"PRIu64",\"duration\":%"PRIu64"," + "\"timestamp\":%" PRIu64 ",\"duration\":%" PRIu64 "," "\"localEndpoint\":{\"serviceName\":\"%s\"},", - span_id, s->name, s->start_time, s->end_time - s->start_time, trace_service_name); + span_id, s->name, s->start_time, + s->end_time - s->start_time, trace_service_name); if (s->parent != NULL) { - len += snprintf(buffer + len, MAX_BUF_SIZE - len, - "\"parentId\":\"%016"PRIx64"\",", - s->parent->id); + len += + snprintf(buffer + len, MAX_BUF_SIZE - len, + "\"parentId\":\"%016" PRIx64 "\",", s->parent->id); if (len > MAX_BUF_SIZE) len = MAX_BUF_SIZE; } - len += snprintf(buffer + len, MAX_BUF_SIZE - len, - "\"tags\":{"); + len += snprintf(buffer + len, MAX_BUF_SIZE - len, "\"tags\":{"); if (len > MAX_BUF_SIZE) len = MAX_BUF_SIZE; for (size_t i = 0; i < SPAN_MAX_TAGS; i++) { @@ -257,15 +293,14 @@ static void trace_emit(struct span *s) continue; len += snprintf(buffer + len, MAX_BUF_SIZE - len, "%s\"%s\":\"%.*s\"", i == 0 ? "" : ", ", - s->tags[i].name, - s->tags[i].valuelen, + s->tags[i].name, s->tags[i].valuelen, s->tags[i].valuestr); if (len > MAX_BUF_SIZE) len = MAX_BUF_SIZE; } len += snprintf(buffer + len, MAX_BUF_SIZE - len, - "},\"traceId\":\"%016"PRIx64"%016"PRIx64"\"}]", + "},\"traceId\":\"%016" PRIx64 "%016" PRIx64 "\"}]", s->trace_id_hi, s->trace_id_lo); if (len > MAX_BUF_SIZE) len = MAX_BUF_SIZE; @@ -276,6 +311,12 @@ static void trace_emit(struct span *s) fprintf(trace_to_file, "span_emit %s %s\n", span_id, buffer); fflush(trace_to_file); } + if (trace_socket_fd >= 0) { + /* Fire-and-forget: silently ignore all sendto errors. */ + sendto(trace_socket_fd, buffer, len, MSG_DONTWAIT, + (struct sockaddr *)&trace_socket_addr, + trace_socket_addrlen); + } } /** @@ -300,9 +341,9 @@ void trace_span_start_(const char *name, const void *key) { size_t numkey = trace_key(key); + trace_init(); if (disable_trace) return; - trace_init(); assert(trace_span_find(numkey) == NULL); struct span *s = trace_span_slot(); @@ -332,11 +373,12 @@ void trace_span_end(const void *key) assert(s && "Span to end not found"); assert(s == current && "Ending a span that isn't the current one"); - struct timeabs now = time_now(); /* discouraged: but tracing wants non-dev time */ + struct timeabs now = + time_now(); /* discouraged: but tracing wants non-dev time */ s->end_time = (now.ts.tv_sec * 1000000) + now.ts.tv_nsec / 1000; DTRACE_PROBE1(lightningd, span_end, s->id); if (trace_to_file) { - fprintf(trace_to_file, "span_end %016"PRIx64"\n", s->id); + fprintf(trace_to_file, "span_end %016" PRIx64 "\n", s->id); fflush(trace_to_file); } trace_emit(s); @@ -364,9 +406,8 @@ void trace_span_tag(const void *key, const char *name, const char *value) t->name = name; t->valuestr = value; t->valuelen = strlen(value); - if (t->valuestr[0] == '"' - && t->valuelen > 1 - && t->valuestr[t->valuelen-1] == '"') { + if (t->valuestr[0] == '"' && t->valuelen > 1 && + t->valuestr[t->valuelen - 1] == '"') { t->valuestr++; t->valuelen -= 2; } @@ -385,12 +426,13 @@ void trace_span_suspend_(const void *key, const char *lbl) struct span *span = trace_span_find(numkey); TRACE_DBG("Suspending span %s (%zu)\n", current->name, current->key); assert(current == span); - assert(!span->suspended); + assert(!span->suspended); span->suspended = true; current = current->parent; DTRACE_PROBE1(lightningd, span_suspend, span->id); if (trace_to_file) { - fprintf(trace_to_file, "span_suspend %016"PRIx64"\n", span->id); + fprintf(trace_to_file, "span_suspend %016" PRIx64 "\n", + span->id); fflush(trace_to_file); } } @@ -426,12 +468,13 @@ void trace_span_resume_(const void *key, const char *lbl) size_t numkey = trace_key(key); current = trace_span_find(numkey); - assert(current->suspended); + assert(current->suspended); current->suspended = false; TRACE_DBG("Resuming span %s (%zu)\n", current->name, current->key); DTRACE_PROBE1(lightningd, span_resume, current->id); if (trace_to_file) { - fprintf(trace_to_file, "span_resume %016"PRIx64"\n", current->id); + fprintf(trace_to_file, "span_resume %016" PRIx64 "\n", + current->id); fflush(trace_to_file); } } @@ -439,16 +482,8 @@ void trace_span_resume_(const void *key, const char *lbl) void trace_cleanup(void) { spans = tal_free(spans); + if (trace_socket_fd >= 0) { + close(trace_socket_fd); + trace_socket_fd = -1; + } } - -#else /* HAVE_USDT */ - -void trace_span_start_(const char *name, const void *key) {} -void trace_span_end(const void *key) {} -void trace_span_suspend_(const void *key, const char *lbl) {} -void trace_span_suspend_may_free_(const void *key, const char *lbl) {} -void trace_span_resume_(const void *key, const char *lbl) {} -void trace_span_tag(const void *key, const char *name, const char *value) {} -void trace_cleanup(void) {} - -#endif /* HAVE_USDT */ diff --git a/doc/developers-guide/tracing-cln-performance.md b/doc/developers-guide/tracing-cln-performance.md index 445df81b0ad2..47b322f80a61 100644 --- a/doc/developers-guide/tracing-cln-performance.md +++ b/doc/developers-guide/tracing-cln-performance.md @@ -8,11 +8,20 @@ CLN includes a simple opentracing exporter that allows tracing the execution of > In software engineering, tracing involves a specialized use of logging to record information about a program's execution. This information is typically used by programmers for debugging purposes, and additionally, depending on the type and detail of information contained in a trace log, by experienced system administrators or technical-support personnel and by software monitoring tools to diagnose common problems with software. -The tracing system in CLN is implemented using [USDTs](https://illumos.org/books/dtrace/chp-usdt.html) (no, not that kind of [USDT](https://en.wikipedia.org/wiki/Tether_(cryptocurrency))). As such it emits events into the kernel, from where an exporter can receive them. If no exporter is configured then the kernel will replace the call-sites of the probe with a `NOP`, thus causing only minimal overhead when not tracing. +CLN supports two tracing backends: -## Compiling with tracing support +1. **USDT probes** (eBPF-based) -- events are emitted into the kernel ring buffer, where an exporter can receive them. Requires kernel access and `systemtap-sdt-dev` headers at compile time. +2. **Unix Domain Socket datagrams** -- completed spans are sent as self-contained Zipkin JSON payloads to a `SOCK_DGRAM` UDS socket. Works in restricted environments (e.g., Kubernetes) where kernel eBPF access is unavailable. -CLN will build with tracing support if the necessary headers (`sys/sdt.h`) are present during the compilation. For debian and ubuntu based systems that is easily achieved by installing `systemtap-sdt-dev`: +The tracing logic (span management, JSON serialization) is always compiled in. When no backend is active, all tracing functions return early with negligible overhead. + +## Backend 1: USDT Probes (eBPF) + +The USDT backend is implemented using [USDTs](https://illumos.org/books/dtrace/chp-usdt.html) (no, not that kind of [USDT](https://en.wikipedia.org/wiki/Tether_(cryptocurrency))). As such it emits events into the kernel, from where an exporter can receive them. If no exporter is configured then the kernel will replace the call-sites of the probe with a `NOP`, thus causing only minimal overhead when not tracing. + +### Compiling with USDT support + +CLN will build with USDT support if the necessary headers (`sys/sdt.h`) are present during the compilation. For debian and ubuntu based systems that is easily achieved by installing `systemtap-sdt-dev`: ```bash # apt-get install -y systemtap-sdt-dev @@ -41,7 +50,7 @@ usdt:lightningd/lightningd:lightningd:span_start usdt:lightningd/lightningd:lightningd:span_suspend ``` -## Exporters +### USDT Exporters The simplest way to get started with eBPF in general (which the tracing is built upon) is the `bpftrace` command that we've already seen above when checking if the binary was built with tracing support. @@ -69,6 +78,84 @@ of the box. [bpftracer]: https://github.com/iovisor/bpftrace/issues/305 +## Backend 2: Unix Domain Socket Datagrams + +The UDS backend sends completed spans as atomic datagrams to a Unix +domain socket. This is designed for environments where kernel eBPF +access is unavailable, such as Kubernetes pods. + +### How it works + +Set the `CLN_TRACE_SOCKET` environment variable to the filesystem path +of a `SOCK_DGRAM` Unix domain socket: + +```bash +$ CLN_TRACE_SOCKET=/tmp/cln-traces.sock lightningd +``` + +When a span completes, its Zipkin-format JSON payload is sent via +`sendto()` to the specified socket. Key properties: + +- **Atomic delivery**: Each datagram contains a complete, self-contained + JSON span payload. No framing or reassembly needed. +- **Non-blocking**: The socket is set to `O_NONBLOCK`. If the collector + is down or the socket buffer is full, the `sendto()` silently fails + without affecting the node. +- **Multi-writer safe**: Multiple CLN daemons can write to the same + socket path concurrently. The kernel guarantees datagram boundaries + are preserved. +- **No USDT dependency**: Works on any system, regardless of whether + `systemtap-sdt-dev` is installed or `HAVE_USDT` is set. + +### Payload format + +Each datagram is a Zipkin v2 JSON array containing a single span: + +```json +[{"id":"00f067aa0ba902b7","name":"lightningd/jsonrpc", + "timestamp":1700000000123000,"duration":500, + "localEndpoint":{"serviceName":"lightningd"}, + "parentId":"d981248067b5e396", + "tags":{"method":"getinfo"}, + "traceId":"4bf92f3577b34da6a3ce929d0e0e4736"}] +``` + +Payloads are capped at 2048 bytes, well below the UDS datagram limit +(~200KB on Linux). + +### Setting up a collector + +Any process that binds a `SOCK_DGRAM` Unix domain socket at the +configured path can receive spans. A minimal Python collector: + +```python +import socket, os, json + +path = "/tmp/cln-traces.sock" +if os.path.exists(path): + os.unlink(path) + +sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) +sock.bind(path) + +while True: + data = sock.recv(4096) + spans = json.loads(data) + print(spans[0]["name"], spans[0]["duration"], "us") +``` + +In a Kubernetes deployment, the socket file is typically placed on a +shared `emptyDir` volume so a sidecar container can collect spans and +forward them to Jaeger, Tempo, or an OpenTelemetry Collector. + +## Environment Variables + +| Variable | Purpose | +|----------|---------| +| `CLN_TRACE_SOCKET` | Path to a `SOCK_DGRAM` UDS socket. Completed spans are sent as Zipkin JSON datagrams. | +| `CLN_TRACEPARENT` | Inject a W3C Trace Context parent (`00---`). All spans inherit this trace ID. | +| `CLN_DEV_TRACE_FILE` | (Developer) Write all trace lifecycle events to `.` files. Used for testing. | + ## Tracing Overhead While we try to make tracing as lightweight as possible it isn't diff --git a/tests/test_misc.py b/tests/test_misc.py index bbfbd80c384f..e229a3850c43 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -5143,3 +5143,43 @@ def test_filter_with_invalid_json(node_factory): stdout=subprocess.PIPE) assert 'filter: Expected object: invalid token' in out.stdout.decode('utf-8') assert out.returncode == 1 + + +def test_tracing_socket(node_factory): + """Test UDS datagram tracing backend via CLN_TRACE_SOCKET.""" + l1 = node_factory.get_node(start=False) + sock_path = os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "trace.sock") + + # Create a SOCK_DGRAM UDS listener + sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) + sock.bind(sock_path) + sock.settimeout(5) + + l1.daemon.env["CLN_TRACE_SOCKET"] = sock_path + l1.start() + l1.stop() + + # Collect all datagrams that were sent + spans = [] + while True: + try: + data = sock.recv(4096) + spans.append(json.loads(data.decode("utf-8"))) + except socket.timeout: + break + + sock.close() + + # We should have received at least some spans from startup + assert len(spans) > 0, "No spans received via UDS socket" + + for span_array in spans: + # Each datagram is a Zipkin JSON array with one span + assert isinstance(span_array, list) + assert len(span_array) == 1 + span = span_array[0] + + # Validate required Zipkin fields are present + for key in ("id", "name", "timestamp", "duration", "traceId"): + assert key in span, f"Missing key {key} in span {span}" + assert span["localEndpoint"] == {"serviceName": "lightningd"}