diff --git a/packages/orchestrator/cmd/create-build/main.go b/packages/orchestrator/cmd/create-build/main.go index d67097e39f..4d293a4e92 100644 --- a/packages/orchestrator/cmd/create-build/main.go +++ b/packages/orchestrator/cmd/create-build/main.go @@ -18,6 +18,7 @@ import ( "time" "github.com/launchdarkly/go-sdk-common/v3/ldlog" + "github.com/launchdarkly/go-sdk-common/v3/ldvalue" "go.opentelemetry.io/otel/metric/noop" "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -39,6 +40,7 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/template/build/metrics" artifactsregistry "github.com/e2b-dev/infra/packages/shared/pkg/artifacts-registry" "github.com/e2b-dev/infra/packages/shared/pkg/dockerhub" + "github.com/e2b-dev/infra/packages/shared/pkg/fcversion" "github.com/e2b-dev/infra/packages/shared/pkg/featureflags" templatemanager "github.com/e2b-dev/infra/packages/shared/pkg/grpc/template-manager" "github.com/e2b-dev/infra/packages/shared/pkg/logger" @@ -76,6 +78,11 @@ func main() { log.Fatal("-to-build required") } + // FPH must be installed at boot — flag is read by Create, not Resume. + featureflags.NewJSONFlag("free-page-hinting-config", ldvalue.FromJSONMarshal(map[string]any{ + "enabled": true, + })) + // Suppress other noisy output unless verbose, but keep std log for fatal errors if !*verbose { cmdutil.SuppressNoisyLogsKeepStdLog() @@ -351,6 +358,13 @@ func doBuild( }) } + // Mirror prod gating in pkg/template/server/create_template.go: balloon + // is rejected on FC <1.14, so we can't unconditionally request FPR. + fcInfo, err := fcversion.New(fc) + if err != nil { + return fmt.Errorf("invalid firecracker version %q: %w", fc, err) + } + tmpl := config.TemplateConfig{ Version: templates.TemplateV2LatestVersion, TemplateID: templateID, @@ -363,6 +377,8 @@ func doBuild( ReadyCmd: readyCmd, KernelVersion: kernel, FirecrackerVersion: fc, + FreePageReporting: fcInfo.HasFreePageReporting(), + TeamID: "local", Steps: steps, } diff --git a/packages/orchestrator/cmd/resume-build/fph_bench.go b/packages/orchestrator/cmd/resume-build/fph_bench.go new file mode 100644 index 0000000000..f9bd0b941a --- /dev/null +++ b/packages/orchestrator/cmd/resume-build/fph_bench.go @@ -0,0 +1,194 @@ +package main + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/google/uuid" + "github.com/launchdarkly/go-sdk-common/v3/ldvalue" + + "github.com/e2b-dev/infra/packages/orchestrator/cmd/internal/cmdutil" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox" + "github.com/e2b-dev/infra/packages/shared/pkg/featureflags" + "github.com/e2b-dev/infra/packages/shared/pkg/storage" +) + +type fphBenchOptions struct { + enabled bool + workload string + iterations int + delay time.Duration +} + +const fphBenchDrainTimeout = 5 * time.Second + +type fphBenchSample struct { + memfileBytes int64 + hintBytes uint64 + reportBytes uint64 + pause time.Duration + err error +} + +// fphBench runs the workload N times under FPR-only and N times under FPR+FPH, +// printing the resulting memfile data size and pause time per iteration. +// Smaller memfile is better. +func (r *runner) fphBench(ctx context.Context, opts fphBenchOptions) error { + if opts.iterations <= 0 { + return errors.New("fph-bench: iterations must be > 0") + } + fmt.Printf("FPH bench (%d iterations per arm, workload=%q)\n", opts.iterations, opts.workload) + + noFph := r.fphBenchArm(ctx, opts, false, "FPR-only ") + withFph := r.fphBenchArm(ctx, opts, true, "FPR + FPH") + + avg := func(s []fphBenchSample, f func(fphBenchSample) int64) int64 { + var sum, n int64 + for _, x := range s { + if x.err == nil { + sum += f(x) + n++ + } + } + if n == 0 { + return 0 + } + + return sum / n + } + memfile := func(s fphBenchSample) int64 { return s.memfileBytes } + pause := func(s fphBenchSample) int64 { return int64(s.pause) } + + fmt.Printf("FPR-only : memfile %s pause %s\n", fmtMiB(avg(noFph, memfile)), time.Duration(avg(noFph, pause))) + fmt.Printf("FPR + FPH: memfile %s pause %s\n", fmtMiB(avg(withFph, memfile)), time.Duration(avg(withFph, pause))) + + for _, g := range [][]fphBenchSample{noFph, withFph} { + for _, s := range g { + if s.err != nil { + return s.err + } + } + } + + return nil +} + +func (r *runner) fphBenchArm(ctx context.Context, opts fphBenchOptions, withFph bool, label string) []fphBenchSample { + out := make([]fphBenchSample, 0, opts.iterations) + for i := range opts.iterations { + if ctx.Err() != nil { + break + } + s := r.fphBenchOnce(ctx, opts, withFph) + out = append(out, s) + if s.err != nil { + fmt.Printf("[%d/%d] %s: %v\n", i+1, opts.iterations, label, s.err) + + continue + } + fmt.Printf("[%d/%d] %s: memfile %s fpr_freed %s fph_freed %s pause %s\n", + i+1, opts.iterations, label, + fmtMiB(s.memfileBytes), + fmtMiB(int64(s.reportBytes)), + fmtMiB(int64(s.hintBytes)), + s.pause.Round(time.Millisecond)) + } + + return out +} + +func (r *runner) fphBenchOnce(ctx context.Context, opts fphBenchOptions, withFph bool) fphBenchSample { + if withFph { + featureflags.NewJSONFlag("free-page-hinting-config", ldvalue.FromJSONMarshal(map[string]any{ + "enabled": true, + "pause": int(fphBenchDrainTimeout / time.Millisecond), + })) + } else { + featureflags.NewJSONFlag("free-page-hinting-config", ldvalue.Null()) + } + + buildID := uuid.New().String() + defer cleanupLocalBuild(buildID) + + runtime := sandbox.RuntimeMetadata{ + TemplateID: r.buildID, + TeamID: "local", + SandboxID: fmt.Sprintf("fph-bench-%d", time.Now().UnixNano()), + ExecutionID: fmt.Sprintf("fph-bench-exec-%d", time.Now().UnixNano()), + } + t0 := time.Now() + sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + if err != nil { + return fphBenchSample{err: fmt.Errorf("resume: %w", err)} + } + defer sbx.Close(context.WithoutCancel(ctx)) + + if err := runCommandInSandbox(ctx, sbx, opts.workload); err != nil { + return fphBenchSample{err: fmt.Errorf("workload: %w", err)} + } + if opts.delay > 0 { + select { + case <-ctx.Done(): + return fphBenchSample{err: ctx.Err()} + case <-time.After(opts.delay): + } + } + + origMeta, err := r.tmpl.Metadata() + if err != nil { + return fphBenchSample{err: fmt.Errorf("metadata: %w", err)} + } + newMeta := origMeta + newMeta.Template.BuildID = buildID + + pauseStart := time.Now() + snapshot, err := sbx.Pause(ctx, newMeta, sandbox.SnapshotUseCasePause) + pauseDur := time.Since(pauseStart) + if err != nil { + return fphBenchSample{pause: pauseDur, err: fmt.Errorf("pause: %w", err)} + } + defer snapshot.Close(context.WithoutCancel(ctx)) + + balloon, _ := sbx.FlushAndReadBalloonMetrics(ctx) + + upload, err := sandbox.NewUpload(ctx, nil, snapshot, r.storage, storage.CompressConfig{}, nil, "", nil) + if err != nil { + return fphBenchSample{pause: pauseDur, err: fmt.Errorf("upload prepare: %w", err)} + } + if err := upload.Run(ctx); err != nil { + return fphBenchSample{pause: pauseDur, err: fmt.Errorf("upload: %w", err)} + } + + memfileBytes, err := readLocalMemfileSize(buildID) + if err != nil { + return fphBenchSample{pause: pauseDur, err: fmt.Errorf("memfile size: %w", err)} + } + + return fphBenchSample{ + memfileBytes: memfileBytes, + hintBytes: balloon.HintFreed, + reportBytes: balloon.ReportFreed, + pause: pauseDur, + } +} + +func readLocalMemfileSize(buildID string) (int64, error) { + basePath := os.Getenv("LOCAL_TEMPLATE_STORAGE_BASE_PATH") + if basePath == "" { + return 0, errors.New("LOCAL_TEMPLATE_STORAGE_BASE_PATH not set; -fph-bench requires local storage") + } + + return cmdutil.GetActualFileSize(filepath.Join(basePath, buildID, storage.MemfileName)) +} + +func cleanupLocalBuild(buildID string) { + if base := os.Getenv("LOCAL_TEMPLATE_STORAGE_BASE_PATH"); base != "" { + _ = os.RemoveAll(filepath.Join(base, buildID)) + } +} + +func fmtMiB(n int64) string { return fmt.Sprintf("%.1f MiB", float64(n)/(1<<20)) } diff --git a/packages/orchestrator/cmd/resume-build/main.go b/packages/orchestrator/cmd/resume-build/main.go index dd780e13da..417413b150 100644 --- a/packages/orchestrator/cmd/resume-build/main.go +++ b/packages/orchestrator/cmd/resume-build/main.go @@ -71,11 +71,21 @@ func main() { optimize := flag.Bool("optimize", false, "collect fresh prefetch mapping after pause (resumes snapshot to record page faults)") shell := flag.Bool("shell", false, "attach an interactive PTY shell via envd (no sshd required in the sandbox)") - // Enables the pre-pause reclaim chain with sensible per-step caps. + fphTimeoutMs := flag.Int("fph-timeout-ms", 0, "override free-page-hinting-config pause timeout LD flag (0 = use LD default)") reclaim := flag.Bool("reclaim", false, "enable pre-pause reclaim chain (fstrim 500ms, sync 500ms, drop_caches 200ms, compact 1s)") + fphBench := flag.Bool("fph-bench", false, "compare pause memfile size with vs without FPH; requires -cmd-pause workload, uses -iterations (default 3), forces FPR on") + fphBenchDelay := flag.Duration("fph-bench-delay", 0, "wait this long between workload completion and pause (lets FPR settle)") + flag.Parse() + if *fphTimeoutMs > 0 { + featureflags.NewJSONFlag("free-page-hinting-config", ldvalue.FromJSONMarshal(map[string]any{ + "enabled": true, + "pause": *fphTimeoutMs, + })) + } + if *reclaim { featureflags.NewJSONFlag("guest-pause-reclaim", ldvalue.FromJSONMarshal(map[string]int{ "sync": 500, @@ -119,8 +129,8 @@ func main() { isPauseMode := pauseCount > 0 - // Interactive pause modes are incompatible with iterations. - if *iterations > 0 && (*signalPause != "" || *cmdPause != "" || *cmdSignalPause != "") { + // fph-bench reuses -cmd-pause and -iterations. + if !*fphBench && *iterations > 0 && (*signalPause != "" || *cmdPause != "" || *cmdSignalPause != "") { log.Fatal("-signal-pause, -cmd-pause, and -cmd-signal-pause are incompatible with -iterations") } @@ -140,6 +150,10 @@ func main() { log.Fatal("-shell can only be used in interactive mode (no -cmd, no pause flags, no -iterations)") } + if *fphBench && (*cmdPause == "" || *fphTimeoutMs > 0) { + log.Fatal("-fph-bench requires -cmd-pause and is incompatible with -fph-timeout-ms") + } + // Generate new build ID if not specified and pause mode is enabled outputBuildID := *toBuild if isPauseMode && outputBuildID == "" { @@ -174,7 +188,13 @@ func main() { iterations: *iterations, } - err := run(ctx, *fromBuild, *iterations, *coldStart, *noPrefetch, *noEgress, *verbose, *shell, pauseOpts, runOpts) + benchIters := *iterations + if *fphBench && benchIters <= 0 { + benchIters = 3 + } + fphBenchOpts := fphBenchOptions{enabled: *fphBench, workload: *cmdPause, iterations: benchIters, delay: *fphBenchDelay} + + err := run(ctx, *fromBuild, *iterations, *coldStart, *noPrefetch, *noEgress, *verbose, *shell, pauseOpts, runOpts, fphBenchOpts) cancel() if err != nil { @@ -989,7 +1009,7 @@ func (r *runner) benchmark(ctx context.Context, n int) error { return lastErr } -func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefetch, noEgress, verbose, shell bool, pauseOpts pauseOptions, runOpts runOptions) error { +func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefetch, noEgress, verbose, shell bool, pauseOpts pauseOptions, runOpts runOptions, fphBenchOpts fphBenchOptions) error { // Silence other loggers unless verbose mode var l logger.Logger if !verbose { @@ -1135,10 +1155,11 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe token := "local" sbxCfg := sandbox.NewConfig(sandbox.Config{ - BaseTemplateID: buildID, - Vcpu: 1, - RamMB: 512, - Envd: sandbox.EnvdMetadata{Vars: map[string]string{}, AccessToken: &token, Version: "1.0.0"}, + BaseTemplateID: buildID, + Vcpu: 1, + RamMB: 512, + FreePageReporting: fphBenchOpts.enabled, + Envd: sandbox.EnvdMetadata{Vars: map[string]string{}, AccessToken: &token, Version: "1.0.0"}, FirecrackerConfig: fc.Config{ KernelVersion: meta.Template.KernelVersion, FirecrackerVersion: meta.Template.FirecrackerVersion, @@ -1158,6 +1179,10 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe sbxConfig: sbxCfg, } + if fphBenchOpts.enabled { + return r.fphBench(ctx, fphBenchOpts) + } + if runOpts.enabled() { return r.cmdMode(ctx, runOpts) } diff --git a/packages/orchestrator/go.mod b/packages/orchestrator/go.mod index 399f8a0fb2..ec35bfa0d7 100644 --- a/packages/orchestrator/go.mod +++ b/packages/orchestrator/go.mod @@ -36,6 +36,7 @@ require ( github.com/gin-contrib/size v1.0.2 github.com/gin-gonic/gin v1.12.0 github.com/go-git/go-billy/v5 v5.9.0 + github.com/go-openapi/runtime v0.29.2 github.com/go-openapi/strfmt v0.25.0 github.com/google/go-containerregistry v0.21.5 github.com/google/nftables v0.3.0 @@ -168,7 +169,6 @@ require ( github.com/go-openapi/jsonpointer v0.22.4 // indirect github.com/go-openapi/jsonreference v0.21.3 // indirect github.com/go-openapi/loads v0.23.2 // indirect - github.com/go-openapi/runtime v0.29.2 // indirect github.com/go-openapi/spec v0.22.1 // indirect github.com/go-openapi/swag v0.25.4 // indirect github.com/go-openapi/swag/cmdutils v0.25.4 // indirect diff --git a/packages/orchestrator/pkg/sandbox/fc/client.go b/packages/orchestrator/pkg/sandbox/fc/client.go index 06cdcd7779..7dfa4a2fda 100644 --- a/packages/orchestrator/pkg/sandbox/fc/client.go +++ b/packages/orchestrator/pkg/sandbox/fc/client.go @@ -4,11 +4,13 @@ package fc import ( "context" + "errors" "fmt" "runtime" "github.com/RoaringBitmap/roaring/v2" "github.com/firecracker-microvm/firecracker-go-sdk" + openapiruntime "github.com/go-openapi/runtime" "github.com/go-openapi/strfmt" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template" @@ -430,9 +432,9 @@ func (c *apiClient) startVM(ctx context.Context) error { } // installBalloon attaches a zero-MiB balloon device. Individual balloon -// features (free-page-reporting today, free-page-hinting next) are toggled -// via parameters so callers can opt in to any subset independently. -func (c *apiClient) installBalloon(ctx context.Context, freePageReporting bool) error { +// features (free-page-reporting, free-page-hinting) are toggled via +// parameters so callers can opt in to any subset independently. +func (c *apiClient) installBalloon(ctx context.Context, freePageReporting, freePageHinting bool) error { ctx, span := tracer.Start(ctx, "install-balloon") defer span.End() @@ -445,6 +447,7 @@ func (c *apiClient) installBalloon(ctx context.Context, freePageReporting bool) AmountMib: &amountMib, DeflateOnOom: &deflateOnOom, FreePageReporting: freePageReporting, + FreePageHinting: freePageHinting, }, } @@ -456,6 +459,40 @@ func (c *apiClient) installBalloon(ctx context.Context, freePageReporting bool) return nil } +func (c *apiClient) startBalloonHinting(ctx context.Context, acknowledgeOnStop bool) error { + params := operations.StartBalloonHintingParams{ + Context: ctx, + Body: &models.BalloonStartCmd{AcknowledgeOnStop: acknowledgeOnStop}, + } + _, err := c.client.Operations.StartBalloonHinting(¶ms) + if err != nil { + // FC returns 204 (no content) on success, but the FC OpenAPI spec only + // declares 200/400 — go-swagger treats any other 2xx as "unexpected + // success" and surfaces it as a *runtime.APIError. Honour the 2xx. + var apiErr *openapiruntime.APIError + if errors.As(err, &apiErr) && apiErr.IsSuccess() { + return nil + } + + return fmt.Errorf("error starting balloon hinting: %w", err) + } + + return nil +} + +func (c *apiClient) describeBalloonHinting(ctx context.Context) (hostCmd int64, err error) { + params := operations.DescribeBalloonHintingParams{Context: ctx} + res, err := c.client.Operations.DescribeBalloonHinting(¶ms) + if err != nil { + return 0, err + } + if res.Payload.HostCmd != nil { + hostCmd = *res.Payload.HostCmd + } + + return hostCmd, nil +} + func (c *apiClient) memoryMapping(ctx context.Context) (*memory.Mapping, error) { params := operations.GetMemoryMappingsParams{ Context: ctx, diff --git a/packages/orchestrator/pkg/sandbox/fc/drain_balloon_test.go b/packages/orchestrator/pkg/sandbox/fc/drain_balloon_test.go new file mode 100644 index 0000000000..f19187b7fb --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/fc/drain_balloon_test.go @@ -0,0 +1,33 @@ +//go:build linux + +package fc + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Regression: when hostBefore was already freePageHintDone (steady state +// after a prior cycle, common after resume-from-snapshot) and the new +// cycle completed between polls, an earlier sawBump check missed the +// transition and hung until ctx timeout. +func TestPollFphDone_FastCycleAfterPriorDone(t *testing.T) { + t.Parallel() + + calls := 0 + describe := func(_ context.Context) (int64, error) { + calls++ + + return freePageHintDone, nil + } + + ctx, cancel := context.WithTimeout(t.Context(), time.Second) + defer cancel() + + require.NoError(t, pollFphDone(ctx, describe)) + assert.Equal(t, 1, calls) +} diff --git a/packages/orchestrator/pkg/sandbox/fc/fc_metrics.go b/packages/orchestrator/pkg/sandbox/fc/fc_metrics.go index d575907982..041a036680 100644 --- a/packages/orchestrator/pkg/sandbox/fc/fc_metrics.go +++ b/packages/orchestrator/pkg/sandbox/fc/fc_metrics.go @@ -7,6 +7,7 @@ import ( "context" "encoding/json" "errors" + "fmt" "os" "time" @@ -108,10 +109,98 @@ type firecrackerBlockMetrics struct { RemainingReqsCount uint64 `json:"remaining_reqs_count"` } +// firecrackerBalloonMetrics is a subset of Firecracker's BalloonDeviceMetrics. +// Counters are SharedIncMetric — each flush emits the delta since the previous +// serialize, so we accumulate them in the reader. +type firecrackerBalloonMetrics struct { + FreePageHintCount uint64 `json:"free_page_hint_count"` + FreePageHintFreed uint64 `json:"free_page_hint_freed"` + FreePageHintFails uint64 `json:"free_page_hint_fails"` + FreePageReportCount uint64 `json:"free_page_report_count"` + FreePageReportFreed uint64 `json:"free_page_report_freed"` + FreePageReportFails uint64 `json:"free_page_report_fails"` +} + // firecrackerMetrics is the top-level structure of one Firecracker metrics JSON line. type firecrackerMetrics struct { - Net firecrackerNetMetrics `json:"net"` - Block firecrackerBlockMetrics `json:"block"` + Net firecrackerNetMetrics `json:"net"` + Block firecrackerBlockMetrics `json:"block"` + Balloon firecrackerBalloonMetrics `json:"balloon"` +} + +// BalloonMetricsSnapshot is the cumulative-since-FC-start view of +// virtio-balloon counters, exposed via Process.BalloonMetrics. +type BalloonMetricsSnapshot struct { + HintCount uint64 + HintFreed uint64 + HintFails uint64 + ReportCount uint64 + ReportFreed uint64 + ReportFails uint64 +} + +// fphFlushReadTimeout caps how long FlushAndReadBalloonMetrics waits for the +// metrics-reader goroutine to consume FC's response line. +const fphFlushReadTimeout = 2 * time.Second + +func accumulateBalloon(prev *BalloonMetricsSnapshot, b firecrackerBalloonMetrics) BalloonMetricsSnapshot { + next := BalloonMetricsSnapshot{ + HintCount: b.FreePageHintCount, + HintFreed: b.FreePageHintFreed, + HintFails: b.FreePageHintFails, + ReportCount: b.FreePageReportCount, + ReportFreed: b.FreePageReportFreed, + ReportFails: b.FreePageReportFails, + } + if prev != nil { + next.HintCount += prev.HintCount + next.HintFreed += prev.HintFreed + next.HintFails += prev.HintFails + next.ReportCount += prev.ReportCount + next.ReportFreed += prev.ReportFreed + next.ReportFails += prev.ReportFails + } + + return next +} + +// BalloonMetrics returns the cumulative virtio-balloon counters observed so far. +func (p *Process) BalloonMetrics() BalloonMetricsSnapshot { + if cur := p.balloonAccum.Load(); cur != nil { + return *cur + } + + return BalloonMetricsSnapshot{} +} + +// FlushMetrics triggers an FC metrics flush. Non-blocking on the reader. +func (p *Process) FlushMetrics(ctx context.Context) error { + return p.client.flushMetrics(ctx) +} + +// FlushAndReadBalloonMetrics flushes and waits for the reader to ingest the +// resulting line, returning the updated cumulative snapshot. On flush error +// (e.g. FC already torn down) returns the last observed snapshot. +func (p *Process) FlushAndReadBalloonMetrics(ctx context.Context) (BalloonMetricsSnapshot, error) { + pre := p.balloonAccum.Load() + if err := p.client.flushMetrics(ctx); err != nil { + return p.BalloonMetrics(), fmt.Errorf("flush metrics: %w", err) + } + + deadline := time.Now().Add(fphFlushReadTimeout) + for { + if cur := p.balloonAccum.Load(); cur != pre { + return p.BalloonMetrics(), nil + } + if time.Now().After(deadline) { + return p.BalloonMetrics(), errors.New("timeout waiting for fresh balloon metrics line") + } + select { + case <-time.After(5 * time.Millisecond): + case <-ctx.Done(): + return p.BalloonMetrics(), ctx.Err() + } + } } // startMetricsReader opens the metrics FIFO and starts a goroutine that reads @@ -262,6 +351,10 @@ func (p *Process) startMetricsReader(ctx context.Context) { if b.NoAvailBuffer > 0 { fcBlockNoAvailBuffer.Add(ctx, int64(b.NoAvailBuffer)) } + + // Balloon: SharedIncMetric resets on flush, so accumulate. + next := accumulateBalloon(p.balloonAccum.Load(), m.Balloon) + p.balloonAccum.Store(&next) } if err := scanner.Err(); err != nil { diff --git a/packages/orchestrator/pkg/sandbox/fc/fph_gates.go b/packages/orchestrator/pkg/sandbox/fc/fph_gates.go new file mode 100644 index 0000000000..2287836fd4 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/fc/fph_gates.go @@ -0,0 +1,17 @@ +package fc + +import ( + "github.com/e2b-dev/infra/packages/shared/pkg/fcversion" +) + +// FCSupportsFreePageHinting reports whether the FC version exposes +// virtio-balloon free-page-hinting. Kernel-side eligibility is gated separately +// via LaunchDarkly. +func FCSupportsFreePageHinting(fcVersion string) bool { + info, err := fcversion.New(fcVersion) + if err != nil { + return false + } + + return info.HasFreePageHinting() +} diff --git a/packages/orchestrator/pkg/sandbox/fc/process.go b/packages/orchestrator/pkg/sandbox/fc/process.go index fe9acf8fde..c794e23ed8 100644 --- a/packages/orchestrator/pkg/sandbox/fc/process.go +++ b/packages/orchestrator/pkg/sandbox/fc/process.go @@ -28,6 +28,7 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/rootfs" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/socket" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template" + "github.com/e2b-dev/infra/packages/shared/pkg/fc/client/operations" "github.com/e2b-dev/infra/packages/shared/pkg/keys" "github.com/e2b-dev/infra/packages/shared/pkg/logger" sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox" @@ -135,6 +136,10 @@ type Process struct { Exit *utils.ErrorOnce client *apiClient + + // balloonAccum is the cumulative virtio-balloon snapshot summed by the + // metrics-reader goroutine (FC's SharedIncMetric resets per flush). + balloonAccum atomic.Pointer[BalloonMetricsSnapshot] } func NewProcess( @@ -302,6 +307,7 @@ func (p *Process) Create( memoryMB int64, hugePages bool, freePageReporting bool, + freePageHinting bool, options ProcessOptions, txRateLimit RateLimiterConfig, driveRateLimit RateLimiterConfig, @@ -444,14 +450,16 @@ func (p *Process) Create( } telemetry.ReportEvent(ctx, "set fc entropy config") - if freePageReporting { - err = p.client.installBalloon(ctx, freePageReporting) - if err != nil { + if freePageReporting || freePageHinting { + if err := p.client.installBalloon(ctx, freePageReporting, freePageHinting); err != nil { fcStopErr := p.Stop(ctx) return errors.Join(fmt.Errorf("error installing balloon device: %w", err), fcStopErr) } - telemetry.ReportEvent(ctx, "installed balloon device") + telemetry.ReportEvent(ctx, "installed balloon device", + attribute.Bool("balloon.free_page_reporting", freePageReporting), + attribute.Bool("balloon.free_page_hinting", freePageHinting), + ) } err = p.client.startVM(ctx) @@ -715,6 +723,72 @@ func (p *Process) Pause(ctx context.Context) error { return p.client.pauseVM(ctx) } +// freePageHintDone is FC's FREE_PAGE_HINT_DONE: the host_cmd value FC writes +// back after the guest's FREE_PAGE_HINT_STOP when start used acknowledge_on_stop. +const freePageHintDone int64 = 1 + +// DrainBalloon triggers a free-page-hinting run and blocks until the cycle +// completes or ctx fires. No-op on FC < v1.14 and when no balloon is configured. +func (p *Process) DrainBalloon(ctx context.Context) error { + ctx, span := tracer.Start(ctx, "drain-balloon") + outcome := "ok" + defer func() { + span.SetAttributes(attribute.String("drain-balloon.outcome", outcome)) + span.End() + }() + + if !FCSupportsFreePageHinting(p.Versions.FirecrackerVersion) { + outcome = "fc-unsupported" + + return nil + } + + if err := p.client.startBalloonHinting(ctx, true); err != nil { + var notConfigured *operations.StartBalloonHintingBadRequest + if errors.As(err, ¬Configured) { + outcome = "not-configured" + + return nil + } + + outcome = "start-failed" + + return fmt.Errorf("start balloon hinting: %w", err) + } + + if err := pollFphDone(ctx, p.client.describeBalloonHinting); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + outcome = "timeout" + } else { + outcome = "describe-failed" + } + + return err + } + + return nil +} + +func pollFphDone(ctx context.Context, describe func(ctx context.Context) (int64, error)) error { + backoff := 5 * time.Millisecond + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoff): + } + + host, err := describe(ctx) + if err != nil { + return fmt.Errorf("balloon hinting status: %w", err) + } + if host == freePageHintDone { + return nil + } + backoff = min(backoff*2, 50*time.Millisecond) + } +} + // CreateSnapshot VM needs to be paused before creating a snapshot. func (p *Process) CreateSnapshot(ctx context.Context, snapfilePath string) error { ctx, childSpan := tracer.Start(ctx, "create-snapshot-fc") diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index 8cc61d7a73..2912def24d 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -11,6 +11,7 @@ import ( "time" "github.com/google/uuid" + "github.com/launchdarkly/go-sdk-common/v3/ldcontext" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -169,6 +170,17 @@ type RuntimeMetadata struct { SandboxType SandboxType } +// sandboxLDContext builds an LD context with kernel/FC-version attributes for +// per-sandbox flag targeting. +func sandboxLDContext(runtime RuntimeMetadata, config *Config) ldcontext.Context { + return ldcontext.NewBuilder(runtime.SandboxID). + Kind(featureflags.SandboxKind). + SetString(featureflags.SandboxTemplateAttribute, runtime.TemplateID). + SetString(featureflags.SandboxKernelVersionAttribute, config.FirecrackerConfig.KernelVersion). + SetString(featureflags.SandboxFirecrackerVersionAttribute, config.FirecrackerConfig.FirecrackerVersion). + Build() +} + type Resources struct { Slot *network.Slot rootfs rootfs.Provider @@ -491,6 +503,9 @@ func (f *Factory) CreateSandbox( return nil }) + freePageHinting := fc.FCSupportsFreePageHinting(config.FirecrackerConfig.FirecrackerVersion) && + featureflags.IsFreePageHintingEnabled(ctx, f.featureFlags, sandboxLDContext(runtime, config)) + err = fcHandle.Create( ctx, sbxlogger.SandboxMetadata{ @@ -502,6 +517,7 @@ func (f *Factory) CreateSandbox( config.RamMB, config.HugePages, config.FreePageReporting, + freePageHinting, processOptions, fc.RateLimiterConfig{ Ops: fc.TokenBucketConfig(throttleConfig.Ops), @@ -1065,10 +1081,24 @@ func (s *Sandbox) Pause( // all default to 0 which disables the chain entirely. Non-fatal. s.bestEffortReclaim(ctx) + // Drain free-page-hinting before pause so the snapshot doesn't capture + // pages the guest already considers free. Timeout per use case; 0 disables. + if t := featureflags.GetFreePageHintingTimeout(ctx, s.featureFlags, string(useCase), sandboxLDContext(s.Runtime, s.Config)); t > 0 { + drainCtx, cancel := context.WithTimeout(ctx, t) + if err := s.process.DrainBalloon(drainCtx); err != nil { + telemetry.ReportError(ctx, "balloon hinting drain failed (continuing pause)", err) + } + cancel() + } + if err := s.process.Pause(ctx); err != nil { return nil, fmt.Errorf("failed to pause VM: %w", err) } + // Best-effort flush before the rootfs export goroutine closes the FC API + // socket. Non-blocking on the reader; trades precision for pause latency. + _ = s.process.FlushMetrics(ctx) + // Snapfile is not closed as it's returned and cached for later use (like resume) snapfile := template.NewLocalFileLink(cachePaths.CacheSnapfile()) cleanup.AddNoContext(ctx, snapfile.Close) @@ -1147,6 +1177,12 @@ func (s *Sandbox) Pause( }, nil } +// FlushAndReadBalloonMetrics triggers an FC metrics flush and returns the +// updated cumulative virtio-balloon counters. Used by the FPH bench. +func (s *Sandbox) FlushAndReadBalloonMetrics(ctx context.Context) (fc.BalloonMetricsSnapshot, error) { + return s.process.FlushAndReadBalloonMetrics(ctx) +} + // MemoryPrefetchData returns the ordered page fault data for prefetch mapping. func (s *Sandbox) MemoryPrefetchData(ctx context.Context) (block.PrefetchData, error) { prefetchData, err := s.Resources.memory.PrefetchData(ctx) diff --git a/packages/orchestrator/scripts/bench-fph.sh b/packages/orchestrator/scripts/bench-fph.sh new file mode 100755 index 0000000000..0884f1f69a --- /dev/null +++ b/packages/orchestrator/scripts/bench-fph.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Compare pause memfile size with vs without virtio-balloon free-page-hinting. +# Usage: bench-fph.sh [--delay 0s] [--iterations 3] [--workload ] +# Requires root, FC v1.14+ template, and LOCAL_TEMPLATE_STORAGE_BASE_PATH. + +set -euo pipefail + +if [[ $EUID -ne 0 ]]; then echo "must run as root" >&2; exit 1; fi +if [[ $# -lt 1 ]]; then + echo "usage: $0 [--delay 0s] [--iterations 3] [--workload ]" >&2 + exit 1 +fi + +BUILD_ID="$1"; shift +DELAY="0s" +ITERATIONS="3" +# Allocate, touch, free ~256 MiB. Python bytearray is mmap'd by glibc so del() +# returns it to the kernel buddy allocator; the sleep lets FPR settle. +WORKLOAD='python3 -c "import time; b=bytearray(256<<20); [b.__setitem__(i,1) for i in range(0,len(b),4096)]; del b; time.sleep(1)"' + +while [[ $# -gt 0 ]]; do + case "$1" in + --delay) DELAY="$2"; shift 2 ;; + --iterations) ITERATIONS="$2"; shift 2 ;; + --workload) WORKLOAD="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +cd "$(dirname "$0")/.." +exec "${GO:-go}" run ./cmd/resume-build \ + -from-build "$BUILD_ID" \ + -fph-bench \ + -cmd-pause "$WORKLOAD" \ + -fph-bench-delay "$DELAY" \ + -iterations "$ITERATIONS" diff --git a/packages/shared/pkg/fcversion/sandbox_features.go b/packages/shared/pkg/fcversion/sandbox_features.go index b768f199d4..821508d0c4 100644 --- a/packages/shared/pkg/fcversion/sandbox_features.go +++ b/packages/shared/pkg/fcversion/sandbox_features.go @@ -11,3 +11,7 @@ func (v *Info) HasHugePages() bool { func (v *Info) HasFreePageReporting() bool { return v.lastReleaseVersion.Major() > 1 || (v.lastReleaseVersion.Major() == 1 && v.lastReleaseVersion.Minor() >= 14) } + +func (v *Info) HasFreePageHinting() bool { + return v.lastReleaseVersion.Major() > 1 || (v.lastReleaseVersion.Major() == 1 && v.lastReleaseVersion.Minor() >= 14) +} diff --git a/packages/shared/pkg/featureflags/flags.go b/packages/shared/pkg/featureflags/flags.go index 6d75252a05..d8d5279c72 100644 --- a/packages/shared/pkg/featureflags/flags.go +++ b/packages/shared/pkg/featureflags/flags.go @@ -224,6 +224,33 @@ var ( // Example: {"sync":500,"drop_caches":200,"compact_memory":1000,"fstrim":500} var ReclaimConfigFlag = NewJSONFlag("guest-pause-reclaim", ldvalue.Null()) +// FreePageHintingConfig controls virtio-balloon free-page-hinting. +// "enabled" configures FreePageHinting=true on the balloon at install time +// (kernel-side eligibility is targeted separately via the LD context — the +// race fixed in https://lore.kernel.org/lkml/20240429125100.7393-1-david@redhat.com/ +// is on the hinting flow, gated by the per-use-case timeouts below). +// "pause"/"build" are pre-pause drain timeouts in ms keyed by SnapshotUseCase; +// missing/zero/negative disables the drain for that use case. +// Example: {"enabled": true, "pause": 500, "build": 0} +var FreePageHintingConfig = NewJSONFlag("free-page-hinting-config", ldvalue.Null()) + +// IsFreePageHintingEnabled reports whether FPH should be configured on the +// balloon at install time. +func IsFreePageHintingEnabled(ctx context.Context, ff *Client, contexts ...ldcontext.Context) bool { + return ff.JSONFlag(ctx, FreePageHintingConfig, contexts...).GetByKey("enabled").BoolValue() +} + +// GetFreePageHintingTimeout returns the pre-pause FPH drain timeout for the +// given SnapshotUseCase. Zero means disabled. +func GetFreePageHintingTimeout(ctx context.Context, ff *Client, useCase string, contexts ...ldcontext.Context) time.Duration { + ms := ff.JSONFlag(ctx, FreePageHintingConfig, contexts...).GetByKey(useCase).IntValue() + if ms <= 0 { + return 0 + } + + return time.Duration(ms) * time.Millisecond +} + type ReclaimConfig struct { Sync time.Duration DropCaches time.Duration