From cf0f1d8af8e2e595155cb01f168fe4b511a7dbd3 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Fri, 19 Jun 2026 09:43:22 +0530 Subject: [PATCH 1/8] PR1: swap-encryption benchmark - shared DaemonSet/pod infra (layer 1/5); manifest moved to data/cluster and rendered via vm_util --- .../cluster/swap_encryption_daemonset.yaml.j2 | 266 +++ .../swap_encryption_benchmark.py | 1529 +++++++++++++++++ 2 files changed, 1795 insertions(+) create mode 100644 perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 create mode 100644 perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 new file mode 100644 index 0000000000..c40ec79dff --- /dev/null +++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 @@ -0,0 +1,266 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ ds_name }} + namespace: {{ ds_namespace }} + labels: + app: {{ ds_label }} +spec: + selector: + matchLabels: + app: {{ ds_label }} + template: + metadata: + labels: + app: {{ ds_label }} + spec: + hostPID: true + hostNetwork: true + # Pin to the benchmark nodepool — never schedule on the dummy default pool. + nodeSelector: + pkb_nodepool: {{ benchmark_nodepool }} + tolerations: + - operator: Exists + containers: + - name: benchmark + image: {{ image }} + command: + - bash + - -c + - | + echo "[pkb] Installing benchmark tools..." + # Retry apt-get up to 3 times — transient network failures are + # common on a freshly-started GKE node. Critical tools (fio, + # stress-ng) must be present before we write the ready sentinel; + # a silent || true here would cause /tmp/pkb_ready to appear even + # when tools are missing, breaking all subsequent phases. + PKB_APT_OK=0 + for _attempt in 1 2 3; do + apt-get update -qq 2>&1 || true + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\ + fio \\ + stress-ng \\ + sysstat \\ + cryptsetup \\ + mdadm \\ + redis-server \\ + redis-tools \\ + git \\ + wget \\ + curl \\ + make \\ + gcc \\ + bc \\ + flex \\ + bison \\ + libelf-dev \\ + libssl-dev \\ + cgroup-tools \\ + nvme-cli \\ + util-linux \\ + python3-pip \\ + libevent-dev \\ + libssl-dev \\ + libpcre3-dev \\ + zlib1g-dev \\ + build-essential \\ + autoconf \\ + automake \\ + libtool \\ + libtool-bin \\ + pkg-config \\ + python3-dev \\ + default-jre-headless \\ + 2>&1 && PKB_APT_OK=1 && break + echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2 + sleep 15 + done + if [ "$PKB_APT_OK" != "1" ] || \\ + ! command -v fio >/dev/null 2>&1 || \\ + ! command -v stress-ng >/dev/null 2>&1; then + echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2 + exit 1 + fi + echo "[pkb] Installing memtier_benchmark from source..." + # Pin a stable release tag — building from the moving default + # branch (HEAD) intermittently broke (memtier_benchmark not found + # → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the + # version PKB's memtier package (memtier.MemtierResult.Parse) is + # validated against and builds cleanly with the apt deps above. + # Fall back to HEAD only if the tagged clone fails. + if ! command -v memtier_benchmark >/dev/null 2>&1; then + (cd /tmp && \\ + rm -rf memtier_benchmark && \\ + ( git clone --depth 1 --branch 2.2.1 \\ + https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\ + git clone --depth 1 \\ + https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\ + cd memtier_benchmark && \\ + autoreconf -ivf 2>&1 && \\ + ./configure 2>&1 && \\ + make -j$(nproc) 2>&1 && \\ + make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\ + echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used" + fi + if command -v memtier_benchmark >/dev/null 2>&1; then + echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)" + fi + echo "[pkb] Installing esrally (lightweight)..." + python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true + pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\ + pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\ + echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used" + if command -v esrally >/dev/null 2>&1; then + echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)" + else + echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2 + fi + echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..." + # Phase 3c needs a real search server on :9200. Nothing in apt + # ships one and the pod has no systemd, so install the OpenSearch + # bundle (ships its own JDK) and launch the binary directly in the + # phase. All best-effort: if any step fails the phase probes the + # endpoint and skips cleanly rather than recording fake timings. + if [ ! -x /opt/opensearch/bin/opensearch ]; then + OS_VER=2.15.0 + (cd /opt && \\ + wget -q --timeout=600 -O os.tgz \\ + "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\ + tar -xzf os.tgz && rm -f os.tgz && \\ + mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\ + echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2 + fi + if [ -x /opt/opensearch/bin/opensearch ]; then + # pkbos owns and runs OpenSearch (it refuses to run as root). + # Give it a home so HOME/temp paths are writable. + id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true + printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\ + > /opt/opensearch/config/opensearch.yml + mkdir -p /opt/opensearch/config/jvm.options.d + # 2 GB heap: 512 MB was too small and OpenSearch aborted early. + # On a 252 GB node this still leaves plenty of page cache to + # pressure into swap during the phase. + printf -- '-Xms2g\\n-Xmx2g\\n' \\ + > /opt/opensearch/config/jvm.options.d/pkb-heap.options + sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true + # CRITICAL: never run the binary as root here (it bails and + # leaves root-owned files in logs/ that block the pkbos server). + # Clear any stale logs and chown everything to pkbos LAST. + rm -f /opt/opensearch/logs/* 2>/dev/null || true + chown -R pkbos /opt/opensearch 2>/dev/null || true + echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)" + fi + echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..." + PKB_KVER="{{ kernel_version }}" + PKB_KROOT="/mnt/stateful_partition/pkb_kernel" + PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz" + PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER" + PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz" + mkdir -p "$PKB_KROOT" + if [ ! -f "$PKB_KTARBALL" ]; then + wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\ + echo "[pkb] WARNING: kernel tarball download failed" >&2 + fi + if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then + echo "[pkb] Extracting kernel source (xz)..." + tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\ + echo "[pkb] WARNING: kernel source extraction failed" >&2 + fi + echo "[pkb] Unlocking container cgroup swap limits..." + # GKE cgroup v2 sets memory.swap.max=0 per-container, which + # prevents swap usage even when the node has a swap device and + # vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because + # the kernel can't page out to swap for this cgroup. + # + # NOTE: the old approach derived the cgroup path from + # /proc/self/cgroup, but inside a cgroup namespace that reports + # "0::/" — so the write targeted the host ROOT cgroup, silently + # no-op'd, and swap stayed locked (the OOM-in-15s symptom above). + # /sys is the host cgroup tree (hostPath mount) and this pod is + # privileged, so instead unlock swap across the entire kubepods + # hierarchy, which is guaranteed to contain our own container. + if [ -d /sys/fs/cgroup/kubepods.slice ] || \ + [ -d /sys/fs/cgroup/kubepods ]; then + # cgroup v2: write 'max' to every memory.swap.max under kubepods*. + find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \ + 2>/dev/null | while read -r _f; do + echo max > "$_f" 2>/dev/null || true + done + fi + # Best-effort: our own namespaced path and the unified root. + PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \ + 2>/dev/null) + for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \ + /sys/fs/cgroup/memory.swap.max; do + [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; } + done + # cgroup v1 fallback: lift the combined RAM+swap hard ceiling. + find /sys/fs/cgroup/memory -path '*kubepods*' \ + -name memory.memsw.limit_in_bytes 2>/dev/null \ + | while read -r _f; do + echo -1 > "$_f" 2>/dev/null || true + done + # Verify and surface the result in the pod log. grep -L lists + # files that do NOT contain 'max' on their first line, i.e. ones + # still capping swap. + PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \ + -name memory.swap.max 2>/dev/null \ + | xargs -r grep -L '^max' 2>/dev/null | head -1) + if [ -n "$PKB_STILL_CAPPED" ]; then + echo "[pkb] WARNING: cgroup swap still capped at \ + $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \ + OOM-killed before swap is exercised" >&2 + else + echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)" + fi + echo "[pkb] Tools installed. Writing ready sentinel." + touch /tmp/pkb_ready + sleep infinity + securityContext: + privileged: true + capabilities: + add: ["SYS_ADMIN", "IPC_LOCK"] + resources: + requests: + memory: "512Mi" + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: dev + mountPath: /dev + - name: sys + mountPath: /sys + - name: run + mountPath: /run + - name: proc-host + mountPath: /proc-host + readOnly: true + - name: stateful-partition + mountPath: /mnt/stateful_partition + - name: lib-modules + mountPath: /lib/modules + readOnly: true + volumes: + - name: dev + hostPath: + path: /dev + - name: sys + hostPath: + path: /sys + - name: run + hostPath: + path: /run + - name: proc-host + hostPath: + path: /proc + - name: stateful-partition + hostPath: + path: /mnt/stateful_partition + type: DirectoryOrCreate + - name: lib-modules + hostPath: + path: /lib/modules + type: Directory diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py new file mode 100644 index 0000000000..5bdc933bba --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -0,0 +1,1529 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark. + +Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws + +== Architecture == + +Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's +container_cluster abstraction, then deploys a privileged DaemonSet whose +pod has full host-device access (/dev, /sys, hostPID). All benchmark +phases execute inside this pod via kubectl exec, so measurements reflect +actual cluster-node behaviour including Kubernetes overhead (kubelet, +containerd cgroup hierarchy, etc.). + + GKE nodes ── dm-crypt with ephemeral key (go/node:swap-encryption) + swap device: /dev/mapper/swap_encrypted (over dedicated + hyperdisk or LSSD RAID-0 /dev/md0). + Single-disk fallback: plain loop device on + /mnt/stateful_partition — dm-crypt is blocked by COS + kernel namespace restrictions from inside a pod. + + EKS nodes ── NVMe Instance Store, Nitro hardware-offloaded encryption + swap device: /dev/nvme1n1 (or auto-detected) + +== Benchmark Phases == + + Phase 1 – fio Microbenchmarks + Run fio directly on the swap block device (swapoff first) to measure + the hardware + encryption ceiling: random IOPS (4K), sequential + bandwidth (1M), and completion latency (iodepth=1). + + Phase 2a – CPU Overhead + stress-ng drives sustained swap I/O; vmstat and pidstat capture + swap-in/out rates and per-process CPU cost (kswapd, kcryptd, + dm-crypt threads on GKE; Nitro offload on EKS). + + Phase 2b – I/O Interference + Baseline fio on a scratch volume → re-run with concurrent swap + pressure. IOPS/latency delta = storage contention cost. + + Phase 3a – Redis Latency + Dataset loaded beyond container memory limit → GET/SET p99 latency + measured while kernel swaps pages. + + Phase 3b – Kernel Build + Linux compiled inside a memory-capped cgroup; slowdown ratio vs + unconstrained baseline. + + Phase 3c – OpenSearch + Bulk-index + search query under swap pressure (esrally or curl). +""" + +import json +import logging +import re +import textwrap +import time +from typing import Any + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker import errors +from perfkitbenchmarker import sample +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.resources.container_service import kubectl + +FLAGS = flags.FLAGS + +# --------------------------------------------------------------------------- +# Benchmark identity +# --------------------------------------------------------------------------- + + + +FLAGS = flags.FLAGS + + +BENCHMARK_NAME = 'swap_encryption' + + +BENCHMARK_CONFIG = """ +swap_encryption: + description: > + GKE vs. EKS swap encryption and LSSD performance comparison. + Two-step nodepool setup: PKB provisions a minimal cluster with a cheap + default nodepool (Step 1), then Prepare() adds the real benchmark + nodepool (n4-highmem-32 / c4-*-lssd, COS_CONTAINERD, 80k IOPS) with a + node-level startup script that configures dm-crypt swap before any pod + is scheduled, then removes the default nodepool (Step 2). All benchmark + phases run inside a privileged DaemonSet pinned to the benchmark nodepool. + flags: {} + container_cluster: + type: Kubernetes + vm_count: 1 + vm_spec: + GCP: + # Cheap placeholder — the benchmark nodepool is created in Prepare(). + machine_type: e2-medium + boot_disk_size: 20 + AWS: + # Cheap placeholder — the benchmark nodegroup is added in Prepare(). + machine_type: t3.medium + boot_disk_size: 20 +""" + + +_DAEMONSET_IMAGE = flags.DEFINE_string( + 'swap_encryption_daemonset_image', + 'ubuntu:22.04', + 'Container image used for the privileged benchmark DaemonSet pod.', +) + + +_NODEPOOL = flags.DEFINE_string( + 'swap_encryption_nodepool', + 'benchmark', + 'Name of the node pool to deploy the benchmark DaemonSet on.', +) + + +_INSTANCE_SIZE_LABEL = flags.DEFINE_string( + 'swap_encryption_instance_size_label', + '', + 'Human-readable label for the current instance size being tested, e.g. ' + '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that ' + 'results from multiple PKB runs across different instance sizes can be ' + 'collated and compared. Defaults to the value reported by the cloud ' + 'metadata endpoint inside the pod.', +) + + +_COLLECT_COST = flags.DEFINE_boolean( + 'swap_encryption_collect_cost', + False, + 'When True, emit a cost_estimate_usd sample using on-demand pricing ' + 'for the instance type detected at runtime.', +) + + +_FAIL_ON_DEGRADED = flags.DEFINE_boolean( + 'swap_encryption_fail_on_degraded', + True, + 'When True (default), raise an error at the end of Run() if the run was ' + 'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and ' + 'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng ' + 'swap-pressure phase was OOM-killed before completing. This prevents PKB ' + 'from reporting SUCCEEDED for a run whose post-eviction phases produced ' + 'empty or meaningless data. Set False to keep the legacy behaviour of ' + 'always returning whatever partial samples were collected.', +) + + +_PHASES = flags.DEFINE_list( + 'swap_encryption_phases', + ['all'], + 'Which Run() phases to execute, for fast iteration against an ' + 'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). ' + 'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng ' + 'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), ' + '3b (kernel build), 3c (opensearch). Default "all" runs everything. ' + 'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. ' + 'Phases not listed are skipped and do not affect the degraded-run gate ' + '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").', +) + + +_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( + 'swap_encryption_benchmark_machine_type', + 'n4-highmem-32', + 'Machine type for the benchmark nodepool created in Prepare(). ' + 'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd ' + '(LSSD RAID-0). The matching swap setup is selected automatically.', +) + + +_BENCHMARK_LSSD = flags.DEFINE_boolean( + 'swap_encryption_lssd', + False, + 'Force LSSD RAID-0 swap path even when the machine type name does not ' + 'contain "lssd". Auto-detected from machine type when False.', +) + + +_LSSD_COUNT = flags.DEFINE_integer( + 'swap_encryption_lssd_count', + 1, + 'Number of local NVMe SSDs to attach as raw block devices ' + '(--local-nvme-ssd-block count=N). Must match the fixed local SSD ' + 'count for the chosen machine type: c4-standard-8-lssd=1, ' + 'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). ' + 'Default 1 covers most single-lssd machine types.', +) + + +_NODE_IMAGE_TYPE = flags.DEFINE_string( + 'swap_encryption_node_image_type', + 'UBUNTU_CONTAINERD', + 'GKE node image type for the benchmark nodepool. ' + 'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks ' + 'down device-mapper at the kernel LSM level and cryptsetup hangs ' + 'indefinitely from any pod context (even privileged, even via nsenter ' + 'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup ' + 'from privileged pods without restriction. ' + 'Use COS_CONTAINERD only when dm-crypt is disabled ' + '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. ' + 'AL2 on EKS.', +) + + +_BOOT_DISK_TYPE = flags.DEFINE_string( + 'swap_encryption_boot_disk_type', + 'hyperdisk-balanced', + 'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced ' + 'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 ' + 'dev/test machines, which do not support hyperdisk-balanced.', +) + + +_BOOT_DISK_IOPS = flags.DEFINE_integer( + 'swap_encryption_boot_disk_iops', + 80000, + 'Provisioned IOPS for the boot disk (hyperdisk-balanced only). ' + '80 000 is the COS max-IOPS target. Ignored for pd-ssd.', +) + + +_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer( + 'swap_encryption_boot_disk_throughput', + 1200, + 'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced ' + 'only). Must be set together with iops. 1200 MB/s pairs with 80 000 ' + 'IOPS for production; use 140 (minimum) for dev/test. Ignored for ' + 'pd-ssd.', +) + + +_BOOT_DISK_SIZE_GB = flags.DEFINE_integer( + 'swap_encryption_boot_disk_size_gb', + 500, + 'Boot disk size in GiB for the benchmark nodepool. 500 GiB is ' + 'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run ' + '(see Engineer Assignments table in execution-plan.md). ' + 'For LSSD configs the boot disk is smaller; 100 GiB is fine.', +) + + +_ADD_SWAP_DISK = flags.DEFINE_boolean( + 'swap_encryption_add_swap_disk', + False, + 'Attach a dedicated second disk to the benchmark nodepool for use as ' + 'the swap device. Required for dm-crypt measurement on single-boot-disk ' + 'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper ' + 'from pod namespaces. The second disk is provisioned via ' + '--additional-node-disk using the same type/IOPS/throughput as the boot ' + 'disk flags.', +) + + +_SWAP_DISK_SIZE_GB = flags.DEFINE_integer( + 'swap_encryption_swap_disk_size_gb', + 500, + 'Size in GiB of the dedicated swap disk when ' + '--swap_encryption_add_swap_disk is True. Must satisfy the ' + 'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.', +) + + +_DS_NAME = 'pkb-swap-benchmark' + + +_DS_NAMESPACE = 'default' + + +_DS_LABEL = 'pkb-swap-benchmark' + + +_active_pod: list[str] = [] # single-element list so closures can mutate it + + +_degraded_reasons: list[str] = [] + + +_pod_lost: list[str] = [] + + +_oom_events: list[str] = [] + + +_BENCHMARK_NODEPOOL = 'benchmark' + + +_DEFAULT_NODEPOOL = 'default-pool' + + +def _daemonset_yaml(image: str) -> str: + """Render the privileged benchmark DaemonSet manifest. + + The manifest is a PKB data file rendered with Jinja2 + (data/cluster/swap_encryption_daemonset.yaml.j2) rather than an inline + string, per PKB conventions. The DaemonSet is pinned to the benchmark + nodepool via nodeSelector so it never lands on the dummy default pool. + """ + return vm_util.ReadAndRenderJinja2Template( + 'cluster/swap_encryption_daemonset.yaml.j2', + ds_name=_DS_NAME, + ds_namespace=_DS_NAMESPACE, + ds_label=_DS_LABEL, + benchmark_nodepool=_BENCHMARK_NODEPOOL, + image=image, + kernel_version=_KERNEL_VERSION.value, + ) + + +def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def Prepare(spec) -> None: + """Two-step nodepool setup then DaemonSet deployment. + + Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap + e2-medium default nodepool. + + Step 2 (this function): + a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with + COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures + dm-crypt swap at the OS level — before any pod is scheduled. + b. Delete the dummy default nodepool to stop its cost immediately. + c. Deploy the privileged DaemonSet (pinned via nodeSelector to the + benchmark nodepool) and wait for tools to install. + """ + cluster = spec.container_cluster + + # ── Step 2a: add real benchmark nodepool ──────────────────────────────── + if getattr(cluster, 'project', None): + # GCP path: true two-step nodepool setup + logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') + _create_benchmark_node_pool(cluster) + + # ── Step 2b: wait for the benchmark node to join and be Ready ───────── + logging.info('[swap_encryption] Step 2b: waiting for benchmark node') + _wait_for_benchmark_node() + + # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── + # --additional-node-disk is not available in all gcloud versions, so we + # create + attach the disk after the node is up using gcloud compute. + if _ADD_SWAP_DISK.value: + logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk') + _attach_swap_disk(cluster) + else: + # AWS / EKS: nodepool management is external. PKB's cluster creation + # labels nodes pkb_nodepool=default, so re-label all existing nodes here + # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark). + logging.info( + '[swap_encryption] EKS cluster — labelling existing nodes with ' + 'pkb_nodepool=%s so the DaemonSet nodeSelector matches.', + _BENCHMARK_NODEPOOL) + kubectl.RunKubectlCommand([ + 'label', 'nodes', '--all', '--overwrite', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + ]) + # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs + # on io2 hardware-encrypted storage (no-op unless swap_type=io2). + _ensure_io2_volume() + + # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── + # Deploy and wait for the pod BEFORE deleting the default nodepool. + # Deleting the default pool while the benchmark node is still joining causes + # a temporary API server i/o timeout (control plane busy with two nodepool + # ops simultaneously). Once the pod is Running the cluster is fully stable. + logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet') + _deploy_daemonset() + + pod = _wait_for_benchmark_pod() + logging.info('[swap_encryption] Benchmark pod ready: %s', pod) + + # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── + if getattr(cluster, 'project', None): + logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool') + _delete_default_node_pool(cluster) + # The DaemonSet pod may be evicted and rescheduled with a new name during + # the nodepool deletion (cluster control plane briefly interrupts pod + # lifecycle). Re-resolve the pod name to avoid stale-reference errors on + # all subsequent _pod_exec calls. + logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod ' + 'after nodepool deletion') + pod = _wait_for_benchmark_pod() + logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod) + + +def _phase_selected(token: str) -> bool: + """Return True if phase `token` should run given --swap_encryption_phases. + + 'all' (the default) selects every phase. Otherwise only the comma-separated + tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. + """ + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ('all' in selected) or (token.lower() in selected) + + +def Run(spec) -> list[sample.Sample]: + """Execute all benchmark phases with gate logic. + + Execution is structured in three gated tiers matching the execution plan: + + Tier 1 (Gate 1) — fio microbenchmarks + Raw I/O ceiling of the swap device. Gate 1 fails if fio produces + zero samples (device not found, O_DIRECT error, etc.). + + Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference + Requires an active swap device (Gate 1 must pass). Gate 2 fails if + stress-ng does not complete within timeout. + + Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) + Independent of Tier 2 results; always attempted if Gate 1 passed. + Individual workload failures are logged but do not abort the others. + + If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring + application-level swap performance when the raw device is inaccessible. + """ + pod = _wait_for_benchmark_pod() + # Initialise the module-level active-pod tracker so _pod_exec and + # _recover_pod can transparently redirect to a replacement pod if the + # original is evicted during the run. + _active_pod.clear() + _active_pod.append(pod) + _degraded_reasons.clear() + _pod_lost.clear() + _oom_events.clear() + original_pod = pod + swap_dev = _detect_swap_device(pod) + base_meta = _build_metadata(pod, swap_dev) + results: list[sample.Sample] = [] + t_run_start = time.time() + + logging.info('[swap_encryption] swap device: %s', swap_dev) + + # ── Cost estimate ───────────────────────────────────────────────────────── + if _COLLECT_COST.value: + elapsed = time.time() - t_run_start + results += _collect_cost_sample(pod, elapsed, base_meta) + + # ── Final degradation gate ──────────────────────────────────────────────── + # The phase try/except blocks above keep the run alive so partial data is + # still collected, but that means a catastrophic failure (pod OOM-evicted + # mid-run, no fio data, stress-ng killed before it could drive swap I/O) + # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. + # Detect those conditions here and surface them explicitly. + if _active_pod and _active_pod[0] != original_pod: + _degraded_reasons.append( + f'benchmark pod was replaced during the run ' + f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap ' + f'pressure; phases executed after the eviction ran against a ' + f'freshly-initialised pod (empty /tmp, swap re-setup) and may be ' + f'invalid') + if _pod_lost: + _degraded_reasons.append( + f'benchmark pod(s) went NotFound during the run ({", ".join(_pod_lost)}) ' + f'— the pod died (node memory-pressure eviction or container exit) and ' + f'any phase running at or after that point (e.g. kernel-build baseline, ' + f'OpenSearch) produced invalid data') + if _oom_events: + _degraded_reasons.append( + f'OOM kill(s) (rc=137) occurred during the run on pod(s) ' + f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by ' + f'the OOM killer (the container may have restarted in place), so the ' + f'affected phase(s) produced no or partial data') + + degraded = bool(_degraded_reasons) + results.append(sample.Sample( + 'swap_encryption_run_status', + 0.0 if degraded else 1.0, + 'status', + dict(base_meta, + degraded=degraded, + degraded_reasons='; '.join(_degraded_reasons) or 'none', + num_samples=len(results) + 1))) + + if degraded: + msg = ('[swap_encryption] RUN DEGRADED — ' + + '; '.join(_degraded_reasons)) + logging.error(msg) + if _FAIL_ON_DEGRADED.value: + # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The + # samples collected so far are still published by PKB before the failure + # is recorded, so no data is lost. + raise errors.Benchmarks.RunError(msg) + else: + logging.info('[swap_encryption] Run completed cleanly (%d samples)', + len(results)) + + return results + + +def Cleanup(spec) -> None: + """Remove the DaemonSet and tear down any swap configuration.""" + pod = _wait_for_benchmark_pod(timeout=30) + if pod: + _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True) + _pod_exec(pod, textwrap.dedent(""" + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + """), ignore_failure=True) + # Clean up loop device backing files (single-disk fallback path). + _pod_exec(pod, textwrap.dedent(""" + for backing in /var/pkb_swap_backing /run/pkb_swap_backing \ + /mnt/stateful_partition/pkb_swap_backing + do + losetup -j "$backing" 2>/dev/null | awk -F: '{print $1}' | \ + while read dev + do + losetup -d "$dev" 2>/dev/null || true + done + rm -f "$backing" + done + """), ignore_failure=True) + _pod_exec(pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", + ignore_failure=True) + + _delete_daemonset() + + # Detach and delete the dedicated swap disk if one was provisioned. + cluster = spec.container_cluster + if _ADD_SWAP_DISK.value and getattr(cluster, 'project', None): + _detach_and_delete_swap_disk(cluster) + + +def _deploy_daemonset() -> None: + """Apply the benchmark DaemonSet manifest to the cluster.""" + manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value) + with vm_util.NamedTemporaryFile(mode='w', suffix='.yaml') as f: + f.write(manifest) + f.close() + kubectl.RunKubectlCommand(['apply', '-f', f.name]) + logging.info('[swap_encryption] DaemonSet applied') + + +def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: + """Wait until the DaemonSet pod is Running AND tools are installed. + + The benchmark container installs apt packages on first start and writes + /tmp/pkb_ready when done (~2-4 min on a cold node). We must wait for + that sentinel before exec-ing any commands, otherwise tools like + cryptsetup / fio may not yet be on PATH. + + Uses tab-separated name/phase output so kubectl always exits 0 regardless + of whether any pods are present, avoiding jsonpath index errors. + """ + deadline = time.time() + timeout + last_phase = '' + ready_pod = None # pod name once phase == Running + + while time.time() < deadline: + # ── Step 1: wait for Running phase ────────────────────────────────────── + if ready_pod is None: + out, _, rc = kubectl.RunKubectlCommand([ + 'get', 'pods', + '-l', f'app={_DS_LABEL}', + '-n', _DS_NAMESPACE, + '-o', + r'jsonpath={range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}', + ], raise_on_failure=False) + + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split('\t') + if len(parts) == 2: + pod_name, phase = parts[0].strip(), parts[1].strip() + if phase == 'Running': + logging.info('[swap_encryption] Pod %s is Running – ' + 'waiting for tool install to finish...', pod_name) + ready_pod = pod_name + break + if phase != last_phase: + logging.info('[swap_encryption] Pod %s phase: %s', pod_name, phase) + last_phase = phase + if phase in ('Pending',): + _log_pod_events(pod_name) + else: + logging.info('[swap_encryption] Waiting for DaemonSet pod to appear...') + + # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── + if ready_pod is not None: + sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand([ + 'exec', ready_pod, '-n', _DS_NAMESPACE, + '--', 'test', '-f', '/tmp/pkb_ready', + ], raise_on_failure=False) + if sentinel_rc == 0: + logging.info( + '[swap_encryption] Pod %s ready (tools installed)', ready_pod) + return ready_pod + # "container not found" means the container crashed (CrashLoopBackOff or + # exited) — treat it as a hard reset: re-check pod phase on next iteration. + if ('container not found' in sentinel_err + or 'unable to upgrade connection' in sentinel_err): + logging.warning('[swap_encryption] Pod %s: container not running (%s) ' + '— will re-check pod state', ready_pod, sentinel_err.strip()) + ready_pod = None + last_phase = '' + else: + logging.info( + '[swap_encryption] Pod %s: still installing tools...', ready_pod) + + time.sleep(15) + + logging.warning( + '[swap_encryption] Benchmark pod not ready after %ds', timeout) + return None + + +def _log_pod_events(pod_name: str) -> None: + """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" + events_out, _, _ = kubectl.RunKubectlCommand([ + 'describe', 'pod', pod_name, + '-n', _DS_NAMESPACE, + ], raise_on_failure=False) + # Only log the Events section to keep output manageable + in_events = False + lines = [] + for line in events_out.splitlines(): + if line.startswith('Events:'): + in_events = True + if in_events: + lines.append(line) + if lines: + logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])) + else: + logging.info('[swap_encryption] kubectl describe output:\n%s', + events_out[-2000:] if len(events_out) > 2000 else events_out) + + +def _delete_daemonset() -> None: + """Delete the benchmark DaemonSet.""" + kubectl.RunKubectlCommand([ + 'delete', 'daemonset', _DS_NAME, + '-n', _DS_NAMESPACE, + '--ignore-not-found', + ], raise_on_failure=False) + logging.info('[swap_encryption] DaemonSet deleted') + + +def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str: + """Return a bash startup script for the benchmark nodepool. + + NOTE: This function is not currently used. GKE reserves the + `startup-script` node metadata key, so dm-crypt setup is performed + from within the privileged DaemonSet pod instead (see + _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference. + + Args: + enable_dmcrypt: When True, wrap the swap device in dm-crypt plain + mode (aes-xts-plain64, ephemeral random key) matching GKE's + go/node:swap-encryption implementation. + lssd: When True, build a RAID-0 array across all local SSDs before + setting up swap (matches go/gke-swap-lssd). + + Returns: + A bash script string suitable for running as root at node boot. + """ + dmcrypt_str = 'true' if enable_dmcrypt else 'false' + lssd_str = 'true' if lssd else 'false' + + return textwrap.dedent(f"""\ + #!/bin/bash + # PKB swap_encryption_benchmark — nodepool startup script. + # Configures swap once at node boot so all benchmark phases see a + # pre-warmed swap device. Runs as root on the COS host. + set -euo pipefail + ENABLE_DMCRYPT={dmcrypt_str} + LSSD={lssd_str} + + _wait_dev() {{ + local d=$1 i + for i in $(seq 1 30); do [ -b "$d" ] && return 0; sleep 2; done + echo "[pkb-startup] device $d not ready" >&2; return 1 + }} + + _boot_dev() {{ + lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1 || echo nvme0n1 + }} + + if $LSSD; then + BOOT=$(_boot_dev) + # Collect all non-rotational non-boot block devices (local SSDs) + DEVS=$(lsblk -d -o NAME,ROTA | awk '$2=="0"{{print "/dev/"$1}}' | grep -v "/dev/$BOOT" || true) + N=$(echo "$DEVS" | grep -c /dev/ || true) + if [ "$N" -gt 1 ]; then + modprobe raid0 || true + # shellcheck disable=SC2086 + mdadm --create /dev/md0 --level=0 --raid-devices="$N" $DEVS --force + TARGET=/dev/md0 + elif [ "$N" -eq 1 ]; then + TARGET=$(echo "$DEVS" | head -1) + else + echo "[pkb-startup] no LSSD devices found; skipping swap setup" >&2 + exit 0 + fi + else + BOOT=$(_boot_dev) + RAW=$(lsblk -d -o NAME,TYPE | awk '$2=="disk"{{print $1}}' | grep -v "^$BOOT$" | head -1 || true) + if [ -z "$RAW" ]; then + echo "[pkb-startup] no secondary disk found for hyperdisk swap" >&2 + exit 0 + fi + TARGET=/dev/$RAW + fi + + _wait_dev "$TARGET" + + if $ENABLE_DMCRYPT; then + modprobe dm-crypt || true + dd if=/dev/urandom bs=32 count=1 2>/dev/null | \\ + cryptsetup open --type plain \\ + --cipher aes-xts-plain64 --key-size 256 \\ + --key-file=- "$TARGET" pkb_swap + SWAP_DEV=/dev/mapper/pkb_swap + else + SWAP_DEV=$TARGET + fi + + mkswap "$SWAP_DEV" + swapon "$SWAP_DEV" + echo "[pkb-startup] swap active on $SWAP_DEV (dmcrypt=$ENABLE_DMCRYPT lssd=$LSSD)" + """) + + +_HYPERDISK_MAX_IOPS_PER_MBPS = 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s + + +def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: + """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. + + Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed + 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails + with "Requested provisioned throughput is too low for the provisioned iops". + Clamp throughput UP to the minimum the requested IOPS need (plus a small + margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk + creation. + """ + min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) + if throughput < min_tput: + logging.warning( + '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for ' + '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d', + throughput, iops, min_tput, min_tput) + return min_tput + return throughput + + +def _create_benchmark_node_pool(cluster) -> None: + """Add the benchmark nodepool to the existing cluster (Step 2 of setup). + + Uses: + --swap_encryption_benchmark_machine_type (default n4-highmem-32) + --swap_encryption_node_image_type (default COS_CONTAINERD) + --swap_encryption_boot_disk_iops (default 80000) + --swap_encryption_enable_dmcrypt (default True) + + The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet + nodeSelector targets it exclusively. dm-crypt swap setup is performed + from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / + _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata + because GKE reserves that metadata key and rejects it at the API level. + """ + machine_type = _BENCHMARK_MACHINE_TYPE.value + # Auto-detect LSSD from machine type name; flag overrides only when True. + is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower() + + # Determine zone/region from the cluster object. + zone_flags: list[str] = [] + if getattr(cluster, 'zones', None): + zone_flags = ['--zone', cluster.zones[0]] + elif getattr(cluster, 'region', None): + zone_flags = ['--region', cluster.region] + + # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). + # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on + # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk + # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable + # headroom and matches the Config 2 spec in the Engineer Assignments table). + disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value + + disk_type = _BOOT_DISK_TYPE.value + cmd = [ + 'gcloud', 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL, + '--cluster', cluster.name, + '--project', cluster.project, + '--machine-type', machine_type, + '--image-type', _NODE_IMAGE_TYPE.value, + '--disk-type', disk_type, + '--disk-size', str(disk_size_gb), + '--num-nodes', '1', + '--node-labels', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '--no-enable-autoupgrade', + '--no-enable-autorepair', + ] + zone_flags + + # IOPS and throughput provisioning only applies to hyperdisk-* types AND + # only when the boot disk is also the swap device (non-LSSD configs). + # For LSSD machines the boot disk is OS-only; swap is on local NVMe. + # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the + # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). + if disk_type.startswith('hyperdisk') and not is_lssd: + cmd += [ + '--boot-disk-provisioned-iops', str(_BOOT_DISK_IOPS.value), + '--boot-disk-provisioned-throughput', + str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, + _BOOT_DISK_THROUGHPUT.value)), + ] + + # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm + # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). + if is_lssd: + cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'] + + logging.info('[swap_encryption] Creating benchmark nodepool: %s / %s / ' + 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' + 'add_swap_disk=%s', + _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value, + disk_size_gb, _BOOT_DISK_IOPS.value, + _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value) + + # LSSD nodepools take longer to provision than PD-only nodepools because + # GKE must also initialise the local NVMe devices before marking nodes Ready. + # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. + stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=1200, + raise_on_failure=False) + + if rc != 0: + # Idempotent prepare: if the nodepool already exists (e.g. re-running + # --run_stage=prepare,run to redeploy the DaemonSet onto an existing + # cluster), reuse it instead of failing. gcloud returns a 409 / + # "Already exists" message in this case. + low = (stderr or '').lower() + if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low: + logging.info('[swap_encryption] Benchmark nodepool already exists — ' + 'reusing it (idempotent prepare); proceeding to DaemonSet') + return + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to create benchmark nodepool ' + f'(rc={rc}): {stderr}' + ) + logging.info('[swap_encryption] Benchmark nodepool ready') + + +def _wait_for_benchmark_node(timeout: int = 900) -> None: + """Block until a node labelled pkb_nodepool=benchmark is Ready. + + gcloud container node-pools create returns as soon as the API accepts the + request — the actual node VM may take another 2-4 minutes to boot, join the + cluster, and pass its readiness checks. Deploying the DaemonSet before that + point leaves the pod Pending indefinitely because the nodeSelector finds no + eligible node. + + This function polls kubectl every 15 s until at least one node with + pkb_nodepool=benchmark has Ready=True, then returns. + """ + deadline = time.time() + timeout + logging.info('[swap_encryption] Waiting for benchmark node ' + '(pkb_nodepool=benchmark) to be Ready...') + while time.time() < deadline: + out, _, rc = kubectl.RunKubectlCommand([ + 'get', 'nodes', + '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', r'jsonpath={range .items[*]}' + r'{.metadata.name}{"\t"}' + r'{range .status.conditions[?(@.type=="Ready")]}' + r'{.status}{"\n"}{end}{end}', + ], raise_on_failure=False) + + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split('\t') + if len(parts) == 2 and parts[1].strip() == 'True': + logging.info('[swap_encryption] Benchmark node ready: %s', + parts[0].strip()) + return + + logging.info('[swap_encryption] Benchmark node not yet Ready — ' + 'retrying in 15 s...') + time.sleep(15) + + raise errors.Benchmarks.RunError( + '[swap_encryption] Timed out waiting for benchmark node ' + f'(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready ' + f'after {timeout}s' + ) + + +def _attach_swap_disk(cluster) -> None: + """Create a dedicated hyperdisk and attach it to the benchmark node. + + gcloud container node-pools create --additional-node-disk is not available + in all gcloud SDK versions, so we use gcloud compute to create the disk and + attach it after the node is ready. In GKE the Kubernetes node name is the + same as the GCE instance name, so no translation is needed. + + After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe + nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. + + The disk is named pkb-swap- to avoid name collisions across + concurrent runs. Cleanup deletes it in Cleanup() if it exists. + """ + # Resolve zone from cluster + zone = None + if getattr(cluster, 'zones', None): + zone = cluster.zones[0] + elif getattr(cluster, 'region', None): + zone = cluster.region + if not zone: + raise errors.Benchmarks.RunError( + '[swap_encryption] Cannot attach swap disk: cluster zone unknown') + + project = cluster.project + disk_name = f'pkb-swap-{cluster.name}' + disk_type = _BOOT_DISK_TYPE.value + disk_size_gb = _SWAP_DISK_SIZE_GB.value + + # ── Step 1: get the GCE instance name of the benchmark node ─────────────── + node_out, _, rc = kubectl.RunKubectlCommand([ + 'get', 'nodes', + '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', 'jsonpath={.items[0].metadata.name}', + ], raise_on_failure=False) + instance_name = node_out.strip() + if rc != 0 or not instance_name: + raise errors.Benchmarks.RunError( + '[swap_encryption] Cannot find benchmark node for swap disk attach') + logging.info('[swap_encryption] Benchmark node instance: %s', instance_name) + + # ── Step 2: create the hyperdisk ────────────────────────────────────────── + logging.info('[swap_encryption] Creating swap disk %s (%dGiB %s)', + disk_name, disk_size_gb, disk_type) + create_cmd = [ + 'gcloud', 'compute', 'disks', 'create', disk_name, + '--project', project, + '--zone', zone, + '--type', disk_type, + '--size', f'{disk_size_gb}GB', + '--quiet', + ] + if disk_type.startswith('hyperdisk'): + create_cmd += [ + '--provisioned-iops', str(_BOOT_DISK_IOPS.value), + '--provisioned-throughput', + str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, + _BOOT_DISK_THROUGHPUT.value)), + ] + _, stderr, rc = vm_util.IssueCommand(create_cmd, timeout=120, + raise_on_failure=False) + if rc != 0: + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}') + + # ── Step 3: attach the disk to the node VM ──────────────────────────────── + logging.info('[swap_encryption] Attaching swap disk %s to %s', + disk_name, instance_name) + attach_cmd = [ + 'gcloud', 'compute', 'instances', 'attach-disk', instance_name, + '--project', project, + '--zone', zone, + '--disk', disk_name, + '--device-name', 'pkb-swap', + '--quiet', + ] + _, stderr, rc = vm_util.IssueCommand(attach_cmd, timeout=120, + raise_on_failure=False) + if rc != 0: + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to attach swap disk to {instance_name}: ' + f'{stderr}') + logging.info('[swap_encryption] Swap disk attached: %s → %s', + disk_name, instance_name) + + +def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: + """Detach (if attached) and delete a GCE disk, robustly, with retries. + + Finds the attached instance from the disk's own `users` field rather than + kubectl — kubectl is often unavailable during teardown (cluster being + deleted), which previously left the disk attached and undeletable, so it + leaked. Returns True if the disk is gone (deleted or already absent). + """ + for attempt in range(1, 5): + users, _, rc = vm_util.IssueCommand( + ['gcloud', 'compute', 'disks', 'describe', disk_name, + '--project', project, '--zone', zone, '--format=value(users)'], + timeout=60, raise_on_failure=False) + if rc != 0: + logging.info('[swap_encryption] Swap disk %s not present — nothing to ' + 'delete', disk_name) + return True # already gone + user = users.strip() + if user: + inst = user.split('/')[-1] + logging.info('[swap_encryption] Detaching swap disk %s from %s', + disk_name, inst) + vm_util.IssueCommand( + ['gcloud', 'compute', 'instances', 'detach-disk', inst, + '--project', project, '--zone', zone, '--disk', disk_name, + '--quiet'], timeout=120, raise_on_failure=False) + _, derr, drc = vm_util.IssueCommand( + ['gcloud', 'compute', 'disks', 'delete', disk_name, + '--project', project, '--zone', zone, '--quiet'], + timeout=180, raise_on_failure=False) + if drc == 0: + logging.info('[swap_encryption] Swap disk deleted: %s', disk_name) + return True + logging.warning('[swap_encryption] Swap disk delete attempt %d/4 failed ' + '(%s); retrying in 10s', attempt, derr.strip()[:160]) + time.sleep(10) + logging.error('[swap_encryption] Could NOT delete swap disk %s after retries ' + '— delete it manually: gcloud compute disks delete %s ' + '--zone %s --quiet', disk_name, disk_name, zone) + return False + + +def _detach_and_delete_swap_disk(cluster) -> None: + """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" + zone = None + if getattr(cluster, 'zones', None): + zone = cluster.zones[0] + elif getattr(cluster, 'region', None): + zone = cluster.region + if not zone or not getattr(cluster, 'project', None): + return + _delete_disk_by_name(f'pkb-swap-{cluster.name}', cluster.project, zone) + + +def _delete_default_node_pool(cluster) -> None: + """Delete the dummy default nodepool after the benchmark pool is ready. + + The default nodepool (e2-medium) was only needed to satisfy GKE's + requirement that a cluster must have at least one nodepool at creation time. + Removing it stops the clock on its cost immediately. + """ + zone_flags: list[str] = [] + if getattr(cluster, 'zones', None): + zone_flags = ['--zone', cluster.zones[0]] + elif getattr(cluster, 'region', None): + zone_flags = ['--region', cluster.region] + + cmd = [ + 'gcloud', 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL, + '--cluster', cluster.name, + '--project', cluster.project, + '--quiet', + ] + zone_flags + + logging.info( + '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL) + stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=300, + raise_on_failure=False) + if rc != 0: + logging.warning('[swap_encryption] Could not delete default nodepool ' + '(rc=%d): %s', rc, stderr) + else: + logging.info('[swap_encryption] Default nodepool deleted') + + +def _is_pod_gone(pod: str) -> bool: + """Return True if the named pod no longer exists in the cluster. + + Used to distinguish OOM-killed container processes (pod still alive, rc=137) + from OOM-evicted pods (pod gone, DaemonSet will create a replacement). + """ + try: + _, err, rc = kubectl.RunKubectlCommand( + ['get', 'pod', pod, '-n', _DS_NAMESPACE, + '-o', 'jsonpath={.metadata.name}'], + raise_on_failure=False, timeout=15, + ) + return rc != 0 and 'not found' in (err or '').lower() + except Exception: # pylint: disable=broad-except + return False + + +def _pod_exec( + pod: str, + cmd: str, + ignore_failure: bool = False, + timeout: int = 300, + _retries: int = 2, +) -> tuple[str, str]: + """Run a shell command inside the benchmark pod via kubectl exec. + + Args: + pod: Pod name returned by _wait_for_benchmark_pod. + cmd: Shell command string passed to bash -c. + ignore_failure: When True, non-zero exit codes are logged but not + raised. + timeout: Seconds before PKB kills the kubectl exec process. Default + 300 s matches PKB's IssueCommand default. Pass a larger value for + long-running jobs (fio, stress-ng, kernel build). + _retries: Number of automatic retries on transient GKE websocket + resets ("connection reset by peer"). Set to 0 to disable retries + for idempotent-sensitive commands. + + Returns: + Tuple of (stdout, stderr) strings. + """ + _TRANSIENT_ERRORS = ('connection reset by peer', 'websocket: close') + # Errors that indicate the container/pod is gone and needs recovery. + # 'not found' covers "Error from server (NotFound): pods ... not found" + # which occurs when the DaemonSet pod was evicted and recreated under a + # new name (e.g. after OOM-triggered node pressure eviction). + # 'deleted state' covers "cannot exec in a deleted state" — the container + # was OOM-killed and is mid-termination (not yet recreated). + _CONTAINER_GONE_ERRORS = ('container not found', 'procReady not received', + 'unable to upgrade connection', 'not found', + 'deleted state') + # Use the globally-tracked active pod name — it may have been updated by + # a previous _recover_pod call when eviction replaced the pod. + active = _active_pod[0] if _active_pod else pod + + for attempt in range(_retries + 1): + out, err, rc = kubectl.RunKubectlCommand( + ['exec', active, '-n', _DS_NAMESPACE, + '--', 'bash', '-c', cmd], + raise_on_failure=False, + raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets + timeout=timeout, + ) + is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS) + if is_transient and attempt < _retries: + logging.warning( + '[swap_encryption] kubectl exec connection reset (attempt %d/%d); ' + 'retrying in 10 s', attempt + 1, _retries + 1) + time.sleep(10) + continue + # rc=137 (SIGKILL): the OOM killer terminated the container process. + # Two sub-cases: + # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. + # B) Container OOM restart: pod still exists, container restarts in place. + # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, + # tools must be re-installed before subsequent commands can run.) + # In both cases we call _recover_pod to wait for tools + sentinel, and + # we do NOT retry the OOM-triggering command itself. + if rc == 137: + # Record the OOM so the run-level gate can flag it even if the container + # restarts in place under the same pod name (which leaves both the + # "pod replaced" and "pod NotFound" checks silent). + if active not in _oom_events: + _oom_events.append(active) + # CRITICAL: sleep before checking pod state. Kubernetes takes a few + # seconds to mark a just-evicted pod as Terminating / NotFound. Without + # this delay _recover_pod sees the pod still in "Running" phase, returns + # the old pod name immediately, and every subsequent command fails with + # "Error from server (NotFound): pods … not found". + logging.warning( + '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update ' + 'pod state before recovery check') + time.sleep(15) + pod_gone = _is_pod_gone(active) + if pod_gone: + logging.warning( + '[swap_encryption] OOM-eviction detected (rc=137, pod gone) — ' + 'recovering pod name for subsequent commands (not retrying this cmd)') + else: + logging.warning( + '[swap_encryption] Container OOM-killed (rc=137, pod still exists) — ' + 'waiting for container restart and tool re-install before continuing') + new_pod = _recover_pod(active) + if new_pod != active: + logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. + + is_container_gone = (rc != 0 and + any(e in err.lower() for e in _CONTAINER_GONE_ERRORS)) + if is_container_gone: + # Record the loss for the run-level degradation gate REGARDLESS of retry + # budget or ignore_failure. A "pods … not found" on a best-effort command + # (kernel build, opensearch, cleanup of a dead pod) still means the pod + # died; without this the gate stays blind because _active_pod is only + # renamed on the retry path below, which _retries=0 callers never reach. + if active and active not in _pod_lost: + _pod_lost.append(active) + logging.error( + '[swap_encryption] Benchmark pod %s is gone (%s) — recording run ' + 'as degraded', active, (err or '').strip()[:160]) + if attempt < _retries: + logging.warning( + '[swap_encryption] Container gone/restarting (attempt %d/%d) — ' + 'waiting for pod to recover...', attempt + 1, _retries + 1) + new_pod = _recover_pod(active) + if new_pod != active: + logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + continue + break + + if rc != 0 and not ignore_failure: + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] _pod_exec failed (rc={rc}): {err}') + return out, err + + +def _recover_pod(pod: str, timeout_sec: int = 600) -> str: + """Wait for a DaemonSet container to recover after OOM kill or eviction. + + Handles two scenarios: + 1. Container OOM restart: same pod name, container restarting in place. + DaemonSet restartPolicy=Always brings it back under the same pod name. + 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates + a new pod with a DIFFERENT name. We detect this by checking whether + the named pod still exists; if not, we search by the DaemonSet label + selector for a Running pod. + + Returns the (possibly new) pod name once it is Running and ready. + """ + deadline = time.time() + timeout_sec + logging.info('[swap_encryption] Waiting for pod %s to recover ' + '(up to %ds)...', pod, timeout_sec) + + # Phase 1: wait for a Running pod — either the named one (container + # restart) or a replacement pod found via label selector (eviction). + # + # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a + # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp + # (the pod is "Terminating") while status.phase may still read "Running" for + # several seconds. Checking only status.phase causes a false-positive: we + # return the old pod name immediately and every subsequent command fails with + # "Error from server (NotFound)". Checking deletionTimestamp catches this. + recovered_pod = pod + while time.time() < deadline: + # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not + # stdout. When the pod is gone, status_out is empty and the error text + # lives entirely in status_err. Discarding stderr (using _) means the + # 'not found' check below never fires and we spin until deadline. + status_out, status_err, status_rc = kubectl.RunKubectlCommand( + ['get', 'pod', pod, '-n', _DS_NAMESPACE, + '-o', 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'], + raise_on_failure=False, timeout=30, + ) + # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) + fields = status_out.strip().split('|') + phase = fields[0].strip() if fields else '' + is_terminating = len(fields) > 1 and bool(fields[1].strip()) + + # Pod is genuinely Running and NOT being deleted — recovery complete. + if status_rc == 0 and phase == 'Running' and not is_terminating: + break + + # Pod no longer exists, OR it exists but is being terminated (Terminating + # state or deletionTimestamp set) — look for a replacement pod by label. + pod_gone_or_terminating = ( + (status_rc != 0 and 'not found' in (status_out + status_err).lower()) + or is_terminating + ) + if pod_gone_or_terminating: + label_out, _, label_rc = kubectl.RunKubectlCommand( + ['get', 'pods', '-n', _DS_NAMESPACE, + '-l', f'app={_DS_LABEL}', + '-o', 'jsonpath={range .items[?(@.status.phase=="Running")]}' + '{.metadata.name}{"\\n"}{end}'], + raise_on_failure=False, timeout=30, + ) + new_pods = [p.strip() for p in label_out.strip().splitlines() if p.strip() + and p.strip() != pod] # exclude the dying pod + if label_rc == 0 and new_pods: + recovered_pod = new_pods[0] + logging.info('[swap_encryption] Original pod %s gone/terminating; ' + 'found replacement %s', pod, recovered_pod) + break + + time.sleep(10) + else: + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] No Running pod found (original: {pod}) ' + f'within {timeout_sec}s after OOM kill / eviction') + + # Phase 2: wait for init script to finish (sentinel written last). + while time.time() < deadline: + ready_out, _, ready_rc = kubectl.RunKubectlCommand( + ['exec', recovered_pod, '-n', _DS_NAMESPACE, + '--', 'bash', '-c', 'test -f /tmp/pkb_ready && echo READY'], + raise_on_failure=False, timeout=30, + ) + if ready_rc == 0 and 'READY' in ready_out: + logging.info('[swap_encryption] Pod %s recovered and ready', recovered_pod) + return recovered_pod + time.sleep(15) + + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] Pod {recovered_pod} did not become ready ' + f'within {timeout_sec}s after OOM kill / eviction') + + +_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { + # GCP (on-demand, us-central1 unless noted) + 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD + 'c4-standard-8': 0.5008, # 8 vCPU, 32 GB RAM, no LSSD + 'n4-highmem-32': 3.0256, # 32 vCPU, 256 GB RAM + 'n2-highmem-32': 2.5216, # 32 vCPU, 256 GB RAM + 'n2-standard-32': 1.5264, # 32 vCPU, 120 GB RAM + 'z3-highmem-8': 2.7248, # 8 vCPU + 4× LSSD + # AWS + 'i4i.4xlarge': 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store + 'i4i.2xlarge': 0.7480, + 'm6id.4xlarge': 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store + 'm6i.4xlarge': 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store + 'r6i.4xlarge': 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store +} + + +def _collect_cost_sample( + pod: str, elapsed_sec: float, base_meta: dict +) -> list[sample.Sample]: + """Emit a cost_estimate_usd sample for the benchmark run (gap 7). + + Instance type is read from cloud metadata inside the pod. Price is looked + up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and + a warning is logged. + + Args: + pod: Benchmark pod name. + elapsed_sec: Wall-clock seconds the benchmark phases took. + base_meta: Shared metadata dict. + + Returns: + A list of zero or one sample.Sample. + """ + # Detect instance type from cloud metadata + instance_type = '' + + # GCP: machine type is the last segment of the metadata URL value + gcp_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' + '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_type_out.strip(): + instance_type = gcp_type_out.strip().split('/')[-1] + + if not instance_type: + # AWS: instance-type is a plain string + aws_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_type = aws_type_out.strip() + + # Allow explicit override (useful when running on custom/renamed machine + # types or when the pod was unavailable during cost collection). + if _INSTANCE_SIZE_LABEL.value: + instance_type = _INSTANCE_SIZE_LABEL.value + + # Last resort: fall back to the benchmark machine type flag. This ensures + # cost tracking works even when the pod was evicted before cost collection + # ran (in which case the metadata curl above returned empty). + if not instance_type and _BENCHMARK_MACHINE_TYPE.value: + instance_type = _BENCHMARK_MACHINE_TYPE.value + logging.info( + '[swap_encryption] Instance type from metadata unavailable; ' + 'using --swap_encryption_benchmark_machine_type=%s for cost tracking', + instance_type, + ) + + price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) + if price is None: + logging.warning( + '[swap_encryption] Unknown instance type "%s" – skipping cost sample. ' + 'Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost tracking.', + instance_type, + ) + return [] + + hours = elapsed_sec / 3600.0 + cost = hours * price + meta = dict( + base_meta, + instance_type=instance_type, + price_usd_per_hr=price, + benchmark_elapsed_sec=round(elapsed_sec, 1), + ) + return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)] + + +def _detect_swap_device(pod: str) -> str: + """Return the active swap device path on the cluster node.""" + if _SWAP_DEVICE.value: + return _SWAP_DEVICE.value + + # /proc/swaps is the source of truth: it lists the swap device that is + # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, + # because a stale dm-crypt mapping from a previous run on a reused node can + # still exist as a /dev node while being non-functional (fio/swapoff then + # fail with "No such device or address"). So read the active device from + # /proc/swaps first; only fall back to the mapper path if /proc/swaps is + # somehow empty but the mapper is genuinely present. + dm_out, _ = _pod_exec( + pod, + textwrap.dedent(""" + ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) + if [ -n "$ACTIVE" ] + then + echo "$ACTIVE" + elif test -e /dev/mapper/swap_encrypted + then + echo /dev/mapper/swap_encrypted + fi + """), + ignore_failure=True, + ) + dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' + if dev: + return dev + raise ValueError( + 'No active swap device found in the benchmark pod. ' + 'Use --swap_encryption_device to specify one.' + ) + + +def _build_metadata(pod: str, swap_dev: str) -> dict: + """Collect node environment, encryption type, and config into a dict.""" + + kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True) + mem_out, _ = _pod_exec( + pod, "awk '/MemTotal/{print $2}' /proc/meminfo", + ignore_failure=True, + ) + swap_out, _ = _pod_exec( + pod, "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", + ignore_failure=True, + ) + + try: + mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) + except ValueError: + mem_gb = 0 + try: + swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) + except ValueError: + swap_gb = 0 + + # Encryption type — key off dm-crypt presence + the swap target, NOT the + # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- + # encrypted; only the AWS targets (instance_store / io2) are. + enc = 'unknown' + if '/dev/mapper/' in swap_dev: + table_out, _ = _pod_exec( + pod, + f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', + ignore_failure=True, + ) + enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' + elif _SWAP_TYPE.value in ('instance_store', 'io2'): + enc = 'nitro_hardware_offload' # AWS: encrypted by the Nitro card + elif not _ENABLE_DMCRYPT.value: + enc = 'none' # GKE plain swap (encryption OFF) + + cloud = _detect_cloud(pod) + + # Gap 6: instance size label for multi-size comparison runs. + # If the flag is set use it directly; otherwise try to read it from + # cloud metadata so that the field is always populated. + instance_label = _INSTANCE_SIZE_LABEL.value + if not instance_label: + gcp_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' + '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_type_out.strip(): + instance_label = gcp_type_out.strip().split('/')[-1] + if not instance_label: + aws_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_label = aws_type_out.strip() + + return { + 'benchmark': BENCHMARK_NAME, + 'execution_mode': 'kubernetes_privileged_pod', + 'cloud': cloud, + 'instance_size': instance_label, + 'kernel_version': kernel_out.strip(), + 'host_memory_gb': mem_gb, + 'swap_device': swap_dev, + 'swap_size_gb': swap_gb, + 'swap_encryption': enc, + # Test-matrix columns: storage target, encryption on/off, image, IOPS + 'storage_target': _SWAP_TYPE.value, + 'boot_disk_type': _BOOT_DISK_TYPE.value, + 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, + 'node_image_type': _NODE_IMAGE_TYPE.value, + 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, + 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, + # Other config + 'zswap_enabled': _ENABLE_ZSWAP.value, + 'min_free_kbytes': _MIN_FREE_KBYTES.value, + 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, + # Requested config value only. The *effective* stress-ng footprint may + # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the + # actual value it ran with as 'stress_vm_bytes' so the two never conflict. + 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, + 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, + 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, + 'nodepool': _NODEPOOL.value, + } From f52c931a63c86a6e4521ed57085cbafe393abf62 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Fri, 19 Jun 2026 09:43:23 +0530 Subject: [PATCH 2/8] PR2: swap-encryption benchmark - layer 2/5 (single file swap_encryption_benchmark.py) --- .../swap_encryption_benchmark.py | 1018 +++++++++++++++++ 1 file changed, 1018 insertions(+) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 5bdc933bba..ee2e76a665 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -117,6 +117,55 @@ """ +_SWAP_DEVICE = flags.DEFINE_string( + 'swap_encryption_device', + '', + 'Explicit swap block-device path on the cluster node, e.g. ' + '/dev/nvme1n1 or /dev/dm-0. When empty the benchmark auto-detects ' + 'via /proc/swaps after setup.', +) + + +_SWAP_SIZE_GB = flags.DEFINE_integer( + 'swap_encryption_swap_size_gb', + 32, + 'Size in GB of the swap space to configure on the node. ' + 'Ignored when a ready swap device already exists.', +) + + +_SWAP_TYPE = flags.DEFINE_enum( + 'swap_encryption_swap_type', + 'auto', + ['auto', 'hyperdisk', 'lssd', 'boot_disk', 'instance_store', 'io2'], + 'Swap backing storage target, one per methodology test-matrix row:\n' + ' GKE: boot_disk (swap file on the OS boot disk — pd-balanced or ' + 'hyperdisk-balanced, chosen via --swap_encryption_boot_disk_type),\n' + ' hyperdisk (dedicated hyperdisk-balanced data disk),\n' + ' lssd (dedicated Local SSD RAID-0).\n' + ' AWS: instance_store (NVMe Instance Store, Nitro-encrypted),\n' + ' io2 (EBS io2 data/root volume).\n' + 'dm-crypt is applied on the GKE targets when ' + '--swap_encryption_enable_dmcrypt is set; AWS targets are encrypted by ' + 'Nitro at the hardware level. auto = detect from cloud + instance type.', +) + + +_ENABLE_ZSWAP = flags.DEFINE_boolean( + 'swap_encryption_enable_zswap', + False, + 'Enable zswap (lz4 compressor, 20%% max pool) before running tests.', +) + + +_MIN_FREE_KBYTES = flags.DEFINE_integer( + 'swap_encryption_min_free_kbytes', + 65536, + 'Value written to /proc/sys/vm/min_free_kbytes to trigger earlier ' + 'swapping. Set 0 to leave the kernel default unchanged.', +) + + _DAEMONSET_IMAGE = flags.DEFINE_string( 'swap_encryption_daemonset_image', 'ubuntu:22.04', @@ -150,6 +199,24 @@ ) +_IO2_ENCRYPTED = flags.DEFINE_boolean( + 'swap_encryption_io2_encrypted', + True, + 'When True (default), the dedicated io2 swap volume is created with EBS ' + 'encryption (Nitro/KMS) -> matrix row "io2 + hardware encryption". ' + 'Set False for the unencrypted io2 baseline row. Only applies when ' + '--swap_encryption_swap_type=io2 on AWS/EKS.', +) + + +_IO2_KMS_KEY_ID = flags.DEFINE_string( + 'swap_encryption_io2_kms_key_id', + '', + 'Optional KMS key id/ARN for the encrypted io2 volume. Empty = the ' + 'account default aws/ebs key. Ignored unless io2_encrypted is True.', +) + + _FAIL_ON_DEGRADED = flags.DEFINE_boolean( 'swap_encryption_fail_on_degraded', True, @@ -205,6 +272,15 @@ ) +_ENABLE_DMCRYPT = flags.DEFINE_boolean( + 'swap_encryption_enable_dmcrypt', + True, + 'When True (default), configure dm-crypt on the swap device — the ' + '"encryption enabled" column of the test matrix. Set False to use ' + 'plain swap (encryption disabled column).', +) + + _NODE_IMAGE_TYPE = flags.DEFINE_string( 'swap_encryption_node_image_type', 'UBUNTU_CONTAINERD', @@ -400,6 +476,55 @@ def Prepare(spec) -> None: pod = _wait_for_benchmark_pod() logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod) + # Tune kernel swap aggressiveness. + # vm.swappiness=100 (maximum): GKE nodes default to 0 (avoid swap, prefer + # OOM-kill). At 60 the kernel still under-swapped on n4-highmem-32 — under + # cgroup-level memory pressure with ~160 GB node RAM free it would leave + # anonymous pages resident and record swap_out ~0 (run bb4a782d), making the + # result non-deterministic. 100 maximally biases the kernel toward paging + # anonymous pages out to the (encrypted) swap device, which is exactly the + # path this benchmark is meant to exercise. + _pod_exec(pod, 'sysctl -w vm.swappiness=100', ignore_failure=True) + if _MIN_FREE_KBYTES.value > 0: + _pod_exec(pod, f'sysctl -w vm.min_free_kbytes={_MIN_FREE_KBYTES.value}') + + # Unlock container cgroup swap. + # GKE cgroup v2 sets memory.swap.max=0 per-container even when the node has + # a swap device. This blocks swap for the container regardless of + # vm.swappiness. Stress-ng gets OOM-killed in ~15s because the kernel can + # page out for this cgroup. Set 'max' so the container can use all swap. + _pod_exec(pod, textwrap.dedent(""" + PKB_CG=$(awk -F: '/^0::/{print $3; exit}' /proc/self/cgroup 2>/dev/null) + if [ -n "$PKB_CG" ] && [ -f "/sys/fs/cgroup${PKB_CG}/memory.swap.max" ]; then + echo max > "/sys/fs/cgroup${PKB_CG}/memory.swap.max" 2>/dev/null || true + fi + PKB_CG1=$(awk -F: '/:memory:/{print $3; exit}' /proc/self/cgroup 2>/dev/null) + if [ -n "$PKB_CG1" ] && \ + [ -f "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" ]; then + echo -1 > "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" \ + 2>/dev/null || true + fi + """), ignore_failure=True) + + # Enable zswap if requested + if _ENABLE_ZSWAP.value: + _enable_zswap(pod) + + # Configure cloud-specific swap + cloud = _detect_cloud(pod) + logging.info('[swap_encryption] Detected cloud: %s', cloud) + + if cloud == 'gcp': + _setup_gke_swap(pod) + elif cloud == 'aws': + _setup_eks_swap(pod) + else: + logging.warning( + '[swap_encryption] Unknown cloud – falling back to plain swapfile' + ) + _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + + def _phase_selected(token: str) -> bool: """Return True if phase `token` should run given --swap_encryption_phases. @@ -1304,6 +1429,899 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str: f'within {timeout_sec}s after OOM kill / eviction') +def _detect_cloud(pod: str) -> str: + """Detect GCP vs AWS from DMI product info exposed via /sys hostPath mount. + + DMI is the most reliable in-container detection method because it reads + directly from the host kernel's SMBIOS table via /sys (already mounted). + It avoids HTTP metadata endpoint quoting issues and network timeouts. + + Falls back to metadata HTTP endpoints if DMI is inconclusive. + """ + # Primary: DMI product name / vendor (available via /sys hostPath mount) + dmi_out, _ = _pod_exec( + pod, + 'cat /sys/class/dmi/id/sys_vendor /sys/class/dmi/id/product_name ' + '/sys/class/dmi/id/bios_vendor 2>/dev/null || echo ""', + ignore_failure=True, + ) + dmi = dmi_out.strip().lower() + if 'google' in dmi: + logging.info( + '[swap_encryption] Cloud detected via DMI: gcp (%s)', dmi_out.strip()) + return 'gcp' + if any(k in dmi for k in ('amazon', 'ec2', 'aws')): + logging.info( + '[swap_encryption] Cloud detected via DMI: aws (%s)', dmi_out.strip()) + return 'aws' + + # Secondary: GCP metadata endpoint. + # Use -H with no space after colon to avoid shell-quoting issues through + # the kubectl exec → bash -c pipeline. + gcp_out, _ = _pod_exec( + pod, + 'curl -s -m 3 ' + 'http://metadata.google.internal/computeMetadata/v1/instance/zone ' + '-H Metadata-Flavor:Google 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_out.strip(): + logging.info('[swap_encryption] Cloud detected via metadata: gcp') + return 'gcp' + + # Tertiary: AWS IMDS (IMDSv2 token-based; IMDSv1 is often disabled). + aws_out, _ = _pod_exec( + pod, + 'T=$(curl -s -m 3 -X PUT ' + 'http://169.254.169.254/latest/api/token ' + '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null); ' + 'curl -s -m 3 -H "X-aws-ec2-metadata-token: $T" ' + 'http://169.254.169.254/latest/meta-data/instance-id ' + '2>/dev/null || echo ""', + ignore_failure=True, + ) + if aws_out.strip(): + logging.info('[swap_encryption] Cloud detected via IMDS: aws') + return 'aws' + + logging.warning( + '[swap_encryption] Could not detect cloud from DMI or metadata') + return 'unknown' + + +def _setup_gke_swap(pod: str) -> None: + """Configure dm-crypt swap on the GKE node, mirroring go/node:swap-encryption. + + GKE nodes use dm-crypt with an ephemeral random key so that swap contents + are encrypted at rest without requiring persistent key management. + We replicate this exactly using cryptsetup in plain mode (no LUKS header). + """ + swap_type = _SWAP_TYPE.value + if swap_type == 'auto': + # Check whether Local SSDs are present + lssd_out, _ = _pod_exec( + pod, + "lsblk -d -o NAME,MODEL | grep -i 'local\\|nvme' | " + "grep -v 'nvme0' | awk '{print $1}' | head -1", + ignore_failure=True, + ) + swap_type = 'lssd' if lssd_out.strip() else 'hyperdisk' + + if swap_type == 'lssd': + _setup_gke_lssd_swap(pod) + elif swap_type == 'boot_disk': + _setup_gke_bootdisk_swap(pod) + else: + _setup_gke_hyperdisk_swap(pod) + + +def _setup_gke_hyperdisk_swap(pod: str) -> None: + """Configure dm-crypt swap on hyperdisk-balanced (GKE default). + + Disk detection is split into two separate commands so that the boot-device + name is resolved first and then substituted as a literal string — nested + $() expansions inside a kubectl exec bash -c argument are unreliable. + + If no dedicated data disk is attached (single-disk node) dm-crypt is set up + over a loop device backed by a file on the boot hyperdisk, which still + exercises the full encryption path on the same storage tier. + """ + logging.info('[swap_encryption] GKE: setting up dm-crypt on hyperdisk') + + # Step 1: identify the boot device name (e.g. "nvme0n1", "sda") + boot_out, _ = _pod_exec( + pod, + 'lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1', + ignore_failure=True, + ) + boot_base = boot_out.strip() or 'nvme0n1' + logging.info('[swap_encryption] GKE: boot device: %s', boot_base) + + # Step 2: find a non-boot disk using the literal name from step 1 + disk_out, _ = _pod_exec( + pod, + f"lsblk -d -o NAME,TYPE | awk '$2==\"disk\"{{print $1}}' " + f"| grep -v '^{boot_base}$' | head -1", + ignore_failure=True, + ) + disk_name = disk_out.strip() + + if not disk_name: + logging.info( + '[swap_encryption] No dedicated data disk found – ' + 'falling back to loop device on /mnt/stateful_partition ' + '(direct-io=on, dm-crypt=%s)', _ENABLE_DMCRYPT.value) + _setup_gke_loop_device_swap(pod) + return + + disk = f'/dev/{disk_name}' + logging.info('[swap_encryption] GKE: swap target disk: %s dmcrypt=%s', + disk, _ENABLE_DMCRYPT.value) + + # Clean up any stale mapping from a previous failed run. + _pod_exec(pod, textwrap.dedent(f""" + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + wipefs -a {disk} 2>/dev/null || true + """), ignore_failure=True) + + if _ENABLE_DMCRYPT.value: + # We cannot use cryptsetup open from inside a container because + # libdevmapper calls dm_udev_wait() after creating the target, which + # blocks on /run/udev/control. That socket belongs to udevd which is + # not running inside the container — so cryptsetup hangs forever. + # + # Instead we drive dmsetup directly with --noudevrules --noudevsync, + # which skips all udev synchronisation, and call dmsetup mknodes to + # ensure /dev/mapper/swap_encrypted appears without udev. + # + # insmod (not modprobe) loads the kernel module: modprobe also talks to + # systemd-udevd and can deadlock from a container for the same reason. + _pod_exec(pod, textwrap.dedent(f""" + grep -q dm_crypt /proc/modules 2>/dev/null || {{ + KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) + [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true + }} + KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n') + SIZE=$(blockdev --getsz {disk}) + printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{disk}" | \\ + dmsetup create swap_encrypted --noudevrules --noudevsync + unset KEY + dmsetup mknodes swap_encrypted 2>/dev/null || true + mkswap /dev/mapper/swap_encrypted + swapon /dev/mapper/swap_encrypted + """)) + logging.info('[swap_encryption] GKE: dm-crypt swap active on ' + '/dev/mapper/swap_encrypted') + else: + # Encryption-disabled column of the test matrix + _pod_exec(pod, textwrap.dedent(f""" + mkswap {disk} && \\ + swapon {disk} + """)) + logging.info('[swap_encryption] GKE: plain (unencrypted) swap active ' + 'on %s', disk) + + +def _setup_gke_loop_device_swap(pod: str) -> None: + """Plain loop-device swap for single-disk GKE nodes (no dedicated swap disk). + + Used when _setup_gke_hyperdisk_swap finds no dedicated second disk (e.g. + n2-highmem-32 / n4-highmem-32 single-boot-disk nodes, regardless of image + type). + + dm-crypt is skipped on this path for two reasons: + 1. On COS (Container-Optimised OS): the device-mapper kernel subsystem is + inaccessible from inside a Kubernetes pod (even privileged). Calls to + cryptsetup/dmsetup block indefinitely and are killed by the PKB timeout. + This is a deliberate COS security restriction, not a permissions issue. + 2. On UBUNTU_CONTAINERD: the loop device is created in the container + namespace; its behaviour under nsenter (needed for dm-crypt on dedicated + disks) is untested, so plain loop swap is used for safety. + For dedicated block devices (hyperdisk, LSSD) nsenter into the host mount + namespace works around the COS restriction (see _setup_gke_hyperdisk_swap). + The loop device path skips dm-crypt on all image types; plain loop swap is + used instead. + + Therefore this path uses a plain loop device as swap without dm-crypt. + Phase 1 (fio) is skipped for plain loop devices — the goal is enc-on vs + enc-off comparison, and fio on a plain loop device measures the backing + filesystem rather than the swap stack. Tiers 2–6 (stress-ng, Redis, + kernel build, OpenSearch) run normally. + + For dm-crypt measurement on GCP use a machine type with local NVMe (LSSD) + or provision a dedicated hyperdisk on a second disk slot (n4-highmem-32+). + + Improvements over the old /var path: + - Backing file on /mnt/stateful_partition (ext4), not the container + overlayfs — avoids overlayfs O_DIRECT limitation. + - losetup --direct-io=on passes I/O through to the host ext4, reducing + double-buffering for Tiers 2–6 workloads. + """ + size_gb = _SWAP_SIZE_GB.value + # /mnt/stateful_partition is ext4 on COS (mounted from the stateful + # partition of the node's persistent disk). It is NOT the container + # overlay filesystem and is mounted into the pod via the DaemonSet + # hostPath volume. + backing = '/mnt/stateful_partition/pkb_swap_backing' + + # ── Step 0: detach any stale loop device from a previous failed run ─────── + _pod_exec(pod, textwrap.dedent(f""" + losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | \ + while read dev + do + swapoff "$dev" 2>/dev/null || true + losetup -d "$dev" 2>/dev/null || true + done + rm -f {backing} + """), ignore_failure=True) + + # ── Step 1: allocate backing file on stateful partition (ext4) ─────────── + logging.info( + '[swap_encryption] GKE: creating %dG backing file on stateful_partition', + size_gb) + # fallocate preallocates real ext4 blocks (avoids fragmentation during swap + # I/O); truncate is the sparse fallback for filesystems where fallocate + # fails. + _pod_exec(pod, textwrap.dedent(f""" + fallocate -l {size_gb}G {backing} 2>/dev/null || \\ + truncate -s {size_gb}G {backing} + """)) + + # ── Step 2: loop device with direct-io passthrough ─────────────────────── + # --direct-io=on lets the loop driver pass O_DIRECT to the host ext4, + # reducing double-buffering for workload I/O (kernel 5.x+, present on + # GKE COS ≥ 1.29). + loop_out, _ = _pod_exec(pod, textwrap.dedent(f""" + LOOP=$(losetup -f) && \\ + losetup --direct-io=on "$LOOP" {backing} && \\ + echo "$LOOP" + """)) + loop_dev = loop_out.strip() + if not loop_dev.startswith('/dev/loop'): + raise RuntimeError( + f'[swap_encryption] losetup failed – output: {loop_out!r}' + ) + logging.info('[swap_encryption] GKE: loop device: %s direct-io=on', loop_dev) + + # ── Step 3: plain mkswap + swapon (dm-crypt skipped on loop devices) ──────── + _pod_exec(pod, f'mkswap {loop_dev}') + _pod_exec(pod, f'swapon {loop_dev}') + logging.warning( + '[swap_encryption] GKE: plain loop swap active on %s ' + '(dm-crypt unavailable from COS pod — device-mapper is blocked by ' + 'COS kernel namespace restrictions). ' + 'Phase 1 (fio) will be skipped. ' + 'Use a machine with LSSD (c4-*-lssd) or attach a dedicated second ' + 'hyperdisk for dm-crypt measurement.', + loop_dev, + ) + + +def _setup_gke_bootdisk_swap(pod: str) -> None: + """Swap on the OS BOOT disk — methodology Table 0 rows 1-4. + + Creates a loop-backed swap file on /mnt/stateful_partition (the node's boot + disk, whose type — pd-balanced or hyperdisk-balanced — is chosen at + nodepool-creation time via --swap_encryption_boot_disk_type). dm-crypt is + layered on the loop device when --swap_encryption_enable_dmcrypt is set + (encryption-on rows 2/4); otherwise plain swap is used (encryption-off rows + 1/3). + + Reuses the same loop-creation and dmsetup patterns as the LSSD/hyperdisk + paths — no shared provider module is touched. Requires an Ubuntu node image + (dm-crypt from a pod is blocked on COS). + """ + size_gb = _SWAP_SIZE_GB.value + backing = '/mnt/stateful_partition/pkb_swap_backing' + logging.info('[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)', + size_gb, _ENABLE_DMCRYPT.value) + + # Clean up any stale loop/mapping from a previous run. + _pod_exec(pod, textwrap.dedent(f""" + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | while read d + do + swapoff "$d" 2>/dev/null || true + losetup -d "$d" 2>/dev/null || true + done + rm -f {backing} + """), ignore_failure=True) + + # Allocate the backing file on the boot-disk ext4 stateful partition. + _pod_exec(pod, textwrap.dedent(f""" + fallocate -l {size_gb}G {backing} 2>/dev/null || truncate -s {size_gb}G {backing} + """)) + + loop_out, _ = _pod_exec(pod, textwrap.dedent(f""" + LOOP=$(losetup -f) && losetup --direct-io=on "$LOOP" {backing} && echo "$LOOP" + """)) + loop_dev = loop_out.strip().splitlines()[-1].strip() if loop_out.strip() else '' + if not loop_dev.startswith('/dev/loop'): + raise RuntimeError( + f'[swap_encryption] boot-disk losetup failed: {loop_out!r}') + logging.info('[swap_encryption] GKE: boot-disk loop device: %s', loop_dev) + + if _ENABLE_DMCRYPT.value: + _pod_exec(pod, textwrap.dedent(f""" + grep -q dm_crypt /proc/modules 2>/dev/null || {{ + KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) + [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true + }} + KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n') + SIZE=$(blockdev --getsz {loop_dev}) + printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{loop_dev}" | \\ + dmsetup create swap_encrypted --noudevrules --noudevsync + unset KEY + dmsetup mknodes swap_encrypted 2>/dev/null || true + mkswap /dev/mapper/swap_encrypted + swapon /dev/mapper/swap_encrypted + """)) + logging.info('[swap_encryption] GKE: boot-disk dm-crypt swap active on ' + '/dev/mapper/swap_encrypted') + else: + _pod_exec(pod, textwrap.dedent(f""" + mkswap {loop_dev} && swapon {loop_dev} + """)) + logging.info('[swap_encryption] GKE: boot-disk plain swap active on %s', + loop_dev) + + +def _setup_gke_lssd_swap(pod: str) -> None: + """Configure dm-crypt on LSSD RAID-0 array (go/gke-swap-lssd).""" + logging.info('[swap_encryption] GKE: setting up LSSD RAID-0 swap') + + # Reused-node hygiene: a previous run on this node may have left an ACTIVE + # dm-crypt swap (e.g. /dev/nvme0n1 └─swap_encrypted [SWAP]). That makes the + # LSSD look "unclean/busy" to the device selector below, which then wrongly + # falls back to the hyperdisk path and tries the boot disk. Tear down any + # prior PKB swap mapping FIRST so the underlying LSSD is freed and selectable. + _pod_exec(pod, textwrap.dedent(""" + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + swapoff -a 2>/dev/null || true + dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + """), ignore_failure=True) + + # Log the full block-device topology up front for diagnosis (every prior + # swap failure traced back to picking the wrong device). + topo, _ = _pod_exec( + pod, 'lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null', + ignore_failure=True) + logging.info('[swap_encryption] block device topology:\n%s', + (topo or '').strip()) + + # Identify candidate swap devices = whole disks that are NOT the boot/OS + # disk. We must NOT rely on a device name (boot disk enumerates as nvme0n1 + # on some nodes, nvme1n1 on others) and we cannot use `findmnt /` because the + # container root is an overlay. Instead we EXCLUDE any disk that: + # * has partition children (boot disk has p1/p14/p15/p16), or + # * has any mounted filesystem (itself or a child). + # A raw local SSD intended for swap has neither. This robustly prevents the + # catastrophic bug where the 100 GB boot disk (root mounted) was RAIDed into + # the swap device, yielding a non-functional swap (fio empty + stress OOM). + lssd_out, _ = _pod_exec( + pod, + textwrap.dedent(""" + for d in $(lsblk -dno NAME,ROTA | awk '$2==0{print $1}') + do + if lsblk -no TYPE "/dev/$d" 2>/dev/null | grep -q '^part$'; then + continue # has partitions -> boot/OS disk + fi + if lsblk -no MOUNTPOINT "/dev/$d" 2>/dev/null | grep -q '[^[:space:]]'; then + continue # mounted somewhere -> not a free swap device + fi + echo "/dev/$d" + done + """), + ignore_failure=True, + ) + devices = [d.strip() for d in lssd_out.strip().splitlines() if d.strip()] + if not devices: + logging.warning( + '[swap_encryption] No clean (unpartitioned, unmounted) local SSD found ' + '— falling back to hyperdisk swap path') + _setup_gke_hyperdisk_swap(pod) + return + + device_list = ' '.join(devices) + n = len(devices) + logging.info('[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): ' + '%s dmcrypt=%s', n, device_list, _ENABLE_DMCRYPT.value) + + # Clean up stale mappings, RAID arrays, and GKE-managed mounts. + # + # GKE UBUNTU nodes run google-ssd-startup.service at boot which formats + # local NVMe SSDs as ext4 and mounts them at /mnt/disks/ssd0 etc. even + # when --local-nvme-ssd-block is set. The mount makes the block device + # busy so mdadm/wipefs fail silently (we had || true). We must unmount + # those paths first. /proc-host/mounts reflects the host mount table + # (hostPID:true + privileged gives us access). + # + # pkb_swap is the dm-crypt device created by the node startup script (for + # single-LSSD nodes it holds /dev/nvme1n1 directly without an md0 layer). + _pod_exec(pod, textwrap.dedent(f""" + echo "[pkb-lssd-cleanup] /proc/mdstat:" >&2 + cat /proc/mdstat 2>/dev/null || true + echo "[pkb-lssd-cleanup] dmsetup ls:" >&2 + dmsetup ls 2>/dev/null || true + echo "[pkb-lssd-cleanup] /proc/swaps:" >&2 + cat /proc/swaps 2>/dev/null || true + echo "[pkb-lssd-cleanup] host mounts on {device_list}:" >&2 + grep -E '{('|'.join(devices))}' /proc-host/mounts 2>/dev/null || true + echo "[pkb-lssd-cleanup] sysfs holders:" >&2 + for dev in {device_list} + do + devname=$(basename "$dev") + ls -1 /sys/block/$devname/holders/ 2>/dev/null | while read h + do + echo "[pkb-lssd-cleanup] $dev held by $h" >&2 + done + done + echo "[pkb-lssd-cleanup] --- begin teardown ---" >&2 + for dev in {device_list} + do + test -b "$dev" || continue + devname=$(basename "$dev") + for holder in /sys/block/$devname/holders/* + do + test -e "$holder" || continue + h=$(basename "$holder") + echo "[pkb-lssd-cleanup] removing holder /dev/$h from $dev" >&2 + if echo "$h" | grep -q "^md" + then + mdadm --stop /dev/$h 2>/dev/null || true + else + dmsetup remove --force --noudevrules --noudevsync /dev/$h 2>/dev/null || true + fi + done + mounts=$(awk -v d="$dev" '$1==d{{print $2}}' /proc-host/mounts 2>/dev/null || true) + for mp in $mounts + do + echo "[pkb-lssd-cleanup] unmounting $mp from $dev" >&2 + umount -f "$mp" 2>/dev/null || true + done + done + swapoff -a 2>/dev/null || true + swapoff /dev/mapper/pkb_swap 2>/dev/null || true + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --force --noudevrules --noudevsync pkb_swap 2>/dev/null || true + dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + mdadm --stop --scan 2>/dev/null || true + mdadm --zero-superblock {device_list} 2>/dev/null || true + wipefs -a {device_list} 2>/dev/null || true + echo "[pkb-lssd-cleanup] lsblk after wipefs:" >&2 + lsblk {device_list} 2>/dev/null || true + partx -u {device_list} 2>/dev/null || true + losetup -D 2>/dev/null || true + rm -f /mnt/stateful_partition/pkb_swap.img 2>/dev/null || true + sleep 2 + """), ignore_failure=True) + + # Step 3: verify the devices are truly raw (unpartitioned). On GKE Ubuntu + # nodes the local NVMe device may be partitioned by node startup scripts + # even when --local-nvme-ssd-block is specified. The kernel refuses a + # whole-disk exclusive open (DM_TABLE_LOAD → EBUSY) when any partition of + # the disk is open by another process (e.g. the container overlay FS is + # backed by nvme1n1p1). Detect this and fall back to a loop device backed + # by a file on /mnt/stateful_partition (which IS the SSD partition). + raw_check_out, _ = _pod_exec( + pod, + textwrap.dedent(f""" + for dev in {device_list} + do + if lsblk -ln -o TYPE "$dev" 2>/dev/null | grep -q '^part$' + then + echo "[pkb-lssd] $dev is partitioned — cannot use as raw block device" >&2 + else + echo "$dev" + fi + done + """), + ignore_failure=True, + ) + raw_devices = [d.strip() for d in raw_check_out.strip().splitlines() if d.strip()] + + if not raw_devices: + logging.info( + '[swap_encryption] GKE: all LSSD devices are partitioned — ' + 'falling back to loop device on /mnt/stateful_partition' + ) + _setup_gke_lssd_stateful_loop_swap(pod) + return + + # Use only raw (unpartitioned) devices going forward. + devices = raw_devices + device_list = ' '.join(devices) + n = len(devices) + logging.info('[swap_encryption] GKE: using %d raw LSSD device(s): %s ' + 'dmcrypt=%s', n, device_list, _ENABLE_DMCRYPT.value) + + # For N=1 LSSD, skip mdadm entirely and target the raw device directly. + # For N>1 we stripe across multiple NVMe devices. + if n > 1: + _pod_exec(pod, textwrap.dedent(f""" + mdadm --create /dev/md0 --force \\ + --level=0 --raid-devices={n} \\ + {device_list} + test -b /dev/md0 || {{ echo "mdadm: /dev/md0 not created" >&2; exit 1; }} + """)) + swap_block_dev = '/dev/md0' + else: + swap_block_dev = devices[0] + logging.info('[swap_encryption] GKE: single LSSD — skipping mdadm, ' + 'using %s directly', swap_block_dev) + + if _ENABLE_DMCRYPT.value: + # Same dmsetup --noudevrules --noudevsync approach as _setup_gke_hyperdisk_swap. + _pod_exec(pod, textwrap.dedent(f""" + grep -q dm_crypt /proc/modules 2>/dev/null || {{ + KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) + [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true + }} + udevadm control --stop-exec-queue 2>/dev/null || true + KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n') + SIZE=$(blockdev --getsz {swap_block_dev}) + printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{swap_block_dev}" | \\ + dmsetup create swap_encrypted --noudevrules --noudevsync + udevadm control --start-exec-queue 2>/dev/null || true + unset KEY + dmsetup mknodes swap_encrypted 2>/dev/null || true + mkswap /dev/mapper/swap_encrypted + swapon /dev/mapper/swap_encrypted + """)) + logging.info('[swap_encryption] GKE: LSSD dm-crypt swap active on %s', + swap_block_dev) + else: + _pod_exec(pod, textwrap.dedent(f""" + mkswap {swap_block_dev} + swapon {swap_block_dev} + """)) + logging.info('[swap_encryption] GKE: LSSD plain swap active on %s', + swap_block_dev) + + +def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: + """Set up swap on the LSSD partition via a loop device. + + Used when the local NVMe device is partitioned by GKE startup scripts + and cannot be opened as a whole raw block device (DM_TABLE_LOAD EBUSY). + The DaemonSet mounts /mnt/stateful_partition (hostPath) from the host's + nvme1n1p1 — which is still local SSD storage. We create a large file + there and layer loop → dm-crypt → swap on top of it. + """ + img_path = '/mnt/stateful_partition/pkb_swap.img' + + # Clean up any previous run artifacts. + _pod_exec(pod, textwrap.dedent(f""" + swapoff -a 2>/dev/null || true + dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + losetup -D 2>/dev/null || true + rm -f {img_path} 2>/dev/null || true + """), ignore_failure=True) + + # Determine file size: 80% of available space, at least 16 GB. + size_out, _ = _pod_exec( + pod, + f"df -P /mnt/stateful_partition | awk 'NR==2{{print $4}}'", + ignore_failure=True, + ) + avail_kb = int(size_out.strip() or '0') + swap_gb = max(16, int(avail_kb * 0.8 / 1024 / 1024)) + logging.info('[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s', + swap_gb, img_path) + + # Allocate file (fallocate is instant on ext4; dd fallback for others). + _pod_exec(pod, textwrap.dedent(f""" + fallocate -l {swap_gb}G {img_path} 2>/dev/null || \\ + dd if=/dev/zero of={img_path} bs=1G count={swap_gb} + chmod 600 {img_path} + losetup --direct-io=on -f {img_path} + """), timeout=300) + + loop_out, _ = _pod_exec( + pod, + f"losetup -j {img_path} | awk -F: '{{print $1}}' | head -1", + ignore_failure=True, + ) + loop_dev = loop_out.strip() + if not loop_dev.startswith('/dev/loop'): + raise RuntimeError( + f'[swap_encryption] losetup failed for {img_path} — got: {loop_out!r}' + ) + logging.info('[swap_encryption] GKE: LSSD stateful-loop device: %s', loop_dev) + + if _ENABLE_DMCRYPT.value: + _pod_exec(pod, textwrap.dedent(f""" + grep -q dm_crypt /proc/modules 2>/dev/null || {{ + KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) + [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true + }} + udevadm control --stop-exec-queue 2>/dev/null || true + KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n') + SIZE=$(blockdev --getsz {loop_dev}) + printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{loop_dev}" | \\ + dmsetup create swap_encrypted --noudevrules --noudevsync + udevadm control --start-exec-queue 2>/dev/null || true + unset KEY + dmsetup mknodes swap_encrypted 2>/dev/null || true + mkswap /dev/mapper/swap_encrypted + swapon /dev/mapper/swap_encrypted + """)) + logging.info('[swap_encryption] GKE: LSSD stateful-loop dm-crypt swap active ' + 'on %s → %s', img_path, loop_dev) + else: + _pod_exec(pod, textwrap.dedent(f""" + mkswap {loop_dev} + swapon {loop_dev} + """)) + logging.info('[swap_encryption] GKE: LSSD stateful-loop plain swap active ' + 'on %s → %s', img_path, loop_dev) + + +_IO2_VOLUME_ID = '' # set by _ensure_io2_volume; serial-based detection + + +def _ensure_io2_volume() -> None: + """Create + attach a dedicated io2 EBS volume to the benchmark node so the + io2 test-matrix row swaps on real io2 hardware-encrypted storage. + + No-op unless --swap_encryption_swap_type=io2 on an AWS/EKS cluster. + Best-effort: logs and returns on failure. Stashes the created volume id in + _IO2_VOLUME_ID for serial-based device detection in _setup_eks_io2_swap. + """ + global _IO2_VOLUME_ID + if _SWAP_TYPE.value != 'io2': + return + out, _, rc = kubectl.RunKubectlCommand( + ['get', 'nodes', '-o', 'jsonpath={.items[0].spec.providerID}'], + raise_on_failure=False, + ) + provider = (out or '').strip() # aws:///us-east-1a/i-0abc... + if rc != 0 or 'aws://' not in provider: + logging.warning( + '[swap_encryption] io2 attach skipped: could not resolve ' + 'EC2 instance from providerID=%r', provider) + return + parts = [p for p in provider.split('/') if p] + instance_id, az = parts[-1], parts[-2] + region = az[:-1] + base = ['aws', 'ec2', '--region', region] + try: + create_args = [ + 'create-volume', + '--volume-type', 'io2', + '--size', '500', + '--iops', '16000', + '--availability-zone', az, + '--tag-specifications', + 'ResourceType=volume,Tags=[{Key=pkb,Value=swap_encryption}]', + ] + if _IO2_ENCRYPTED.value: + create_args.append('--encrypted') + if _IO2_KMS_KEY_ID.value: + create_args += ['--kms-key-id', _IO2_KMS_KEY_ID.value] + logging.info( + '[swap_encryption] io2 volume will be EBS-encrypted ' + '(row: hardware encryption)') + else: + logging.info('[swap_encryption] io2 volume UNENCRYPTED (baseline row)') + create_args += ['--query', 'VolumeId', '--output', 'text'] + vol_id, _, vrc = vm_util.IssueCommand( + base + create_args, raise_on_failure=False) + vol_id = (vol_id or '').strip() + if vrc != 0 or not vol_id.startswith('vol-'): + logging.warning('[swap_encryption] io2 create-volume failed: %r', vol_id) + return + vm_util.IssueCommand( + base + ['wait', 'volume-available', '--volume-ids', vol_id], + raise_on_failure=False) + vm_util.IssueCommand( + base + [ + 'attach-volume', + '--volume-id', vol_id, + '--instance-id', instance_id, + '--device', '/dev/sdf', + ], + raise_on_failure=False) + vm_util.IssueCommand( + base + ['wait', 'volume-in-use', '--volume-ids', vol_id], + raise_on_failure=False) + _IO2_VOLUME_ID = vol_id + logging.info( + '[swap_encryption] Attached io2 volume %s to %s as /dev/sdf', + vol_id, instance_id) + time.sleep(15) # allow the NVMe device node to appear + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] io2 attach error (continuing): %s', e) + + +def _setup_eks_swap(pod: str) -> None: + """Configure swap on EKS nodes — Instance Store OR io2 root disk. + + Swap type is selected by --swap_encryption_swap_type: + instance_store (default) – NVMe SSD attached by Nitro (i4i, m6id, c6id). + Nitro encrypts all block-device writes at hardware level; no extra + cryptsetup needed. + io2 – EBS io2 volume provisioned as the node root/data disk. + Used for apples-to-apples comparison against GKE hyperdisk-balanced. + """ + swap_type = _SWAP_TYPE.value + if swap_type in ('auto', 'instance_store'): + _setup_eks_instance_store_swap(pod) + elif swap_type == 'io2': + _setup_eks_io2_swap(pod) + else: + logging.warning( + '[swap_encryption] Unknown EKS swap type %s – fallback', swap_type) + _setup_eks_instance_store_swap(pod) + + +def _setup_eks_instance_store_swap(pod: str) -> None: + """Swap on AWS NVMe Instance Store (Nitro hardware-offloaded encryption).""" + logging.info('[swap_encryption] EKS: setting up Instance Store swap') + + # Find the Instance Store NVMe device (not the root EBS volume) + nvme_out, _ = _pod_exec( + pod, + "nvme list 2>/dev/null | awk '/Instance Storage/{print $1}' | head -1 || " + "lsblk -d -o NAME,MODEL | grep -i 'instance\\|nvme' | " + "grep -v 'nvme0' | awk '{print \"/dev/\"$1}' | head -1", + ignore_failure=True, + ) + device = nvme_out.strip() + if not device: + # Common Instance Store device paths on AWS + for candidate in ['/dev/nvme1n1', '/dev/nvme2n1', '/dev/xvdb']: + exists_out, _ = _pod_exec( + pod, f'test -b {candidate} && echo yes || echo no', + ignore_failure=True, + ) + if exists_out.strip() == 'yes': + device = candidate + break + + if not device: + logging.warning( + '[swap_encryption] No Instance Store NVMe found – creating swapfile' + ) + _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + return + + logging.info('[swap_encryption] EKS: Instance Store device: %s', device) + + # Nitro encrypts all Instance Store writes automatically. + # No additional cryptsetup required. + _pod_exec(pod, textwrap.dedent(f""" + mkswap {device} && \\ + swapon {device} + """)) + logging.info( + '[swap_encryption] EKS: Instance Store swap active on %s', device) + + +def _setup_eks_io2_swap(pod: str) -> None: + """Swap on AWS EBS io2 volume – apples-to-apples comparison vs GKE hyperdisk. + + EBS io2 volumes on Nitro instances are encrypted at rest by AWS KMS (if + enabled on the volume) or via Nitro-level hardware encryption. No additional + cryptsetup is needed here; we simply format the attached data disk as swap. + + Device discovery order: + 1. Match the io2 volume created by _ensure_io2_volume() by its NVMe serial + (serial == volume id without the dash). This is unambiguous and never + picks the root disk or the instance store regardless of nvmeXn1 + enumeration order on Nitro. + 2. First non-root EBS ("Elastic Block Store") block device that is not + currently mounted. + """ + logging.info('[swap_encryption] EKS: setting up io2 EBS swap') + + # Identify root device so we can exclude it. + root_out, _ = _pod_exec( + pod, + 'lsblk -no pkname $(findmnt -n -o SOURCE /) 2>/dev/null || echo nvme0n1', + ignore_failure=True, + ) + root_base = root_out.strip() or 'nvme0n1' + + # Identify the io2 volume UNAMBIGUOUSLY by its NVMe serial == volume id. + # An EBS NVMe device's serial equals the volume id minus the dash + # (vol-0abc... -> serial vol0abc...). + device = '' + target = _IO2_VOLUME_ID.replace('-', '') + if target: + ser_out, _ = _pod_exec( + pod, + 'for d in /sys/block/nvme*n1; do ' + '[ -e "$d" ] || continue; ' + 's=$(cat "$d/device/serial" 2>/dev/null | tr -d "-" | tr -d " "); ' + f'[ "$s" = "{target}" ] && {{ echo "/dev/$(basename "$d")"; break; }}; ' + 'done', + ignore_failure=True, + ) + device = ser_out.strip() + if device: + logging.info( + '[swap_encryption] EKS: io2 matched by serial %s -> %s', + target, device) + + if not device: + # Fallback: first non-root EBS device, excluding any device that is + # currently mounted (root) or already active swap. + disk_out, _ = _pod_exec( + pod, + 'for d in /sys/block/nvme*n1 /sys/block/xvd[b-z] /sys/block/sd[b-z];' + ' do [ -e "$d" ] || continue; n=$(basename "$d"); [ "$n" =' + f' "{root_base}" ] && continue; m=$(cat "$d/device/model" 2>/dev/null);' + ' echo "$m" | grep -qi "Elastic Block Store" || continue; mnt=$(lsblk' + ' -no MOUNTPOINT "/dev/$n" 2>/dev/null | tr -d " "); [ -n "$mnt" ] &&' + ' continue; echo "/dev/$n"; break; done', + ignore_failure=True, + ) + device = disk_out.strip() + if device: + logging.info( + '[swap_encryption] EKS: io2 fallback EBS device: %s', device) + + if not device: + logging.warning( + '[swap_encryption] No io2 EBS disk found – creating plain swapfile') + _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + return + + logging.info('[swap_encryption] EKS: io2 EBS device: %s', device) + + # EBS io2 encryption is handled at the AWS level (Nitro / KMS). + out, _ = _pod_exec( + pod, + textwrap.dedent(f""" + swapoff {device} 2>/dev/null || true + wipefs -a {device} 2>/dev/null || true + mkswap -f {device} && swapon {device} + swapon --show + """), + ignore_failure=True, + ) + if device not in out: + raise RuntimeError( + f'[swap_encryption] io2 swap did not activate on {device}; ' + f'swapon --show output: {out!r}. The device may be busy/mounted ' + '(wrong device picked) or mkswap failed.') + logging.info('[swap_encryption] EKS: io2 EBS swap active on %s', device) + + +def _setup_plain_swap_file(pod: str, size_gb: int) -> None: + """Fallback: create a loop-device-backed swapfile. + + A plain file on overlayfs (the container root) cannot be used as swap — + the kernel rejects it with EINVAL. Routing it through a loop device + presents a proper block device to the mm subsystem and succeeds. + """ + logging.info('[swap_encryption] Creating %dGB loop-device swap', size_gb) + _pod_exec(pod, textwrap.dedent(f""" + fallocate -l {size_gb}G /tmp/pkb_swapfile && \\ + chmod 600 /tmp/pkb_swapfile && \\ + LOOP=$(losetup -f) && \\ + losetup "$LOOP" /tmp/pkb_swapfile && \\ + mkswap "$LOOP" && \\ + swapon "$LOOP" && \\ + echo "swap loop device: $LOOP" + """)) + + +def _enable_zswap(pod: str) -> None: + """Enable zswap with lz4 compressor and 20% pool limit inside the pod.""" + logging.info('[swap_encryption] Enabling zswap (lz4, 20%% pool)') + for cmd in [ + 'echo 1 > /sys/module/zswap/parameters/enabled', + 'echo lz4 > /sys/module/zswap/parameters/compressor', + 'echo 20 > /sys/module/zswap/parameters/max_pool_percent', + 'echo z3fold > /sys/module/zswap/parameters/zpool', + ]: + _pod_exec(pod, cmd, ignore_failure=True) + + _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD From 7ef70158ce7d560385dd81e21043b5f9a1c9b9fe Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Fri, 19 Jun 2026 09:43:24 +0530 Subject: [PATCH 3/8] PR3: swap-encryption benchmark - layer 3/5 (single file swap_encryption_benchmark.py) --- .../swap_encryption_benchmark.py | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index ee2e76a665..b9e8a01879 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -151,6 +151,13 @@ ) +_FIO_RUNTIME_SEC = flags.DEFINE_integer( + 'swap_encryption_fio_runtime_sec', + 60, + 'Wall-clock runtime in seconds for each individual fio job.', +) + + _ENABLE_ZSWAP = flags.DEFINE_boolean( 'swap_encryption_enable_zswap', False, @@ -381,6 +388,17 @@ _DEFAULT_NODEPOOL = 'default-pool' +_FIO_JOBS = ( + ('rand_write_iops', 'randwrite', '4k', 256, 'Random write IOPS'), + ('rand_read_iops', 'randread', '4k', 256, 'Random read IOPS'), + ('rand_rw_mixed', 'randrw', '4k', 256, 'Mixed random R/W (50/50)'), + ('seq_write_bw', 'write', '1m', 64, 'Sequential write bandwidth'), + ('seq_read_bw', 'read', '1m', 64, 'Sequential read bandwidth'), + ('lat_write', 'randwrite', '4k', 1, 'Random write latency'), + ('lat_read', 'randread', '4k', 1, 'Random read latency'), +) + + def _daemonset_yaml(image: str) -> str: """Render the privileged benchmark DaemonSet manifest. @@ -573,6 +591,27 @@ def Run(spec) -> list[sample.Sample]: logging.info('[swap_encryption] swap device: %s', swap_dev) + # ── Tier 1 / Gate 1: fio microbenchmarks ───────────────────────────────── + tier1_results = [] + if _phase_selected('fio'): + logging.info( + '[swap_encryption] ── Tier 1 / Gate 1: fio microbenchmarks ──') + try: + tier1_results = _phase1_fio(pod, swap_dev, base_meta) + results += tier1_results + except Exception as e: # pylint: disable=broad-except + logging.error('[swap_encryption] Gate 1 FAILED — fio phase error: %s', e) + logging.error('[swap_encryption] Skipping Tiers 2 and 3 (no swap device)') + return results + + if not tier1_results: + logging.warning('[swap_encryption] Gate 1 produced no samples ' + '(loop-device skip or parse error) — ' + 'continuing to Tier 2 with caution') + else: + logging.info('[swap_encryption] Skipping Tier 1 (fio) — not selected by ' + '--swap_encryption_phases=%s', ','.join(_PHASES.value)) + # ── Cost estimate ───────────────────────────────────────────────────────── if _COLLECT_COST.value: elapsed = time.time() - t_run_start @@ -604,6 +643,22 @@ def Run(spec) -> list[sample.Sample]: f'the OOM killer (the container may have restarted in place), so the ' f'affected phase(s) produced no or partial data') + if _phase_selected('fio') and not tier1_results: + if swap_dev.startswith('/dev/loop'): + # Expected: COS blocks device-mapper from pod namespaces on single-disk + # nodes (n2/n4 without --swap_encryption_add_swap_disk or lssd). + # Tier 2/3 results are still valid; do NOT mark the run as degraded. + logging.warning( + '[swap_encryption] Gate 1 (fio) skipped — loop device %s has no ' + 'dm-crypt support from inside a pod. Tier 2/3 results are valid. ' + 'Use c4-*-lssd or --swap_encryption_add_swap_disk for fio data.', + swap_dev) + else: + _degraded_reasons.append( + 'Gate 1 (fio microbenchmarks) produced no samples — the raw swap ' + 'device was never characterised') + + degraded = bool(_degraded_reasons) results.append(sample.Sample( 'swap_encryption_run_status', @@ -2322,6 +2377,142 @@ def _enable_zswap(pod: str) -> None: _pod_exec(pod, cmd, ignore_failure=True) +def _phase1_fio( + pod: str, swap_dev: str, base_meta: dict +) -> list[sample.Sample]: + """Run fio directly on the swap block device for raw I/O characterisation. + + Skipped only for an UNINTENTIONAL loop fallback (a single-disk node with no + dedicated swap disk, where fio on the loop would measure the boot ext4 + filesystem rather than the swap stack). When the user explicitly selects the + boot_disk target (--swap_encryption_swap_type=boot_disk, methodology rows + 1-4), the loop over the boot disk IS the device under test, so fio runs and + characterises it. + + For dedicated second disks (hyperdisk, LSSD, NVMe) direct I/O is always + used and swap is restored (mkswap + swapon) after the fio run. + To get fio results use c4-*-lssd (local NVMe) or + --swap_encryption_add_swap_disk to provision a dedicated second disk. + """ + if swap_dev.startswith('/dev/loop') and _SWAP_TYPE.value != 'boot_disk': + logging.warning( + '[swap_encryption] Phase 1 (fio) SKIPPED for plain loop device %s ' + '(unintentional single-disk fallback). ' + 'fio on a loop-backed device measures the underlying ext4 filesystem ' + '(stateful_partition), not the swap stack. ' + 'Use c4-*-lssd, --swap_encryption_add_swap_disk, or ' + '--swap_encryption_swap_type=boot_disk for fio data.', + swap_dev, + ) + return [] + + results = [] + + _pod_exec(pod, f'swapoff {swap_dev}', ignore_failure=True) + + # Pre-fill device so read tests have real data (avoids zero-block optimisation + # by the storage controller skewing read latency measurements). + # Cap at 20 GiB — enough to warm up the dm-crypt pipeline and cover the fio + # runtime window. Writing 100% of a 500 GiB hyperdisk takes ~500+ seconds + # at provisioned throughput, which exceeds the PKB command timeout. + # Timeout: 20 GiB / ~150 MB/s (conservative dm-crypt write rate) + 60 s buffer. + _PREFILL_GIB = 20 + prefill_timeout = _PREFILL_GIB * 1024 // 150 + 60 # ~197 s, rounds up to ~200 s + prefill_timeout = max(prefill_timeout, 300) # floor at 5 min + logging.info('[swap_encryption] Pre-filling %d GiB of %s', _PREFILL_GIB, swap_dev) + # No --output-format=json for prefill; we only care that it completes. + # Still use --output to avoid streaming large stdout over the websocket. + _pod_exec(pod, ( + f'fio --name=prefill --filename={swap_dev} ' + f'--ioengine=libaio --direct=1 --rw=write --bs=1m ' + f'--size={_PREFILL_GIB}g --verify=0 --output=/tmp/pkb_fio_prefill.log' + ), timeout=prefill_timeout, ignore_failure=True) + + # Each fio job: runtime + 90 s buffer (run + JSON write + file read). + # We write fio output to a file inside the pod and retrieve it in a second + # short-lived kubectl exec, because: + # - A single 120 s kubectl exec session over GKE websocket can be reset + # by the control-plane load balancer mid-stream ("connection reset by + # peer"), losing the output. + # - Separating the long run from the short file-read gives each exec a + # much shorter window, avoiding the keepalive timeout. + fio_run_timeout = _FIO_RUNTIME_SEC.value + 90 + fio_read_timeout = 60 # just a cat of the JSON file + + for name, rw, bs, depth, label in _FIO_JOBS: + logging.info('[swap_encryption] fio: %s', name) + out_file = f'/tmp/pkb_fio_{name}.json' + # Remove any stale output first so a parse can never silently reuse a + # previous job's/run's result (rules out byte-identical results between + # runs being a caching artifact rather than a true device ceiling). + _pod_exec(pod, f'rm -f {out_file}', ignore_failure=True, _retries=0, + timeout=15) + run_cmd = ( + f'fio --name={name} --filename={swap_dev} ' + f'--ioengine=libaio --direct=1 --verify=0 --randrepeat=0 ' + f'--bs={bs} --iodepth={depth} --rw={rw} ' + f'--time_based --runtime={_FIO_RUNTIME_SEC.value}s ' + f'--output-format=json --output={out_file}' + ) + _, err = _pod_exec(pod, run_cmd, timeout=fio_run_timeout, + ignore_failure=True, _retries=0) + if 'connection reset by peer' in err: + logging.warning('[swap_encryption] fio %s: kubectl exec connection ' + 'reset; result may be incomplete', name) + out, _ = _pod_exec(pod, f'cat {out_file} 2>/dev/null || echo ""', + timeout=fio_read_timeout, ignore_failure=True) + results += _parse_fio_json(out, name, label, base_meta) + + # fio prefill overwrites the entire device, destroying the mkswap header. + # Re-stamp and re-enable before the remaining phases need active swap. + _pod_exec(pod, f'mkswap {swap_dev} && swapon {swap_dev}', + ignore_failure=True, timeout=120) + return results + + +def _parse_fio_json( + stdout: str, job_name: str, label: str, base_meta: dict +) -> list[sample.Sample]: + """Parse fio JSON output into PKB Samples.""" + results = [] + try: + data = json.loads(stdout) + except (json.JSONDecodeError, ValueError): + logging.warning('[swap_encryption] fio JSON parse failed for %s', job_name) + return results + + meta = dict(base_meta, fio_job=job_name, fio_label=label) + for job in data.get('jobs', []): + for direction in ('read', 'write'): + d = job.get(direction, {}) + if not d or d.get('io_bytes', 0) == 0: + continue + iops = float(d.get('iops', 0)) + bw_kib = float(d.get('bw', 0)) + clat = d.get('clat_ns', {}) + pct = clat.get('percentile', {}) + lat_mean = float(clat.get('mean', 0)) / 1000.0 + lat_p50 = float(pct.get('50.000000', 0)) / 1000.0 + lat_p99 = float(pct.get('99.000000', 0)) / 1000.0 + lat_p999 = float(pct.get('99.900000', 0)) / 1000.0 + m = dict(meta, direction=direction) + results += [ + sample.Sample( + f'{job_name}_{direction}_iops', iops, 'iops', m), + sample.Sample( + f'{job_name}_{direction}_bw_mbps', bw_kib / 1024, 'MB/s', m), + sample.Sample( + f'{job_name}_{direction}_lat_mean', lat_mean, 'us', m), + sample.Sample( + f'{job_name}_{direction}_lat_p50', lat_p50, 'us', m), + sample.Sample( + f'{job_name}_{direction}_lat_p99', lat_p99, 'us', m), + sample.Sample( + f'{job_name}_{direction}_lat_p999', lat_p999, 'us', m), + ] + return results + + _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD From 4438d61d7b990261ca7b80267e8284e0e598593c Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Fri, 19 Jun 2026 09:43:25 +0530 Subject: [PATCH 4/8] PR4: swap-encryption benchmark - layer 4/5 (single file swap_encryption_benchmark.py) --- .../swap_encryption_benchmark.py | 805 ++++++++++++++++++ 1 file changed, 805 insertions(+) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index b9e8a01879..5b32395e9c 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -158,6 +158,49 @@ ) +_STRESS_TIMEOUT_SEC = flags.DEFINE_integer( + 'swap_encryption_stress_timeout_sec', + 300, + 'Duration in seconds of each stress-ng memory-pressure phase.', +) + + +_STRESS_VM_BYTES = flags.DEFINE_string( + 'swap_encryption_stress_vm_bytes', + '28G', + 'Combined stress-ng working-set size (total in-flight footprint, not ' + 'per-worker). It is divided equally across --swap_encryption_stress_vm_' + 'workers before being passed to stress-ng, so the total memory touched ' + 'equals this value. Should exceed node RAM to force kernel swapping.', +) + + +_STRESS_VM_BYTES_LIST = flags.DEFINE_string( + 'swap_encryption_stress_vm_bytes_list', + '', + 'Comma-separated list of stress-ng --vm-bytes values to iterate over ' + 'in Phase 2a CPU-overhead sweeps, e.g. "14G,21G,28G". When non-empty ' + 'this overrides --swap_encryption_stress_vm_bytes and Phase 2a is run ' + 'once per entry so that the swap-pressure intensity curve is captured.', +) + + +_STRESS_VM_WORKERS = flags.DEFINE_integer( + 'swap_encryption_stress_vm_workers', + 4, + 'Number of parallel stress-ng --vm workers for Phase 2a. The total ' + 'working set (the autoscaled vm_bytes) is divided equally across workers, ' + 'so the combined footprint stays under RAM+swap (no OOM) while exceeding ' + 'RAM (forcing swap). Multiple workers are needed for fill speed — a ' + 'single write64 worker cannot dirty enough memory within the timeout to ' + 'reach RAM (run swap1: ~184 GB resident, no swap). To stop the N ' + 'workers\' resident sets from collapsing to one worker\'s share, the ' + 'stressor uses random access (rand-set) and disables KSM page-merging ' + '(without those, identical write64 pages across workers were merged, ' + 'leaving only ~vm_bytes/N resident and swap_out ~0).', +) + + _ENABLE_ZSWAP = flags.DEFINE_boolean( 'swap_encryption_enable_zswap', False, @@ -251,6 +294,19 @@ ) +_MIN_SWAP_OUT_PAGES = flags.DEFINE_integer( + 'swap_encryption_min_swap_out_pages', + 1000, + 'Minimum peak swap-out rate (pages/s) that Phase 2a must reach for the run ' + 'to count as a real swap-encryption measurement. Below this the working ' + 'set never meaningfully paged (e.g. run swap1 peaked at 176 pages/s of ' + 'noise yet "passed" the old zero-only gate), so the dm-crypt overhead is ' + 'hollow and the run is flagged degraded. A genuinely swapping run peaks in ' + 'the tens-to-hundreds of thousands of pages/s. Set 0 to accept any ' + 'non-zero swap-out (legacy behaviour).', +) + + _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( 'swap_encryption_benchmark_machine_type', 'n4-highmem-32', @@ -373,6 +429,9 @@ _active_pod: list[str] = [] # single-element list so closures can mutate it +_stress_vm_method: list[str] = [] # single-element list; '' means no --vm-method flag + + _degraded_reasons: list[str] = [] @@ -399,6 +458,15 @@ ) +_VMSTAT_LOG = '/tmp/pkb_vmstat.log' + + +_PIDSTAT_LOG = '/tmp/pkb_pidstat.log' + + +_CRYPTO_PROCS = ('kswapd', 'kworker', 'kcryptd', 'dmcrypt_write') + + def _daemonset_yaml(image: str) -> str: """Render the privileged benchmark DaemonSet manifest. @@ -612,6 +680,22 @@ def Run(spec) -> list[sample.Sample]: logging.info('[swap_encryption] Skipping Tier 1 (fio) — not selected by ' '--swap_encryption_phases=%s', ','.join(_PHASES.value)) + # ── Tier 2 / Gate 2: stress-ng CPU overhead + I/O interference ─────────── + if _phase_selected('2a') or _phase_selected('2b'): + logging.info('[swap_encryption] ── Tier 2 / Gate 2: stress-ng phases ──') + try: + if _phase_selected('2a'): + logging.info('[swap_encryption] Phase 2a: CPU overhead') + results += _phase2a_cpu_overhead(pod, base_meta) + if _phase_selected('2b'): + logging.info('[swap_encryption] Phase 2b: I/O interference') + results += _phase2b_io_interference(pod, base_meta) + except Exception as e: # pylint: disable=broad-except + logging.error('[swap_encryption] Gate 2 FAILED — stress phase error: %s', + e) + logging.warning('[swap_encryption] Proceeding to Tier 3 (workloads are ' + 'independent of stress-ng results)') + # ── Cost estimate ───────────────────────────────────────────────────────── if _COLLECT_COST.value: elapsed = time.time() - t_run_start @@ -2513,6 +2597,727 @@ def _parse_fio_json( return results +def _parse_vm_bytes_to_mb(vm_bytes: str) -> float: + """Parse a vm-bytes string like '28G', '512M', '1024k' into megabytes.""" + vm_bytes = vm_bytes.strip() + if not vm_bytes: + return 0.0 + suffix = vm_bytes[-1].upper() + try: + value = float(vm_bytes[:-1]) + except ValueError: + return 0.0 + if suffix == 'G': + return value * 1024.0 + elif suffix == 'M': + return value + elif suffix == 'K': + return value / 1024.0 + elif suffix == 'T': + return value * 1024.0 * 1024.0 + else: + # Assume bytes + try: + return float(vm_bytes) / (1024.0 * 1024.0) + except ValueError: + return 0.0 + + +def _per_worker_vm_bytes(total_vm_bytes: str, workers: int) -> str: + """Split a *total* vm-bytes target across N stress-ng --vm workers. + + stress-ng allocates ``--vm-bytes`` PER worker, so ``--vm N --vm-bytes B`` + touches ``N * B`` of memory. Every vm_bytes value in this benchmark (the + --swap_encryption_stress_vm_bytes flag and the _autoscale_vm_bytes result) + represents the intended *combined* footprint, as documented on + --swap_encryption_stress_vm_workers ("workers divide vm_bytes equally ... + the combined in-flight footprint equals vm_bytes"). We therefore divide by + the worker count before handing the value to stress-ng; otherwise N>1 + workers allocate N x the target and the kernel OOM-kills the whole pod + (observed as stress-ng rc=137, after which all later phases fail with + "pods not found"). + + Returns a stress-ng-friendly ``M`` string (megabytes), floored to at + least 1M. + """ + workers = max(1, int(workers)) + total_mb = _parse_vm_bytes_to_mb(total_vm_bytes) + if total_mb <= 0: + # Unparseable — fall back to letting stress-ng divide nothing rather than + # silently changing behaviour; the caller's value is passed through. + return total_vm_bytes + per_worker_mb = max(1, int(total_mb / workers)) + return f'{per_worker_mb}M' + + +def _cgroup_swap_limit_mb(pod: str) -> float: + """Return the swap budget (in MB) that the benchmark cgroup can actually use. + + GKE sets the per-container cgroup v2 ``memory.swap.max`` to 0, so even though + the node advertises a large swap device the container cannot page anything + out. Sizing stress-ng against the *node* swap total in that case guarantees + an OOM kill. This probe finds the swap budget of *our* cgroup so the caller + can size against reality. + + We locate our own cgroup from the host-mounted /sys by finding the + ``cgroup.procs`` file that lists this shell's PID — ``hostPID: true`` means + ``$$`` is a host-namespace PID that appears in those files, and the + kubectl-exec'd shell shares the container's cgroup with stress-ng. + + Returns: + ``float('inf')`` when swap is uncapped (``max``); the limit in MB when + capped to a finite value; ``0.0`` when swap is fully locked + (``memory.swap.max == 0``); ``-1.0`` when the limit could not be read (the + caller then falls back to the legacy node-total behaviour). + """ + probe = textwrap.dedent(""" + mypid=$$ + for procf in $(find /sys/fs/cgroup -path '*kubepods*' -name cgroup.procs 2>/dev/null) + do + if grep -qx "$mypid" "$procf" 2>/dev/null + then + d=$(dirname "$procf") + if [ -f "$d/memory.swap.max" ] + then + echo "V2=$(cat "$d/memory.swap.max" 2>/dev/null)" + elif [ -f "$d/memory.memsw.limit_in_bytes" ] && [ -f "$d/memory.limit_in_bytes" ] + then + echo "MEMSW=$(cat "$d/memory.memsw.limit_in_bytes" 2>/dev/null) MEM=$(cat "$d/memory.limit_in_bytes" 2>/dev/null)" + fi + break + fi + done + """) + try: + out, _ = _pod_exec(pod, probe, timeout=20, ignore_failure=True) + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] cgroup swap-limit probe failed: %s', e) + return -1.0 + + text = (out or '').strip() + m = re.search(r'V2=(\S+)', text) + if m: + val = m.group(1) + if val == 'max': + return float('inf') + try: + return int(val) / (1024.0 * 1024.0) + except ValueError: + return -1.0 + # cgroup v1: the combined RAM+swap ceiling is memsw; swap budget = memsw-mem. + m = re.search(r'MEMSW=(\S+)\s+MEM=(\S+)', text) + if m: + try: + memsw = int(m.group(1)) + mem = int(m.group(2)) + except ValueError: + return -1.0 + # A near-2^63 sentinel means "unlimited" in cgroup v1. + if memsw >= (1 << 62): + return float('inf') + return max(0.0, (memsw - mem) / (1024.0 * 1024.0)) + return -1.0 + + +def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: + """Ensure vm_bytes forces real swap I/O without hard-crashing the container. + + Strategy + -------- + We want stress-ng to overflow into swap so that dm-crypt / Nitro encryption + overhead is actually measured. Two competing constraints apply: + + 1. vm_bytes must exceed available RAM so that anonymous pages are paged out + to the swap device. A value below ~95 % of RAM fits entirely in memory + and produces swap_out_pages_per_sec = 0 (benchmark defeats itself). + + 2. vm_bytes must not be so large that the kernel OOM-kills the whole + container before any meaningful swap activity is recorded. + + Target formula + -------------- + target = RAM + min(swap_size × 0.25, 64 GB) + + This guarantees at least 25 % of the swap device is actively exercised + (measured swap I/O) while keeping the allocation safely within what the + kernel can page out given the available swap space. The 64 GB cap prevents + extremely large targets on machines with huge swap devices. + + On large-RAM machines (e.g. n4-highmem-32, 252 GB) the old 110%-of-RAM + formula only overflowed by ~25 GB; with sequential write64 patterns the + kernel handled that via LRU page eviction without actually hitting the swap + device, yielding swap_out = 0. The new formula forces a much larger working + set into swap. + + Hard ceiling + ------------ + Regardless of the formula, cap at RAM + swap_size - 4 GB (4 GB headroom) + to avoid exhausting the swap device and triggering kernel panics. + """ + try: + meminfo_out, _ = _pod_exec(pod, 'cat /proc/meminfo', timeout=15) + node_ram_kb = 0 + swap_total_kb = 0 + for line in meminfo_out.splitlines(): + if line.startswith('MemTotal:'): + parts = line.split() + if len(parts) >= 2: + node_ram_kb = int(parts[1]) + elif line.startswith('SwapTotal:'): + parts = line.split() + if len(parts) >= 2: + swap_total_kb = int(parts[1]) + if node_ram_kb and swap_total_kb: + break + + if node_ram_kb <= 0: + logging.warning('[swap_encryption] Could not read MemTotal; using vm_bytes=%s', vm_bytes) + return vm_bytes + + node_ram_mb = node_ram_kb / 1024.0 + swap_total_mb = swap_total_kb / 1024.0 + requested_mb = _parse_vm_bytes_to_mb(vm_bytes) + if requested_mb <= 0: + return vm_bytes + + # The node may advertise a large SwapTotal while THIS cgroup is forbidden + # from using it (GKE sets memory.swap.max=0 per container). Size against + # the swap the cgroup can actually reach, not the node total — otherwise a + # value like 32G OOM-kills the pod the instant it exceeds RAM. + cgroup_swap_mb = _cgroup_swap_limit_mb(pod) + usable_swap_mb = swap_total_mb # default / legacy when probe is inconclusive + if cgroup_swap_mb == 0.0: + # Swap is fully locked. Cap the working set just under RAM so the pod + # survives, and mark the run degraded: swap-encryption overhead cannot be + # measured when the cgroup cannot page out. + safe_gb = max(1, int(node_ram_mb * 0.9 / 1024)) + msg = (f'cgroup swap is locked (memory.swap.max=0); the ' + f'{swap_total_mb/1024:.0f} GB node swap device is unreachable. ' + f'Capping stress-ng vm_bytes {vm_bytes} → {safe_gb}G (0.9 x RAM) ' + f'to keep the pod alive — swap-encryption overhead will NOT be ' + f'measured this run') + logging.error('[swap_encryption] %s', msg) + _degraded_reasons.append(msg) + return f'{safe_gb}G' + if 0.0 < cgroup_swap_mb < float('inf'): + # cgroup permits a finite swap budget smaller than the device. + usable_swap_mb = min(swap_total_mb, cgroup_swap_mb) + # cgroup_swap_mb == inf -> swap fully usable (node total stands) + # cgroup_swap_mb == -1 -> undetermined; fall back to node total (legacy) + + # Desired overflow: 25% of usable swap capped at 64 GB, minimum 4 GB. + overflow_mb = max(min(usable_swap_mb * 0.25, 64.0 * 1024), 4.0 * 1024) + target_mb = node_ram_mb + overflow_mb + + # Hard ceiling: never exceed RAM + usable swap − 4 GB headroom. + if usable_swap_mb > 0: + ceiling_mb = node_ram_mb + usable_swap_mb - 4096.0 + target_mb = min(target_mb, ceiling_mb) + else: + # No usable swap at all (and not the locked-at-0 case handled above): + # keep the working set just under RAM. + target_mb = min(target_mb, node_ram_mb * 0.9) + + target_gb = max(1, int(target_mb / 1024)) # floor to GB for a clean flag + + if requested_mb < node_ram_mb * 0.95: + new_vm_bytes = f'{target_gb}G' + logging.warning( + '[swap_encryption] Auto-scaling vm_bytes UP: %s → %s ' + '(RAM %.0f GB, swap %.0f GB; original value would not trigger swap)', + vm_bytes, new_vm_bytes, node_ram_mb / 1024, swap_total_mb / 1024, + ) + return new_vm_bytes + + if requested_mb > target_mb: + new_vm_bytes = f'{target_gb}G' + logging.warning( + '[swap_encryption] Capping vm_bytes DOWN: %s → %s ' + '(RAM %.0f GB, swap %.0f GB; original value risks swap exhaustion)', + vm_bytes, new_vm_bytes, node_ram_mb / 1024, swap_total_mb / 1024, + ) + return new_vm_bytes + + return vm_bytes + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] _autoscale_vm_bytes failed (%s); using %s', e, vm_bytes) + return vm_bytes + + +def _get_stress_vm_method(pod: str) -> str: + """Detect the best --vm-method argument for stress-ng on this node. + + stress-ng vm-method support varies by version and distro: + - Older Ubuntu / some GKE images: supports 'mmap' + - Newer Ubuntu on n4-highmem-32 (kernel 6.8+ GKE): 'mmap' removed; supports + 'write64', 'rand-set', etc. + + We prefer 'mmap' (lowest overhead, no kernel structure cycling), fall back to + 'write64' (simple sequential writes, universally available), then 'rand-set', + and if none are listed we return '' so callers omit the --vm-method flag + entirely (stress-ng then uses its compiled-in default). + + NOTE on forcing swap (two independent requirements): + (a) The working set must exceed RAM. Without --vm-keep each worker re-mmaps + and re-touches its full slice every iteration, so all + --swap_encryption_stress_vm_workers slices are simultaneously resident and + the combined footprint exceeds RAM (run 910c8da5 swapped ~10k pages/s with + write64 and no --vm-keep). Adding --vm-keep made stress-ng reuse one + quiescent mapping, the resident set plateaued below RAM, and the gate + fired — so we must NOT pass --vm-keep. + (b) The workers must stay BUSY for the whole phase. Do NOT pass --vm-hang 0: + stress-ng documents "--vm-hang 0" as "sleep for an INFINITE time before + unmapping", so each worker wrote its slice once and then slept for the + rest of the run — usr+sys CPU was ~10 s out of 300 s and si/so stayed 0 + (runs 14907cff, config1/111, even with KSM disabled and rand-set). + Omitting --vm-hang entirely lets the workers loop continuously, keeping + the slices hot so the over-RAM remainder pages to swap throughout. + + Result is cached in _stress_vm_method so the detection kubectl exec only runs + once per benchmark run. + """ + if _stress_vm_method: + return _stress_vm_method[0] + + try: + # stress-ng prints its valid vm-methods to stdout when given an invalid one. + out, _, _ = kubectl.RunKubectlCommand( + ['exec', (_active_pod[0] if _active_pod else pod), + '-n', _DS_NAMESPACE, + '--', 'bash', '-c', + 'stress-ng --vm 1 --vm-bytes 1M --vm-method __invalid__ --timeout 1s 2>&1 || true'], + raise_on_failure=False, timeout=15, + ) + combined = out.lower() + # Prefer rand-set: random access keeps every page of each worker's slice + # hot (no cold pages behind a sequential write pointer to reclaim) and + # writes non-identical data (so KSM cannot merge the workers' regions). + # write64 is sequential and was empirically reclaimed / merged, leaving the + # resident set below RAM and swap_out ~0. + if 'rand-set' in combined: + method = 'rand-set' + elif 'mmap' in combined: + method = 'mmap' + elif 'write64' in combined: + method = 'write64' + else: + method = '' # omit flag; use stress-ng default + logging.info('[swap_encryption] stress-ng vm-method detected: %r', method or '(default)') + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] vm-method detection failed (%s); using rand-set', e) + method = 'rand-set' + + _stress_vm_method.append(method) + return method + + +def _stress_vm_method_flag(pod: str) -> str: + """Return the --vm-method flag string, or empty string if none.""" + method = _get_stress_vm_method(pod) + return f'--vm-method {method}' if method else '' + + +def _phase2a_cpu_overhead(pod: str, base_meta: dict) -> list[sample.Sample]: + """Measure CPU cost of dm-crypt / Nitro while stress-ng drives swap I/O. + + If --swap_encryption_stress_vm_bytes_list is set the phase is run once per + listed intensity value so that a full pressure-curve is captured (gap 5). + Otherwise the single value from --swap_encryption_stress_vm_bytes is used. + + Auto-scaling: if the requested vm_bytes is less than 95% of node RAM, it is + automatically increased to 110% of node RAM so that swap is actually + triggered on large-RAM machines (e.g. n4-highmem-32 with 256 GB). + """ + # Build the list of vm-bytes intensities to sweep (gap 5) + if _STRESS_VM_BYTES_LIST.value.strip(): + intensities = [v.strip() for v in _STRESS_VM_BYTES_LIST.value.split(',') + if v.strip()] + else: + intensities = [_STRESS_VM_BYTES.value] + + results = [] + for vm_bytes in intensities: + scaled = _autoscale_vm_bytes(pod, vm_bytes) + logging.info('[swap_encryption] Phase 2a: stress-ng intensity %s', scaled) + results += _run_cpu_overhead_sweep(pod, base_meta, scaled) + return results + + +def _run_cpu_overhead_sweep( + pod: str, base_meta: dict, vm_bytes: str +) -> list[sample.Sample]: + """Phase 2a stressor sweep, WITH RETRY for flaky swap. + + Driving the multi-worker rand-set working set past RAM into swap is + empirically non-deterministic on these nodes: the SAME config produced + ~670k pages/s on some runs and <300 on others. So we retry: if an attempt + completes but peak swap-out is below the threshold (and it did not OOM), + reclaim memory and re-run, keeping the BEST attempt. An OOM, or a peak + at/above threshold, ends the retries immediately. + """ + meta = dict(base_meta, phase='cpu_overhead', stress_vm_bytes=vm_bytes) + timeout = _STRESS_TIMEOUT_SEC.value + interval = 2 + n_samples = timeout // interval + 10 + vmstat_log = f'/tmp/pkb_vmstat_{vm_bytes}.log' + pidstat_log = f'/tmp/pkb_pidstat_{vm_bytes}.log' + workers = max(1, _STRESS_VM_WORKERS.value) + per_worker = _per_worker_vm_bytes(vm_bytes, workers) + min_so = _MIN_SWAP_OUT_PAGES.value + method_flag = _stress_vm_method_flag(pod) + max_attempts = 3 + best = None + + for attempt in range(1, max_attempts + 1): + t0 = time.time() + stress_out, _ = _pod_exec(pod, textwrap.dedent(f""" + echo 2 > /sys/kernel/mm/ksm/run 2>/dev/null || true + echo 0 > /sys/kernel/mm/ksm/run 2>/dev/null || true + sysctl -w vm.swappiness=100 >/dev/null 2>&1 || true + PKB_MCG=$(awk -F: '/^0::/{{print $3}}' /proc/self/cgroup 2>/dev/null) + echo "[pkb] phase2a attempt={attempt}/{max_attempts} ksm_run=$(cat /sys/kernel/mm/ksm/run 2>/dev/null || echo n/a) swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null) MemAvailable_kB=$(awk '/MemAvailable/{{print $2}}' /proc/meminfo) memory.swap.max=$(cat /sys/fs/cgroup$PKB_MCG/memory.swap.max 2>/dev/null || echo n/a) workers={workers} per_worker={per_worker}" + vmstat {interval} {n_samples} > {vmstat_log} 2>&1 & + VMSTAT_PID=$! + pidstat -u {interval} {n_samples} -p ALL > {pidstat_log} 2>&1 & + PISTAT_PID=$! + stress-ng --vm {workers} \\ + --vm-bytes {per_worker} \\ + {method_flag} \\ + --timeout {timeout}s \\ + --metrics-brief 2>&1 || true + kill $VMSTAT_PID $PISTAT_PID 2>/dev/null || true + """), timeout=timeout + 60, ignore_failure=True) + elapsed = time.time() - t0 + + completed_cleanly = ('successful run completed' in stress_out.lower() + or 'metrics-brief' in stress_out.lower() + or 'bogo-ops' in stress_out.lower()) + oom_killed = (not completed_cleanly) and elapsed < timeout * 0.8 + vmstat_out, _ = _pod_exec(pod, f'cat {vmstat_log}', ignore_failure=True) + pidstat_out, _ = _pod_exec(pod, f'cat {pidstat_log}', ignore_failure=True) + vmstat_samples = _parse_vmstat(vmstat_out, meta) + swap_out_max = max( + (s.value for s in vmstat_samples + if s.metric in ('swap_out_pages_per_sec', + 'swap_out_pages_per_sec_max')), default=0.0) + bogo = None + for line in stress_out.splitlines(): + mm = re.search(r'vm\s+\d+\s+(\d+)\s+\S+\s+bogo-ops', line) + if mm: + bogo = float(mm.group(1)) + break + logging.info('[swap_encryption] Phase 2a attempt %d/%d: peak swap-out ' + '%.0f pages/s (completed=%s, oom=%s)', attempt, max_attempts, + swap_out_max, completed_cleanly, oom_killed) + if best is None or swap_out_max > best['swap_out_max']: + best = dict(elapsed=elapsed, oom_killed=oom_killed, + swap_out_max=swap_out_max, vmstat_samples=vmstat_samples, + pidstat_out=pidstat_out, bogo=bogo) + if oom_killed or swap_out_max >= min_so: + break + if attempt < max_attempts: + logging.warning('[swap_encryption] Phase 2a swap-out %.0f < %d threshold ' + '— reclaiming and retrying (%d/%d)', swap_out_max, min_so, + attempt + 1, max_attempts) + _pod_exec(pod, textwrap.dedent(""" + echo -1000 > /proc/self/oom_score_adj 2>/dev/null || true + pkill -9 stress-ng 2>/dev/null || true + sleep 3; sync; echo 1 > /proc/sys/vm/drop_caches 2>/dev/null || true + """), ignore_failure=True, timeout=60) + + # Emit samples from the BEST attempt. + results = [ + sample.Sample('stress_ng_duration_sec', best['elapsed'], 's', meta), + sample.Sample('stress_ng_completed', + 0.0 if best['oom_killed'] else 1.0, 'status', meta), + ] + if best['bogo'] is not None: + results.append(sample.Sample('stress_ng_bogo_ops', best['bogo'], 'ops', + meta)) + results += best['vmstat_samples'] + results += _parse_pidstat(best['pidstat_out'], meta) + + # Swap-activity gate: a completed run that moved ~no pages to swap never + # exercised the encrypted swap path (the headline numbers would be hollow). + if best['oom_killed']: + msg = (f'stress-ng (vm_bytes={vm_bytes}) was OOM-killed — the cgroup could ' + f'not page anonymous memory out to swap; swap-encryption overhead ' + f'was not measured') + logging.error('[swap_encryption] %s', msg) + _degraded_reasons.append(msg) + elif best['swap_out_max'] < min_so: + msg = (f'stress-ng (vm_bytes={vm_bytes}) peak swap-out was only ' + f'{best["swap_out_max"]:.0f} pages/s (< {min_so} threshold) after ' + f'{max_attempts} attempts — the working set never meaningfully ' + f'paged to swap. Check vm_bytes vs RAM and the swap device') + logging.error('[swap_encryption] %s', msg) + _degraded_reasons.append(msg) + + return results + + +def _parse_vmstat(output: str, base_meta: dict) -> list[sample.Sample]: + """Parse vmstat output for swap rates AND CPU utilisation. + + Standard vmstat column layout (non-header data lines, 0-indexed): + r b swpd free buff cache si so bi bo in cs us sy id wa st + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + + si=6, so=7 – swap-in / swap-out pages/s + us=12 – user CPU % + sy=13 – system (kernel) CPU % ← gap 2: system time % + id=14 – idle CPU % + wa=15 – I/O wait CPU % + total_active = us + sy + wa ← gap 1: total CPU utilisation + """ + si_vals, so_vals = [], [] + us_vals, sy_vals, wa_vals = [], [], [] + + for line in output.splitlines(): + parts = line.split() + if len(parts) < 17 or not parts[0].isdigit(): + continue + try: + si_vals.append(float(parts[6])) + so_vals.append(float(parts[7])) + us_vals.append(float(parts[12])) + sy_vals.append(float(parts[13])) + wa_vals.append(float(parts[15])) + except (ValueError, IndexError): + pass + + if not si_vals: + return [] + + meta = dict(base_meta, metric_source='vmstat') + + def _mean(lst): + return sum(lst) / len(lst) if lst else 0.0 + + def _peak(lst): + return max(lst) if lst else 0.0 + + total_active = [u + s + w for u, s, w in zip(us_vals, sy_vals, wa_vals)] + + return [ + # Swap rates + sample.Sample( + 'swap_in_pages_per_sec', _mean(si_vals), 'pages/s', meta), + sample.Sample( + 'swap_in_pages_per_sec_max', _peak(si_vals), 'pages/s', meta), + sample.Sample( + 'swap_out_pages_per_sec', _mean(so_vals), 'pages/s', meta), + sample.Sample( + 'swap_out_pages_per_sec_max', _peak(so_vals), 'pages/s', meta), + # Total CPU utilisation (gap 1) + sample.Sample( + 'total_cpu_pct_avg', _mean(total_active), '%', meta), + sample.Sample( + 'total_cpu_pct_max', _peak(total_active), '%', meta), + # System (kernel) time % – encryption overhead signal (gap 2) + sample.Sample('system_time_pct_avg', _mean(sy_vals), '%', meta), + sample.Sample('system_time_pct_max', _peak(sy_vals), '%', meta), + # User and I/O-wait for completeness + sample.Sample('user_cpu_pct_avg', _mean(us_vals), '%', meta), + sample.Sample('iowait_cpu_pct_avg', _mean(wa_vals), '%', meta), + ] + + +def _parse_pidstat(output: str, base_meta: dict) -> list[sample.Sample]: + """Parse CPU % for swap/encryption-related kernel threads from pidstat.""" + cpu_by_proc: dict[str, list[float]] = {} + for line in output.splitlines(): + parts = line.split() + if len(parts) < 9: + continue + proc = parts[-1] + if not any(t in proc for t in _CRYPTO_PROCS): + continue + try: + cpu_by_proc.setdefault(proc, []).append(float(parts[7])) + except (ValueError, IndexError): + pass + results = [] + meta = dict(base_meta, metric_source='pidstat') + for proc, vals in cpu_by_proc.items(): + m = dict(meta, process=proc) + results += [ + sample.Sample(f'cpu_pct_avg_{proc}', sum(vals) / len(vals), '%', m), + sample.Sample(f'cpu_pct_max_{proc}', max(vals), '%', m), + ] + return results + + +def _launch_confined_bg_stress(pod: str, timeout_s: int, logfile: str) -> None: + """Launch the Phase 2b/3a background swap stressor confined to its OWN + memory-capped cgroup, so it drives swap pressure WITHOUT starving the + concurrent foreground workload (fio / Redis) or OOM-killing the pod. + + On a small node (config1, 30 GB) a flat 32 GB stressor plus a concurrent + workload exhausts RAM faster than the kernel pages out, and the OOM killer + takes the foreground process (the under-pressure app_io fio died with + rc=137). Confining the stressor to memory.max = 60% of RAM (with unlimited + swap) makes it page within its own budget; the other ~40% of RAM stays free + for the workload, and if the stressor overruns its cap only IT is killed — + never the pod or the workload. + + Config-2 safety: on a 256 GB node, 60% = ~150 GB, far above the 32 GB + stressor, so the cap is never reached and behaviour is unchanged. + Best-effort: if the cgroup can't be created the stressor still runs in the + main cgroup (degrades to prior behaviour, not worse). MemTotal is read with + grep/cut (no awk) to keep this clear of f-string brace escaping. + """ + method = _stress_vm_method_flag(pod) + vm_bytes = _STRESS_VM_BYTES.value + _pod_exec(pod, textwrap.dedent(f""" + nohup bash -c ' + BG=/sys/fs/cgroup/pkb_bgstress + mkdir -p "$BG" 2>/dev/null || true + echo +memory > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true + echo max > "$BG/memory.swap.max" 2>/dev/null || true + MT_KB=$(grep -m1 MemTotal /proc/meminfo | tr -s " " | cut -d" " -f2) + echo $(( MT_KB * 1024 * 60 / 100 )) > "$BG/memory.max" 2>/dev/null || true + echo $$ > "$BG/cgroup.procs" 2>/dev/null || true + exec stress-ng --vm 1 --vm-bytes {vm_bytes} {method} --timeout {timeout_s}s + ' >{logfile} 2>&1 & + disown + echo STRESS_STARTED + """), timeout=30) + + +def _set_memory_high_guard(pod: str, fraction: float = 0.9) -> None: + """Cap the container cgroup ``memory.high`` at `fraction` x RAM. + + Phases 2b (I/O interference) and 3a (Redis) run a background stressor *and* a + concurrent foreground workload (an 8 GB fio file / a Redis dataset). On a + small-RAM node (config1, 30 GB) their combined footprint exceeds RAM and the + hard OOM killer (``memory.max``) terminates the pod (rc=137), wiping out both + phases. ``memory.high`` is a soft limit: when the cgroup crosses it the + kernel reclaims and *swaps* aggressively (throttling the cgroup) instead of + killing it — which is exactly the swap pressure these phases want to create. + + Config-2 safety: this is a no-op in effect on large-RAM nodes. On + n4-highmem-32 (256 GB) the 32 GB background workload never approaches 0.9 x + 256 GB = 230 GB, so the soft limit is never crossed and behaviour is + unchanged. Phase 2a is deliberately NOT guarded (it works on both configs). + Best-effort; any failure is ignored. + """ + _pod_exec(pod, textwrap.dedent(f""" + PKB_MCG=$(awk -F: '/^0::/{{print $3}}' /proc/self/cgroup 2>/dev/null) + MT_KB=$(awk '/MemTotal/{{print $2}}' /proc/meminfo) + HIGH=$(( MT_KB * 1024 / 100 * {int(fraction * 100)} )) + if [ -n "$PKB_MCG" ] && [ -f "/sys/fs/cgroup$PKB_MCG/memory.high" ]; then + echo $HIGH > "/sys/fs/cgroup$PKB_MCG/memory.high" 2>/dev/null \ + && echo "[pkb] memory.high set to $HIGH bytes ({int(fraction * 100)}% RAM) — pod will swap, not OOM" \ + || echo "[pkb] WARNING: could not set memory.high" >&2 + fi + """), ignore_failure=True, timeout=30, _retries=0) + + +def _reset_memory_high_guard(pod: str) -> None: + """Restore ``memory.high`` to ``max`` after a guarded phase.""" + _pod_exec(pod, textwrap.dedent(""" + PKB_MCG=$(awk -F: '/^0::/{print $3}' /proc/self/cgroup 2>/dev/null) + if [ -n "$PKB_MCG" ] && [ -f "/sys/fs/cgroup$PKB_MCG/memory.high" ]; then + echo max > "/sys/fs/cgroup$PKB_MCG/memory.high" 2>/dev/null || true + fi + """), ignore_failure=True, timeout=30, _retries=0) + + +def _phase2b_io_interference(pod: str, base_meta: dict) -> list[sample.Sample]: + """Quantify drop in application I/O when swap is under simultaneous pressure.""" + results = [] + # IMPORTANT: keep this OFF tmpfs. /tmp is RAM-backed (tmpfs/overlay), so an + # 8 GB fio file there consumes 8 GB of RAM and OOM-kills the pod on a small + # node (config1, rc=137 at "Laying out IO file") before any swap pressure is + # even applied. /mnt/stateful_partition is the node's persistent boot disk + # (hostPath mount) — the file lives on disk, not RAM, and the fio results + # then measure real disk I/O under swap pressure, which is the intent. + app_file = '/mnt/stateful_partition/pkb_app_io' + timeout = _STRESS_TIMEOUT_SEC.value + meta = dict(base_meta, phase='io_interference') + + # Relieve memory pressure via swap rather than the OOM killer (see helper). + # No-op on large-RAM nodes; prevents the config1 Phase 2b OOM (rc=137). + _set_memory_high_guard(pod) + + # Ensure fio is available — apt-get may have failed during DaemonSet init. + _pod_exec(pod, textwrap.dedent(""" + command -v fio >/dev/null 2>&1 || { + apt-get install -y -qq fio 2>/dev/null || true + } + """), ignore_failure=True, timeout=120) + + # Reclaim node memory BEFORE creating the test file. By this point Phase 2a + # has hard-swapped the node and Phase 3c's OpenSearch (which runs first) may + # have left a multi-GB JVM footprint; on a 30 GB node the file create then + # gets OOM-killed (rc=137) at the NODE level — which neither --direct=1 nor + # the cgroup memory.high guard can prevent (those are cgroup/page-cache + # tools, not node-eviction controls). Kill any leftover stressors/servers, + # flush dirty pages, and drop caches so the node starts Phase 2b clean. + _pod_exec(pod, textwrap.dedent(""" + pkill -9 stress-ng 2>/dev/null || true + pkill -9 -f 'opensearch|elasticsearch' 2>/dev/null || true + pkill -9 redis-server 2>/dev/null || true + sync + echo 3 > /proc/sys/vm/drop_caches 2>/dev/null || true + sleep 2 + echo "[pkb] pre-2b MemAvailable_kB=$(awk '/MemAvailable/{print $2}' /proc/meminfo) SwapFree_kB=$(awk '/SwapFree/{print $2}' /proc/meminfo)" + """), ignore_failure=True, timeout=60) + + # Create the test file on the persistent disk (see app_file note above). + # --direct=1 (O_DIRECT, ext4 supports it) bypasses the page cache. Size is + # kept at 4 GB (not 8) so the create + the concurrent background stressor + # cannot exhaust a 30 GB node even with swap already in use. + _pod_exec(pod, ( + f'fio --name=create --filename={app_file} ' + f'--rw=write --bs=1m --size=4G --verify=0 --direct=1' + ), timeout=600, ignore_failure=True) + + def _run_app_fio(pressure_label: str) -> list[sample.Sample]: + # --direct=1 (O_DIRECT) avoids page-cache buildup; ext4 on the persistent + # disk supports it. --size=4G matches the file created above. This + # measures the disk's I/O under swap pressure directly. + cmd = ( + f'fio --name=app_io --filename={app_file} ' + f'--ioengine=libaio --direct=1 ' + f'--rw=randrw --bs=4k --iodepth=32 --size=4G --verify=0 ' + f'--time_based --runtime=60s --output-format=json' + ) + # ignore_failure=True: fio rc=137 is expected when the pod is OOM-evicted + # under heavy swap pressure. _pod_exec handles recovery; callers rely on + # _parse_fio_json returning [] on empty/bad output rather than an exception. + out, _ = _pod_exec(pod, cmd, ignore_failure=True) + return _parse_fio_json( + out, 'app_io', f'App I/O ({pressure_label})', + dict(meta, pressure=pressure_label), + ) + + # 1. Baseline – no swap pressure + logging.info('[swap_encryption] I/O interference: baseline (no pressure)') + results += _run_app_fio('no_pressure') + + # 2. Under swap pressure + # Use nohup + disown so bash exits immediately after launching stress-ng; + # otherwise kubectl exec keeps the session alive until stress-ng finishes + # (300 s) and PKB's IssueCommand times out. + logging.info('[swap_encryption] I/O interference: under swap pressure') + # Confined background stressor: pages within a 60%-RAM cgroup so it can't + # OOM the concurrent app_io fio on a small node (see helper). + _launch_confined_bg_stress(pod, timeout, '/tmp/pkb_stress_io.log') + time.sleep(10) # let swap pressure build + results += _run_app_fio('with_swap_pressure') + + # Stop background stress-ng. If the pod was OOM-evicted while fio ran, + # stress-ng is already dead — kill is a no-op and we skip the long wait. + # _retries=0: no recovery here; the first Phase 3a command will recover + # the pod properly if needed (and it already waits for /tmp/pkb_ready). + _pod_exec(pod, 'pkill -9 stress-ng 2>/dev/null || true', + ignore_failure=True, _retries=0, timeout=15) + _reset_memory_high_guard(pod) + return results + + _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD From 6652ee70f6bfc9cd2280a30f339f15331bd346d3 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Wed, 24 Jun 2026 13:14:18 +0530 Subject: [PATCH 5/8] fix(swap_encryption/pr4): slim DaemonSet, duplicate FLAGS, EKS stub, formatting --- .../cluster/swap_encryption_daemonset.yaml.j2 | 210 +- .../swap_encryption_benchmark.py | 6020 +++++++++-------- 2 files changed, 3400 insertions(+), 2830 deletions(-) diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 index c40ec79dff..531ce522ae 100644 --- a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 +++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 @@ -28,127 +28,60 @@ spec: - bash - -c - | - echo "[pkb] Installing benchmark tools..." - # Retry apt-get up to 3 times — transient network failures are - # common on a freshly-started GKE node. Critical tools (fio, - # stress-ng) must be present before we write the ready sentinel; - # a silent || true here would cause /tmp/pkb_ready to appear even - # when tools are missing, breaking all subsequent phases. + echo "[pkb] Installing benchmark measurement tools..." + # Phase 1+2 tools: fio (raw-device I/O), stress-ng (CPU overhead), + # cryptsetup/mdadm (dm-crypt inspection), sysstat (vmstat/pidstat), + # nvme-cli (NVMe telemetry), cgroup-tools (cgroup v1 guard). + # Phase 3b tools: gcc/make/etc. (kernel build inside memory cap). + # Redis/memtier/esrally/opensearch are NOT installed here — + # those workloads run in separate PKB benchmark pods (Phase 3a, 3c) + # per Ajay review comment r3457826290. PKB_APT_OK=0 for _attempt in 1 2 3; do apt-get update -qq 2>&1 || true - DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\ - fio \\ - stress-ng \\ - sysstat \\ - cryptsetup \\ - mdadm \\ - redis-server \\ - redis-tools \\ - git \\ - wget \\ - curl \\ - make \\ - gcc \\ - bc \\ - flex \\ - bison \\ - libelf-dev \\ - libssl-dev \\ - cgroup-tools \\ - nvme-cli \\ - util-linux \\ - python3-pip \\ - libevent-dev \\ - libssl-dev \\ - libpcre3-dev \\ - zlib1g-dev \\ - build-essential \\ - autoconf \\ - automake \\ - libtool \\ - libtool-bin \\ - pkg-config \\ - python3-dev \\ - default-jre-headless \\ + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + fio \ + stress-ng \ + cryptsetup \ + mdadm \ + sysstat \ + nvme-cli \ + cgroup-tools \ + util-linux \ + gcc \ + make \ + bc \ + flex \ + bison \ + libelf-dev \ + libssl-dev \ 2>&1 && PKB_APT_OK=1 && break echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2 sleep 15 done - if [ "$PKB_APT_OK" != "1" ] || \\ - ! command -v fio >/dev/null 2>&1 || \\ + if [ "$PKB_APT_OK" != "1" ] || \ + ! command -v fio >/dev/null 2>&1 || \ ! command -v stress-ng >/dev/null 2>&1; then - echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2 + echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed" >&2 exit 1 fi - echo "[pkb] Installing memtier_benchmark from source..." - # Pin a stable release tag — building from the moving default - # branch (HEAD) intermittently broke (memtier_benchmark not found - # → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the - # version PKB's memtier package (memtier.MemtierResult.Parse) is - # validated against and builds cleanly with the apt deps above. - # Fall back to HEAD only if the tagged clone fails. - if ! command -v memtier_benchmark >/dev/null 2>&1; then - (cd /tmp && \\ - rm -rf memtier_benchmark && \\ - ( git clone --depth 1 --branch 2.2.1 \\ - https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\ - git clone --depth 1 \\ - https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\ - cd memtier_benchmark && \\ - autoreconf -ivf 2>&1 && \\ - ./configure 2>&1 && \\ - make -j$(nproc) 2>&1 && \\ - make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\ - echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used" - fi - if command -v memtier_benchmark >/dev/null 2>&1; then - echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)" - fi - echo "[pkb] Installing esrally (lightweight)..." - python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true - pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\ - pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\ - echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used" - if command -v esrally >/dev/null 2>&1; then - echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)" - else - echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2 - fi - echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..." - # Phase 3c needs a real search server on :9200. Nothing in apt - # ships one and the pod has no systemd, so install the OpenSearch - # bundle (ships its own JDK) and launch the binary directly in the - # phase. All best-effort: if any step fails the phase probes the - # endpoint and skips cleanly rather than recording fake timings. - if [ ! -x /opt/opensearch/bin/opensearch ]; then - OS_VER=2.15.0 - (cd /opt && \\ - wget -q --timeout=600 -O os.tgz \\ - "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\ - tar -xzf os.tgz && rm -f os.tgz && \\ - mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\ - echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2 - fi - if [ -x /opt/opensearch/bin/opensearch ]; then - # pkbos owns and runs OpenSearch (it refuses to run as root). - # Give it a home so HOME/temp paths are writable. - id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true - printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\ - > /opt/opensearch/config/opensearch.yml - mkdir -p /opt/opensearch/config/jvm.options.d - # 2 GB heap: 512 MB was too small and OpenSearch aborted early. - # On a 252 GB node this still leaves plenty of page cache to - # pressure into swap during the phase. - printf -- '-Xms2g\\n-Xmx2g\\n' \\ - > /opt/opensearch/config/jvm.options.d/pkb-heap.options - sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true - # CRITICAL: never run the binary as root here (it bails and - # leaves root-owned files in logs/ that block the pkbos server). - # Clear any stale logs and chown everything to pkbos LAST. - rm -f /opt/opensearch/logs/* 2>/dev/null || true - chown -R pkbos /opt/opensearch 2>/dev/null || true - echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)" + echo "[pkb] fio: $(fio --version 2>&1 | head -1)" + echo "[pkb] stress-ng: $(stress-ng --version 2>&1 | head -1)" + echo "[pkb] Verifying swap device is active..." + PKB_SWAP_FOUND=0 + for _attempt in $(seq 1 30); do + if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then + PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps) + echo "[pkb] Swap device active: $PKB_SWAP_DEV" + PKB_SWAP_FOUND=1 + break + fi + echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2 + sleep 5 + done + if [ "$PKB_SWAP_FOUND" != "1" ]; then + echo "[pkb] WARNING: no active swap device after 150s — " \ + "check linuxConfig.swapConfig / kubelet swap config." >&2 fi echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..." PKB_KVER="{{ kernel_version }}" @@ -158,62 +91,15 @@ spec: PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz" mkdir -p "$PKB_KROOT" if [ ! -f "$PKB_KTARBALL" ]; then - wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\ + wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \ echo "[pkb] WARNING: kernel tarball download failed" >&2 fi if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then - echo "[pkb] Extracting kernel source (xz)..." - tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\ + echo "[pkb] Extracting kernel source (xz, may take ~60 s)..." + tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \ echo "[pkb] WARNING: kernel source extraction failed" >&2 fi - echo "[pkb] Unlocking container cgroup swap limits..." - # GKE cgroup v2 sets memory.swap.max=0 per-container, which - # prevents swap usage even when the node has a swap device and - # vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because - # the kernel can't page out to swap for this cgroup. - # - # NOTE: the old approach derived the cgroup path from - # /proc/self/cgroup, but inside a cgroup namespace that reports - # "0::/" — so the write targeted the host ROOT cgroup, silently - # no-op'd, and swap stayed locked (the OOM-in-15s symptom above). - # /sys is the host cgroup tree (hostPath mount) and this pod is - # privileged, so instead unlock swap across the entire kubepods - # hierarchy, which is guaranteed to contain our own container. - if [ -d /sys/fs/cgroup/kubepods.slice ] || \ - [ -d /sys/fs/cgroup/kubepods ]; then - # cgroup v2: write 'max' to every memory.swap.max under kubepods*. - find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \ - 2>/dev/null | while read -r _f; do - echo max > "$_f" 2>/dev/null || true - done - fi - # Best-effort: our own namespaced path and the unified root. - PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \ - 2>/dev/null) - for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \ - /sys/fs/cgroup/memory.swap.max; do - [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; } - done - # cgroup v1 fallback: lift the combined RAM+swap hard ceiling. - find /sys/fs/cgroup/memory -path '*kubepods*' \ - -name memory.memsw.limit_in_bytes 2>/dev/null \ - | while read -r _f; do - echo -1 > "$_f" 2>/dev/null || true - done - # Verify and surface the result in the pod log. grep -L lists - # files that do NOT contain 'max' on their first line, i.e. ones - # still capping swap. - PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \ - -name memory.swap.max 2>/dev/null \ - | xargs -r grep -L '^max' 2>/dev/null | head -1) - if [ -n "$PKB_STILL_CAPPED" ]; then - echo "[pkb] WARNING: cgroup swap still capped at \ - $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \ - OOM-killed before swap is exercised" >&2 - else - echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)" - fi - echo "[pkb] Tools installed. Writing ready sentinel." + echo "[pkb] Benchmark tools ready. Writing ready sentinel." touch /tmp/pkb_ready sleep infinity securityContext: diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 5b32395e9c..ffcca8123a 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -84,11 +84,7 @@ # --------------------------------------------------------------------------- - -FLAGS = flags.FLAGS - - -BENCHMARK_NAME = 'swap_encryption' +BENCHMARK_NAME = "swap_encryption" BENCHMARK_CONFIG = """ @@ -118,318 +114,320 @@ _SWAP_DEVICE = flags.DEFINE_string( - 'swap_encryption_device', - '', - 'Explicit swap block-device path on the cluster node, e.g. ' - '/dev/nvme1n1 or /dev/dm-0. When empty the benchmark auto-detects ' - 'via /proc/swaps after setup.', + "swap_encryption_device", + "", + "Explicit swap block-device path on the cluster node, e.g. " + "/dev/nvme1n1 or /dev/dm-0. When empty the benchmark auto-detects " + "via /proc/swaps after setup.", ) _SWAP_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_swap_size_gb', + "swap_encryption_swap_size_gb", 32, - 'Size in GB of the swap space to configure on the node. ' - 'Ignored when a ready swap device already exists.', + "Size in GB of the swap space to configure on the node. " + "Ignored when a ready swap device already exists.", ) _SWAP_TYPE = flags.DEFINE_enum( - 'swap_encryption_swap_type', - 'auto', - ['auto', 'hyperdisk', 'lssd', 'boot_disk', 'instance_store', 'io2'], - 'Swap backing storage target, one per methodology test-matrix row:\n' - ' GKE: boot_disk (swap file on the OS boot disk — pd-balanced or ' - 'hyperdisk-balanced, chosen via --swap_encryption_boot_disk_type),\n' - ' hyperdisk (dedicated hyperdisk-balanced data disk),\n' - ' lssd (dedicated Local SSD RAID-0).\n' - ' AWS: instance_store (NVMe Instance Store, Nitro-encrypted),\n' - ' io2 (EBS io2 data/root volume).\n' - 'dm-crypt is applied on the GKE targets when ' - '--swap_encryption_enable_dmcrypt is set; AWS targets are encrypted by ' - 'Nitro at the hardware level. auto = detect from cloud + instance type.', + "swap_encryption_swap_type", + "auto", + ["auto", "hyperdisk", "lssd", "boot_disk", "instance_store", "io2"], + "Swap backing storage target, one per methodology test-matrix row:\n" + " GKE: boot_disk (swap file on the OS boot disk — pd-balanced or " + "hyperdisk-balanced, chosen via --swap_encryption_boot_disk_type),\n" + " hyperdisk (dedicated hyperdisk-balanced data disk),\n" + " lssd (dedicated Local SSD RAID-0).\n" + " AWS: instance_store (NVMe Instance Store, Nitro-encrypted),\n" + " io2 (EBS io2 data/root volume).\n" + "dm-crypt is applied on the GKE targets when " + "--swap_encryption_enable_dmcrypt is set; AWS targets are encrypted by " + "Nitro at the hardware level. auto = detect from cloud + instance type.", ) _FIO_RUNTIME_SEC = flags.DEFINE_integer( - 'swap_encryption_fio_runtime_sec', + "swap_encryption_fio_runtime_sec", 60, - 'Wall-clock runtime in seconds for each individual fio job.', + "Wall-clock runtime in seconds for each individual fio job.", ) _STRESS_TIMEOUT_SEC = flags.DEFINE_integer( - 'swap_encryption_stress_timeout_sec', + "swap_encryption_stress_timeout_sec", 300, - 'Duration in seconds of each stress-ng memory-pressure phase.', + "Duration in seconds of each stress-ng memory-pressure phase.", ) _STRESS_VM_BYTES = flags.DEFINE_string( - 'swap_encryption_stress_vm_bytes', - '28G', - 'Combined stress-ng working-set size (total in-flight footprint, not ' - 'per-worker). It is divided equally across --swap_encryption_stress_vm_' - 'workers before being passed to stress-ng, so the total memory touched ' - 'equals this value. Should exceed node RAM to force kernel swapping.', + "swap_encryption_stress_vm_bytes", + "28G", + "Combined stress-ng working-set size (total in-flight footprint, not " + "per-worker). It is divided equally across --swap_encryption_stress_vm_" + "workers before being passed to stress-ng, so the total memory touched " + "equals this value. Should exceed node RAM to force kernel swapping.", ) _STRESS_VM_BYTES_LIST = flags.DEFINE_string( - 'swap_encryption_stress_vm_bytes_list', - '', - 'Comma-separated list of stress-ng --vm-bytes values to iterate over ' + "swap_encryption_stress_vm_bytes_list", + "", + "Comma-separated list of stress-ng --vm-bytes values to iterate over " 'in Phase 2a CPU-overhead sweeps, e.g. "14G,21G,28G". When non-empty ' - 'this overrides --swap_encryption_stress_vm_bytes and Phase 2a is run ' - 'once per entry so that the swap-pressure intensity curve is captured.', + "this overrides --swap_encryption_stress_vm_bytes and Phase 2a is run " + "once per entry so that the swap-pressure intensity curve is captured.", ) _STRESS_VM_WORKERS = flags.DEFINE_integer( - 'swap_encryption_stress_vm_workers', + "swap_encryption_stress_vm_workers", 4, - 'Number of parallel stress-ng --vm workers for Phase 2a. The total ' - 'working set (the autoscaled vm_bytes) is divided equally across workers, ' - 'so the combined footprint stays under RAM+swap (no OOM) while exceeding ' - 'RAM (forcing swap). Multiple workers are needed for fill speed — a ' - 'single write64 worker cannot dirty enough memory within the timeout to ' - 'reach RAM (run swap1: ~184 GB resident, no swap). To stop the N ' - 'workers\' resident sets from collapsing to one worker\'s share, the ' - 'stressor uses random access (rand-set) and disables KSM page-merging ' - '(without those, identical write64 pages across workers were merged, ' - 'leaving only ~vm_bytes/N resident and swap_out ~0).', + "Number of parallel stress-ng --vm workers for Phase 2a. The total " + "working set (the autoscaled vm_bytes) is divided equally across workers, " + "so the combined footprint stays under RAM+swap (no OOM) while exceeding " + "RAM (forcing swap). Multiple workers are needed for fill speed — a " + "single write64 worker cannot dirty enough memory within the timeout to " + "reach RAM (run swap1: ~184 GB resident, no swap). To stop the N " + "workers' resident sets from collapsing to one worker's share, the " + "stressor uses random access (rand-set) and disables KSM page-merging " + "(without those, identical write64 pages across workers were merged, " + "leaving only ~vm_bytes/N resident and swap_out ~0).", ) _ENABLE_ZSWAP = flags.DEFINE_boolean( - 'swap_encryption_enable_zswap', + "swap_encryption_enable_zswap", False, - 'Enable zswap (lz4 compressor, 20%% max pool) before running tests.', + "Enable zswap (lz4 compressor, 20%% max pool) before running tests.", ) _MIN_FREE_KBYTES = flags.DEFINE_integer( - 'swap_encryption_min_free_kbytes', + "swap_encryption_min_free_kbytes", 65536, - 'Value written to /proc/sys/vm/min_free_kbytes to trigger earlier ' - 'swapping. Set 0 to leave the kernel default unchanged.', + "Value written to /proc/sys/vm/min_free_kbytes to trigger earlier " + "swapping. Set 0 to leave the kernel default unchanged.", ) _DAEMONSET_IMAGE = flags.DEFINE_string( - 'swap_encryption_daemonset_image', - 'ubuntu:22.04', - 'Container image used for the privileged benchmark DaemonSet pod.', + "swap_encryption_daemonset_image", + "ubuntu:22.04", + "Container image used for the privileged benchmark DaemonSet pod.", ) _NODEPOOL = flags.DEFINE_string( - 'swap_encryption_nodepool', - 'benchmark', - 'Name of the node pool to deploy the benchmark DaemonSet on.', + "swap_encryption_nodepool", + "benchmark", + "Name of the node pool to deploy the benchmark DaemonSet on.", ) _INSTANCE_SIZE_LABEL = flags.DEFINE_string( - 'swap_encryption_instance_size_label', - '', - 'Human-readable label for the current instance size being tested, e.g. ' + "swap_encryption_instance_size_label", + "", + "Human-readable label for the current instance size being tested, e.g. " '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that ' - 'results from multiple PKB runs across different instance sizes can be ' - 'collated and compared. Defaults to the value reported by the cloud ' - 'metadata endpoint inside the pod.', + "results from multiple PKB runs across different instance sizes can be " + "collated and compared. Defaults to the value reported by the cloud " + "metadata endpoint inside the pod.", ) _COLLECT_COST = flags.DEFINE_boolean( - 'swap_encryption_collect_cost', + "swap_encryption_collect_cost", False, - 'When True, emit a cost_estimate_usd sample using on-demand pricing ' - 'for the instance type detected at runtime.', + "When True, emit a cost_estimate_usd sample using on-demand pricing " + "for the instance type detected at runtime.", ) _IO2_ENCRYPTED = flags.DEFINE_boolean( - 'swap_encryption_io2_encrypted', + "swap_encryption_io2_encrypted", True, - 'When True (default), the dedicated io2 swap volume is created with EBS ' + "When True (default), the dedicated io2 swap volume is created with EBS " 'encryption (Nitro/KMS) -> matrix row "io2 + hardware encryption". ' - 'Set False for the unencrypted io2 baseline row. Only applies when ' - '--swap_encryption_swap_type=io2 on AWS/EKS.', + "Set False for the unencrypted io2 baseline row. Only applies when " + "--swap_encryption_swap_type=io2 on AWS/EKS.", ) _IO2_KMS_KEY_ID = flags.DEFINE_string( - 'swap_encryption_io2_kms_key_id', - '', - 'Optional KMS key id/ARN for the encrypted io2 volume. Empty = the ' - 'account default aws/ebs key. Ignored unless io2_encrypted is True.', + "swap_encryption_io2_kms_key_id", + "", + "Optional KMS key id/ARN for the encrypted io2 volume. Empty = the " + "account default aws/ebs key. Ignored unless io2_encrypted is True.", ) _FAIL_ON_DEGRADED = flags.DEFINE_boolean( - 'swap_encryption_fail_on_degraded', + "swap_encryption_fail_on_degraded", True, - 'When True (default), raise an error at the end of Run() if the run was ' - 'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and ' - 'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng ' - 'swap-pressure phase was OOM-killed before completing. This prevents PKB ' - 'from reporting SUCCEEDED for a run whose post-eviction phases produced ' - 'empty or meaningless data. Set False to keep the legacy behaviour of ' - 'always returning whatever partial samples were collected.', + "When True (default), raise an error at the end of Run() if the run was " + "catastrophically degraded — e.g. the benchmark pod was OOM-evicted and " + "replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng " + "swap-pressure phase was OOM-killed before completing. This prevents PKB " + "from reporting SUCCEEDED for a run whose post-eviction phases produced " + "empty or meaningless data. Set False to keep the legacy behaviour of " + "always returning whatever partial samples were collected.", ) _PHASES = flags.DEFINE_list( - 'swap_encryption_phases', - ['all'], - 'Which Run() phases to execute, for fast iteration against an ' - 'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). ' - 'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng ' - 'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), ' + "swap_encryption_phases", + ["all"], + "Which Run() phases to execute, for fast iteration against an " + "already-provisioned cluster (e.g. --run_stage=run --run_uri=...). " + "Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng " + "CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), " '3b (kernel build), 3c (opensearch). Default "all" runs everything. ' - 'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. ' - 'Phases not listed are skipped and do not affect the degraded-run gate ' + "Example: --swap_encryption_phases=2a runs only the swap-pressure phase. " + "Phases not listed are skipped and do not affect the degraded-run gate " '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").', ) _MIN_SWAP_OUT_PAGES = flags.DEFINE_integer( - 'swap_encryption_min_swap_out_pages', + "swap_encryption_min_swap_out_pages", 1000, - 'Minimum peak swap-out rate (pages/s) that Phase 2a must reach for the run ' - 'to count as a real swap-encryption measurement. Below this the working ' - 'set never meaningfully paged (e.g. run swap1 peaked at 176 pages/s of ' - 'noise yet "passed" the old zero-only gate), so the dm-crypt overhead is ' - 'hollow and the run is flagged degraded. A genuinely swapping run peaks in ' - 'the tens-to-hundreds of thousands of pages/s. Set 0 to accept any ' - 'non-zero swap-out (legacy behaviour).', + "Minimum peak swap-out rate (pages/s) that Phase 2a must reach for the run" + " to count as a real swap-encryption measurement. Below this the working" + " set never meaningfully paged (e.g. run swap1 peaked at 176 pages/s of" + ' noise yet "passed" the old zero-only gate), so the dm-crypt overhead is' + " hollow and the run is flagged degraded. A genuinely swapping run peaks" + " in the tens-to-hundreds of thousands of pages/s. Set 0 to accept any" + " non-zero swap-out (legacy behaviour).", ) _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( - 'swap_encryption_benchmark_machine_type', - 'n4-highmem-32', - 'Machine type for the benchmark nodepool created in Prepare(). ' - 'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd ' - '(LSSD RAID-0). The matching swap setup is selected automatically.', + "swap_encryption_benchmark_machine_type", + "n4-highmem-32", + "Machine type for the benchmark nodepool created in Prepare(). " + "Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd " + "(LSSD RAID-0). The matching swap setup is selected automatically.", ) _BENCHMARK_LSSD = flags.DEFINE_boolean( - 'swap_encryption_lssd', + "swap_encryption_lssd", False, - 'Force LSSD RAID-0 swap path even when the machine type name does not ' + "Force LSSD RAID-0 swap path even when the machine type name does not " 'contain "lssd". Auto-detected from machine type when False.', ) _LSSD_COUNT = flags.DEFINE_integer( - 'swap_encryption_lssd_count', + "swap_encryption_lssd_count", 1, - 'Number of local NVMe SSDs to attach as raw block devices ' - '(--local-nvme-ssd-block count=N). Must match the fixed local SSD ' - 'count for the chosen machine type: c4-standard-8-lssd=1, ' - 'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). ' - 'Default 1 covers most single-lssd machine types.', + "Number of local NVMe SSDs to attach as raw block devices " + "(--local-nvme-ssd-block count=N). Must match the fixed local SSD " + "count for the chosen machine type: c4-standard-8-lssd=1, " + "c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). " + "Default 1 covers most single-lssd machine types.", ) _ENABLE_DMCRYPT = flags.DEFINE_boolean( - 'swap_encryption_enable_dmcrypt', + "swap_encryption_enable_dmcrypt", True, - 'When True (default), configure dm-crypt on the swap device — the ' + "When True (default), configure dm-crypt on the swap device — the " '"encryption enabled" column of the test matrix. Set False to use ' - 'plain swap (encryption disabled column).', + "plain swap (encryption disabled column).", ) _NODE_IMAGE_TYPE = flags.DEFINE_string( - 'swap_encryption_node_image_type', - 'UBUNTU_CONTAINERD', - 'GKE node image type for the benchmark nodepool. ' - 'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks ' - 'down device-mapper at the kernel LSM level and cryptsetup hangs ' - 'indefinitely from any pod context (even privileged, even via nsenter ' - 'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup ' - 'from privileged pods without restriction. ' - 'Use COS_CONTAINERD only when dm-crypt is disabled ' - '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. ' - 'AL2 on EKS.', + "swap_encryption_node_image_type", + "UBUNTU_CONTAINERD", + "GKE node image type for the benchmark nodepool. " + "UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks " + "down device-mapper at the kernel LSM level and cryptsetup hangs " + "indefinitely from any pod context (even privileged, even via nsenter " + "into the host mount namespace). Ubuntu GKE nodes allow cryptsetup " + "from privileged pods without restriction. " + "Use COS_CONTAINERD only when dm-crypt is disabled " + "(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. " + "AL2 on EKS.", ) _BOOT_DISK_TYPE = flags.DEFINE_string( - 'swap_encryption_boot_disk_type', - 'hyperdisk-balanced', - 'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced ' - 'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 ' - 'dev/test machines, which do not support hyperdisk-balanced.', + "swap_encryption_boot_disk_type", + "hyperdisk-balanced", + "Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced " + "for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 " + "dev/test machines, which do not support hyperdisk-balanced.", ) _BOOT_DISK_IOPS = flags.DEFINE_integer( - 'swap_encryption_boot_disk_iops', + "swap_encryption_boot_disk_iops", 80000, - 'Provisioned IOPS for the boot disk (hyperdisk-balanced only). ' - '80 000 is the COS max-IOPS target. Ignored for pd-ssd.', + "Provisioned IOPS for the boot disk (hyperdisk-balanced only). " + "80 000 is the COS max-IOPS target. Ignored for pd-ssd.", ) _BOOT_DISK_THROUGHPUT = flags.DEFINE_integer( - 'swap_encryption_boot_disk_throughput', + "swap_encryption_boot_disk_throughput", 1200, - 'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced ' - 'only). Must be set together with iops. 1200 MB/s pairs with 80 000 ' - 'IOPS for production; use 140 (minimum) for dev/test. Ignored for ' - 'pd-ssd.', + "Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced " + "only). Must be set together with iops. 1200 MB/s pairs with 80 000 " + "IOPS for production; use 140 (minimum) for dev/test. Ignored for " + "pd-ssd.", ) _BOOT_DISK_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_boot_disk_size_gb', + "swap_encryption_boot_disk_size_gb", 500, - 'Boot disk size in GiB for the benchmark nodepool. 500 GiB is ' - 'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run ' - '(see Engineer Assignments table in execution-plan.md). ' - 'For LSSD configs the boot disk is smaller; 100 GiB is fine.', + "Boot disk size in GiB for the benchmark nodepool. 500 GiB is " + "required for the n4-highmem-32 + hyperdisk-balanced Config 2 run " + "(see Engineer Assignments table in execution-plan.md). " + "For LSSD configs the boot disk is smaller; 100 GiB is fine.", ) _ADD_SWAP_DISK = flags.DEFINE_boolean( - 'swap_encryption_add_swap_disk', + "swap_encryption_add_swap_disk", False, - 'Attach a dedicated second disk to the benchmark nodepool for use as ' - 'the swap device. Required for dm-crypt measurement on single-boot-disk ' - 'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper ' - 'from pod namespaces. The second disk is provisioned via ' - '--additional-node-disk using the same type/IOPS/throughput as the boot ' - 'disk flags.', + "Attach a dedicated second disk to the benchmark nodepool for use as " + "the swap device. Required for dm-crypt measurement on single-boot-disk " + "machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper " + "from pod namespaces. The second disk is provisioned via " + "--additional-node-disk using the same type/IOPS/throughput as the boot " + "disk flags.", ) _SWAP_DISK_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_swap_disk_size_gb', + "swap_encryption_swap_disk_size_gb", 500, - 'Size in GiB of the dedicated swap disk when ' - '--swap_encryption_add_swap_disk is True. Must satisfy the ' - 'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.', + "Size in GiB of the dedicated swap disk when " + "--swap_encryption_add_swap_disk is True. Must satisfy the " + "hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.", ) -_DS_NAME = 'pkb-swap-benchmark' +_DS_NAME = "pkb-swap-benchmark" -_DS_NAMESPACE = 'default' +_DS_NAMESPACE = "default" -_DS_LABEL = 'pkb-swap-benchmark' +_DS_LABEL = "pkb-swap-benchmark" _active_pod: list[str] = [] # single-element list so closures can mutate it -_stress_vm_method: list[str] = [] # single-element list; '' means no --vm-method flag +_stress_vm_method: list[str] = ( + [] +) # single-element list; '' means no --vm-method flag _degraded_reasons: list[str] = [] @@ -441,145 +439,157 @@ _oom_events: list[str] = [] -_BENCHMARK_NODEPOOL = 'benchmark' +_BENCHMARK_NODEPOOL = "benchmark" -_DEFAULT_NODEPOOL = 'default-pool' +_DEFAULT_NODEPOOL = "default-pool" _FIO_JOBS = ( - ('rand_write_iops', 'randwrite', '4k', 256, 'Random write IOPS'), - ('rand_read_iops', 'randread', '4k', 256, 'Random read IOPS'), - ('rand_rw_mixed', 'randrw', '4k', 256, 'Mixed random R/W (50/50)'), - ('seq_write_bw', 'write', '1m', 64, 'Sequential write bandwidth'), - ('seq_read_bw', 'read', '1m', 64, 'Sequential read bandwidth'), - ('lat_write', 'randwrite', '4k', 1, 'Random write latency'), - ('lat_read', 'randread', '4k', 1, 'Random read latency'), + ("rand_write_iops", "randwrite", "4k", 256, "Random write IOPS"), + ("rand_read_iops", "randread", "4k", 256, "Random read IOPS"), + ("rand_rw_mixed", "randrw", "4k", 256, "Mixed random R/W (50/50)"), + ("seq_write_bw", "write", "1m", 64, "Sequential write bandwidth"), + ("seq_read_bw", "read", "1m", 64, "Sequential read bandwidth"), + ("lat_write", "randwrite", "4k", 1, "Random write latency"), + ("lat_read", "randread", "4k", 1, "Random read latency"), ) -_VMSTAT_LOG = '/tmp/pkb_vmstat.log' +_VMSTAT_LOG = "/tmp/pkb_vmstat.log" -_PIDSTAT_LOG = '/tmp/pkb_pidstat.log' +_PIDSTAT_LOG = "/tmp/pkb_pidstat.log" -_CRYPTO_PROCS = ('kswapd', 'kworker', 'kcryptd', 'dmcrypt_write') +_CRYPTO_PROCS = ("kswapd", "kworker", "kcryptd", "dmcrypt_write") def _daemonset_yaml(image: str) -> str: - """Render the privileged benchmark DaemonSet manifest. - - The manifest is a PKB data file rendered with Jinja2 - (data/cluster/swap_encryption_daemonset.yaml.j2) rather than an inline - string, per PKB conventions. The DaemonSet is pinned to the benchmark - nodepool via nodeSelector so it never lands on the dummy default pool. - """ - return vm_util.ReadAndRenderJinja2Template( - 'cluster/swap_encryption_daemonset.yaml.j2', - ds_name=_DS_NAME, - ds_namespace=_DS_NAMESPACE, - ds_label=_DS_LABEL, - benchmark_nodepool=_BENCHMARK_NODEPOOL, - image=image, - kernel_version=_KERNEL_VERSION.value, - ) + """Render the privileged benchmark DaemonSet manifest. + + The manifest is a PKB data file rendered with Jinja2 + (data/cluster/swap_encryption_daemonset.yaml.j2) rather than an inline + string, per PKB conventions. The DaemonSet is pinned to the benchmark + nodepool via nodeSelector so it never lands on the dummy default pool. + """ + return vm_util.ReadAndRenderJinja2Template( + "cluster/swap_encryption_daemonset.yaml.j2", + ds_name=_DS_NAME, + ds_namespace=_DS_NAMESPACE, + ds_label=_DS_LABEL, + benchmark_nodepool=_BENCHMARK_NODEPOOL, + image=image, + kernel_version=_KERNEL_VERSION.value, + ) def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: - return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) def Prepare(spec) -> None: - """Two-step nodepool setup then DaemonSet deployment. - - Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap - e2-medium default nodepool. - - Step 2 (this function): - a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with - COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures - dm-crypt swap at the OS level — before any pod is scheduled. - b. Delete the dummy default nodepool to stop its cost immediately. - c. Deploy the privileged DaemonSet (pinned via nodeSelector to the - benchmark nodepool) and wait for tools to install. - """ - cluster = spec.container_cluster - - # ── Step 2a: add real benchmark nodepool ──────────────────────────────── - if getattr(cluster, 'project', None): - # GCP path: true two-step nodepool setup - logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') - _create_benchmark_node_pool(cluster) - - # ── Step 2b: wait for the benchmark node to join and be Ready ───────── - logging.info('[swap_encryption] Step 2b: waiting for benchmark node') - _wait_for_benchmark_node() - - # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── - # --additional-node-disk is not available in all gcloud versions, so we - # create + attach the disk after the node is up using gcloud compute. - if _ADD_SWAP_DISK.value: - logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk') - _attach_swap_disk(cluster) - else: - # AWS / EKS: nodepool management is external. PKB's cluster creation - # labels nodes pkb_nodepool=default, so re-label all existing nodes here - # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark). - logging.info( - '[swap_encryption] EKS cluster — labelling existing nodes with ' - 'pkb_nodepool=%s so the DaemonSet nodeSelector matches.', - _BENCHMARK_NODEPOOL) - kubectl.RunKubectlCommand([ - 'label', 'nodes', '--all', '--overwrite', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - ]) - # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs - # on io2 hardware-encrypted storage (no-op unless swap_type=io2). - _ensure_io2_volume() - - # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── - # Deploy and wait for the pod BEFORE deleting the default nodepool. - # Deleting the default pool while the benchmark node is still joining causes - # a temporary API server i/o timeout (control plane busy with two nodepool - # ops simultaneously). Once the pod is Running the cluster is fully stable. - logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet') - _deploy_daemonset() - - pod = _wait_for_benchmark_pod() - logging.info('[swap_encryption] Benchmark pod ready: %s', pod) - - # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── - if getattr(cluster, 'project', None): - logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool') - _delete_default_node_pool(cluster) - # The DaemonSet pod may be evicted and rescheduled with a new name during - # the nodepool deletion (cluster control plane briefly interrupts pod - # lifecycle). Re-resolve the pod name to avoid stale-reference errors on - # all subsequent _pod_exec calls. - logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod ' - 'after nodepool deletion') + """Two-step nodepool setup then DaemonSet deployment. + + Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap + e2-medium default nodepool. + + Step 2 (this function): + a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with + COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures + dm-crypt swap at the OS level — before any pod is scheduled. + b. Delete the dummy default nodepool to stop its cost immediately. + c. Deploy the privileged DaemonSet (pinned via nodeSelector to the + benchmark nodepool) and wait for tools to install. + """ + cluster = spec.container_cluster + + # ── Step 2a: add real benchmark nodepool ──────────────────────────────── + if getattr(cluster, "project", None): + # GCP path: true two-step nodepool setup + logging.info("[swap_encryption] Step 2a: creating benchmark nodepool") + _create_benchmark_node_pool(cluster) + + # ── Step 2b: wait for the benchmark node to join and be Ready ───────── + logging.info("[swap_encryption] Step 2b: waiting for benchmark node") + _wait_for_benchmark_node() + + # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── + # --additional-node-disk is not available in all gcloud versions, so we + # create + attach the disk after the node is up using gcloud compute. + if _ADD_SWAP_DISK.value: + logging.info( + "[swap_encryption] Step 2b2: attaching dedicated swap disk" + ) + _attach_swap_disk(cluster) + else: + # AWS / EKS: nodepool management is external. PKB's cluster creation + # labels nodes pkb_nodepool=default, so re-label all existing nodes here + # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark). + logging.info( + "[swap_encryption] EKS cluster — labelling existing nodes with " + "pkb_nodepool=%s so the DaemonSet nodeSelector matches.", + _BENCHMARK_NODEPOOL, + ) + kubectl.RunKubectlCommand([ + "label", + "nodes", + "--all", + "--overwrite", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + ]) + # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs + # on io2 hardware-encrypted storage (no-op unless swap_type=io2). + _ensure_io2_volume() + + # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── + # Deploy and wait for the pod BEFORE deleting the default nodepool. + # Deleting the default pool while the benchmark node is still joining causes + # a temporary API server i/o timeout (control plane busy with two nodepool + # ops simultaneously). Once the pod is Running the cluster is fully stable. + logging.info("[swap_encryption] Step 2c: deploying privileged DaemonSet") + _deploy_daemonset() + pod = _wait_for_benchmark_pod() - logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod) - - # Tune kernel swap aggressiveness. - # vm.swappiness=100 (maximum): GKE nodes default to 0 (avoid swap, prefer - # OOM-kill). At 60 the kernel still under-swapped on n4-highmem-32 — under - # cgroup-level memory pressure with ~160 GB node RAM free it would leave - # anonymous pages resident and record swap_out ~0 (run bb4a782d), making the - # result non-deterministic. 100 maximally biases the kernel toward paging - # anonymous pages out to the (encrypted) swap device, which is exactly the - # path this benchmark is meant to exercise. - _pod_exec(pod, 'sysctl -w vm.swappiness=100', ignore_failure=True) - if _MIN_FREE_KBYTES.value > 0: - _pod_exec(pod, f'sysctl -w vm.min_free_kbytes={_MIN_FREE_KBYTES.value}') - - # Unlock container cgroup swap. - # GKE cgroup v2 sets memory.swap.max=0 per-container even when the node has - # a swap device. This blocks swap for the container regardless of - # vm.swappiness. Stress-ng gets OOM-killed in ~15s because the kernel can - # page out for this cgroup. Set 'max' so the container can use all swap. - _pod_exec(pod, textwrap.dedent(""" + logging.info("[swap_encryption] Benchmark pod ready: %s", pod) + + # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── + if getattr(cluster, "project", None): + logging.info( + "[swap_encryption] Step 2d: deleting dummy default nodepool" + ) + _delete_default_node_pool(cluster) + # The DaemonSet pod may be evicted and rescheduled with a new name during + # the nodepool deletion (cluster control plane briefly interrupts pod + # lifecycle). Re-resolve the pod name to avoid stale-reference errors on + # all subsequent _pod_exec calls. + logging.info( + "[swap_encryption] Step 2d: re-resolving benchmark pod " + "after nodepool deletion" + ) + pod = _wait_for_benchmark_pod() + logging.info("[swap_encryption] Benchmark pod (post-deletion): %s", pod) + + # Tune kernel swap aggressiveness. + # vm.swappiness=100 (maximum): GKE nodes default to 0 (avoid swap, prefer + # OOM-kill). At 60 the kernel still under-swapped on n4-highmem-32 — under + # cgroup-level memory pressure with ~160 GB node RAM free it would leave + # anonymous pages resident and record swap_out ~0 (run bb4a782d), making the + # result non-deterministic. 100 maximally biases the kernel toward paging + # anonymous pages out to the (encrypted) swap device, which is exactly the + # path this benchmark is meant to exercise. + _pod_exec(pod, "sysctl -w vm.swappiness=100", ignore_failure=True) + if _MIN_FREE_KBYTES.value > 0: + _pod_exec(pod, f"sysctl -w vm.min_free_kbytes={_MIN_FREE_KBYTES.value}") + + # Unlock container cgroup swap. + # GKE cgroup v2 sets memory.swap.max=0 per-container even when the node has + # a swap device. This blocks swap for the container regardless of + # vm.swappiness. Stress-ng gets OOM-killed in ~15s because the kernel can + # page out for this cgroup. Set 'max' so the container can use all swap. + _pod_exec( + pod, + textwrap.dedent(""" PKB_CG=$(awk -F: '/^0::/{print $3; exit}' /proc/self/cgroup 2>/dev/null) if [ -n "$PKB_CG" ] && [ -f "/sys/fs/cgroup${PKB_CG}/memory.swap.max" ]; then echo max > "/sys/fs/cgroup${PKB_CG}/memory.swap.max" 2>/dev/null || true @@ -590,196 +600,231 @@ def Prepare(spec) -> None: echo -1 > "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" \ 2>/dev/null || true fi - """), ignore_failure=True) - - # Enable zswap if requested - if _ENABLE_ZSWAP.value: - _enable_zswap(pod) + """), + ignore_failure=True, + ) - # Configure cloud-specific swap - cloud = _detect_cloud(pod) - logging.info('[swap_encryption] Detected cloud: %s', cloud) + # Enable zswap if requested + if _ENABLE_ZSWAP.value: + _enable_zswap(pod) - if cloud == 'gcp': - _setup_gke_swap(pod) - elif cloud == 'aws': - _setup_eks_swap(pod) - else: - logging.warning( - '[swap_encryption] Unknown cloud – falling back to plain swapfile' - ) - _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + # Configure cloud-specific swap + cloud = _detect_cloud(pod) + logging.info("[swap_encryption] Detected cloud: %s", cloud) + if cloud == "gcp": + _setup_gke_swap(pod) + elif cloud == "aws": + _setup_eks_swap(pod) + else: + logging.warning( + "[swap_encryption] Unknown cloud – falling back to plain swapfile" + ) + _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) def _phase_selected(token: str) -> bool: - """Return True if phase `token` should run given --swap_encryption_phases. + """Return True if phase `token` should run given --swap_encryption_phases. - 'all' (the default) selects every phase. Otherwise only the comma-separated - tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. - """ - selected = [p.strip().lower() for p in _PHASES.value if p.strip()] - return (not selected) or ('all' in selected) or (token.lower() in selected) + 'all' (the default) selects every phase. Otherwise only the comma-separated + tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. + """ + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ("all" in selected) or (token.lower() in selected) def Run(spec) -> list[sample.Sample]: - """Execute all benchmark phases with gate logic. - - Execution is structured in three gated tiers matching the execution plan: - - Tier 1 (Gate 1) — fio microbenchmarks - Raw I/O ceiling of the swap device. Gate 1 fails if fio produces - zero samples (device not found, O_DIRECT error, etc.). - - Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference - Requires an active swap device (Gate 1 must pass). Gate 2 fails if - stress-ng does not complete within timeout. - - Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) - Independent of Tier 2 results; always attempted if Gate 1 passed. - Individual workload failures are logged but do not abort the others. - - If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring - application-level swap performance when the raw device is inaccessible. - """ - pod = _wait_for_benchmark_pod() - # Initialise the module-level active-pod tracker so _pod_exec and - # _recover_pod can transparently redirect to a replacement pod if the - # original is evicted during the run. - _active_pod.clear() - _active_pod.append(pod) - _degraded_reasons.clear() - _pod_lost.clear() - _oom_events.clear() - original_pod = pod - swap_dev = _detect_swap_device(pod) - base_meta = _build_metadata(pod, swap_dev) - results: list[sample.Sample] = [] - t_run_start = time.time() - - logging.info('[swap_encryption] swap device: %s', swap_dev) - - # ── Tier 1 / Gate 1: fio microbenchmarks ───────────────────────────────── - tier1_results = [] - if _phase_selected('fio'): - logging.info( - '[swap_encryption] ── Tier 1 / Gate 1: fio microbenchmarks ──') - try: - tier1_results = _phase1_fio(pod, swap_dev, base_meta) - results += tier1_results - except Exception as e: # pylint: disable=broad-except - logging.error('[swap_encryption] Gate 1 FAILED — fio phase error: %s', e) - logging.error('[swap_encryption] Skipping Tiers 2 and 3 (no swap device)') - return results - - if not tier1_results: - logging.warning('[swap_encryption] Gate 1 produced no samples ' - '(loop-device skip or parse error) — ' - 'continuing to Tier 2 with caution') - else: - logging.info('[swap_encryption] Skipping Tier 1 (fio) — not selected by ' - '--swap_encryption_phases=%s', ','.join(_PHASES.value)) - - # ── Tier 2 / Gate 2: stress-ng CPU overhead + I/O interference ─────────── - if _phase_selected('2a') or _phase_selected('2b'): - logging.info('[swap_encryption] ── Tier 2 / Gate 2: stress-ng phases ──') - try: - if _phase_selected('2a'): - logging.info('[swap_encryption] Phase 2a: CPU overhead') - results += _phase2a_cpu_overhead(pod, base_meta) - if _phase_selected('2b'): - logging.info('[swap_encryption] Phase 2b: I/O interference') - results += _phase2b_io_interference(pod, base_meta) - except Exception as e: # pylint: disable=broad-except - logging.error('[swap_encryption] Gate 2 FAILED — stress phase error: %s', - e) - logging.warning('[swap_encryption] Proceeding to Tier 3 (workloads are ' - 'independent of stress-ng results)') - - # ── Cost estimate ───────────────────────────────────────────────────────── - if _COLLECT_COST.value: - elapsed = time.time() - t_run_start - results += _collect_cost_sample(pod, elapsed, base_meta) - - # ── Final degradation gate ──────────────────────────────────────────────── - # The phase try/except blocks above keep the run alive so partial data is - # still collected, but that means a catastrophic failure (pod OOM-evicted - # mid-run, no fio data, stress-ng killed before it could drive swap I/O) - # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. - # Detect those conditions here and surface them explicitly. - if _active_pod and _active_pod[0] != original_pod: - _degraded_reasons.append( - f'benchmark pod was replaced during the run ' - f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap ' - f'pressure; phases executed after the eviction ran against a ' - f'freshly-initialised pod (empty /tmp, swap re-setup) and may be ' - f'invalid') - if _pod_lost: - _degraded_reasons.append( - f'benchmark pod(s) went NotFound during the run ({", ".join(_pod_lost)}) ' - f'— the pod died (node memory-pressure eviction or container exit) and ' - f'any phase running at or after that point (e.g. kernel-build baseline, ' - f'OpenSearch) produced invalid data') - if _oom_events: - _degraded_reasons.append( - f'OOM kill(s) (rc=137) occurred during the run on pod(s) ' - f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by ' - f'the OOM killer (the container may have restarted in place), so the ' - f'affected phase(s) produced no or partial data') - - if _phase_selected('fio') and not tier1_results: - if swap_dev.startswith('/dev/loop'): - # Expected: COS blocks device-mapper from pod namespaces on single-disk - # nodes (n2/n4 without --swap_encryption_add_swap_disk or lssd). - # Tier 2/3 results are still valid; do NOT mark the run as degraded. - logging.warning( - '[swap_encryption] Gate 1 (fio) skipped — loop device %s has no ' - 'dm-crypt support from inside a pod. Tier 2/3 results are valid. ' - 'Use c4-*-lssd or --swap_encryption_add_swap_disk for fio data.', - swap_dev) + """Execute all benchmark phases with gate logic. + + Execution is structured in three gated tiers matching the execution plan: + + Tier 1 (Gate 1) — fio microbenchmarks + Raw I/O ceiling of the swap device. Gate 1 fails if fio produces + zero samples (device not found, O_DIRECT error, etc.). + + Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference + Requires an active swap device (Gate 1 must pass). Gate 2 fails if + stress-ng does not complete within timeout. + + Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) + Independent of Tier 2 results; always attempted if Gate 1 passed. + Individual workload failures are logged but do not abort the others. + + If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring + application-level swap performance when the raw device is inaccessible. + """ + pod = _wait_for_benchmark_pod() + # Initialise the module-level active-pod tracker so _pod_exec and + # _recover_pod can transparently redirect to a replacement pod if the + # original is evicted during the run. + _active_pod.clear() + _active_pod.append(pod) + _degraded_reasons.clear() + _pod_lost.clear() + _oom_events.clear() + original_pod = pod + swap_dev = _detect_swap_device(pod) + base_meta = _build_metadata(pod, swap_dev) + results: list[sample.Sample] = [] + t_run_start = time.time() + + logging.info("[swap_encryption] swap device: %s", swap_dev) + + # ── Tier 1 / Gate 1: fio microbenchmarks ───────────────────────────────── + tier1_results = [] + if _phase_selected("fio"): + logging.info( + "[swap_encryption] ── Tier 1 / Gate 1: fio microbenchmarks ──" + ) + try: + tier1_results = _phase1_fio(pod, swap_dev, base_meta) + results += tier1_results + except Exception as e: # pylint: disable=broad-except + logging.error( + "[swap_encryption] Gate 1 FAILED — fio phase error: %s", e + ) + logging.error( + "[swap_encryption] Skipping Tiers 2 and 3 (no swap device)" + ) + return results + + if not tier1_results: + logging.warning( + "[swap_encryption] Gate 1 produced no samples " + "(loop-device skip or parse error) — " + "continuing to Tier 2 with caution" + ) + else: + logging.info( + "[swap_encryption] Skipping Tier 1 (fio) — not selected by " + "--swap_encryption_phases=%s", + ",".join(_PHASES.value), + ) + + # ── Tier 2 / Gate 2: stress-ng CPU overhead + I/O interference ─────────── + if _phase_selected("2a") or _phase_selected("2b"): + logging.info( + "[swap_encryption] ── Tier 2 / Gate 2: stress-ng phases ──" + ) + try: + if _phase_selected("2a"): + logging.info("[swap_encryption] Phase 2a: CPU overhead") + results += _phase2a_cpu_overhead(pod, base_meta) + if _phase_selected("2b"): + logging.info("[swap_encryption] Phase 2b: I/O interference") + results += _phase2b_io_interference(pod, base_meta) + except Exception as e: # pylint: disable=broad-except + logging.error( + "[swap_encryption] Gate 2 FAILED — stress phase error: %s", e + ) + logging.warning( + "[swap_encryption] Proceeding to Tier 3 (workloads are " + "independent of stress-ng results)" + ) + + # ── Cost estimate ───────────────────────────────────────────────────────── + if _COLLECT_COST.value: + elapsed = time.time() - t_run_start + results += _collect_cost_sample(pod, elapsed, base_meta) + + # ── Final degradation gate ──────────────────────────────────────────────── + # The phase try/except blocks above keep the run alive so partial data is + # still collected, but that means a catastrophic failure (pod OOM-evicted + # mid-run, no fio data, stress-ng killed before it could drive swap I/O) + # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. + # Detect those conditions here and surface them explicitly. + if _active_pod and _active_pod[0] != original_pod: + _degraded_reasons.append( + f"benchmark pod was replaced during the run ({original_pod} →" + f" {_active_pod[0]}) — it was OOM-evicted under swap pressure;" + " phases executed after the eviction ran against a" + " freshly-initialised pod (empty /tmp, swap re-setup) and may be" + " invalid" + ) + if _pod_lost: + _degraded_reasons.append( + "benchmark pod(s) went NotFound during the run" + f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure' + " eviction or container exit) and any phase running at or after" + " that point (e.g. kernel-build baseline, OpenSearch) produced" + " invalid data" + ) + if _oom_events: + _degraded_reasons.append( + f"OOM kill(s) (rc=137) occurred during the run on pod(s) " + f'{", ".join(_oom_events)} — a phase exceeded memory and was' + " killed by " + f"the OOM killer (the container may have restarted in place), so" + f" the " + f"affected phase(s) produced no or partial data" + ) + + if _phase_selected("fio") and not tier1_results: + if swap_dev.startswith("/dev/loop"): + # Expected: COS blocks device-mapper from pod namespaces on single-disk + # nodes (n2/n4 without --swap_encryption_add_swap_disk or lssd). + # Tier 2/3 results are still valid; do NOT mark the run as degraded. + logging.warning( + "[swap_encryption] Gate 1 (fio) skipped — loop device %s has no" + " dm-crypt support from inside a pod. Tier 2/3 results are" + " valid. Use c4-*-lssd or --swap_encryption_add_swap_disk for" + " fio data.", + swap_dev, + ) + else: + _degraded_reasons.append( + "Gate 1 (fio microbenchmarks) produced no samples — the raw" + " swap device was never characterised" + ) + + degraded = bool(_degraded_reasons) + results.append( + sample.Sample( + "swap_encryption_run_status", + 0.0 if degraded else 1.0, + "status", + dict( + base_meta, + degraded=degraded, + degraded_reasons="; ".join(_degraded_reasons) or "none", + num_samples=len(results) + 1, + ), + ) + ) + + if degraded: + msg = "[swap_encryption] RUN DEGRADED — " + "; ".join(_degraded_reasons) + logging.error(msg) + if _FAIL_ON_DEGRADED.value: + # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The + # samples collected so far are still published by PKB before the failure + # is recorded, so no data is lost. + raise errors.Benchmarks.RunError(msg) else: - _degraded_reasons.append( - 'Gate 1 (fio microbenchmarks) produced no samples — the raw swap ' - 'device was never characterised') - - - degraded = bool(_degraded_reasons) - results.append(sample.Sample( - 'swap_encryption_run_status', - 0.0 if degraded else 1.0, - 'status', - dict(base_meta, - degraded=degraded, - degraded_reasons='; '.join(_degraded_reasons) or 'none', - num_samples=len(results) + 1))) - - if degraded: - msg = ('[swap_encryption] RUN DEGRADED — ' - + '; '.join(_degraded_reasons)) - logging.error(msg) - if _FAIL_ON_DEGRADED.value: - # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The - # samples collected so far are still published by PKB before the failure - # is recorded, so no data is lost. - raise errors.Benchmarks.RunError(msg) - else: - logging.info('[swap_encryption] Run completed cleanly (%d samples)', - len(results)) - - return results + logging.info( + "[swap_encryption] Run completed cleanly (%d samples)", len(results) + ) + + return results def Cleanup(spec) -> None: - """Remove the DaemonSet and tear down any swap configuration.""" - pod = _wait_for_benchmark_pod(timeout=30) - if pod: - _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True) - _pod_exec(pod, textwrap.dedent(""" + """Remove the DaemonSet and tear down any swap configuration.""" + pod = _wait_for_benchmark_pod(timeout=30) + if pod: + _pod_exec(pod, "swapoff -a 2>/dev/null || true", ignore_failure=True) + _pod_exec( + pod, + textwrap.dedent(""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true - """), ignore_failure=True) - # Clean up loop device backing files (single-disk fallback path). - _pod_exec(pod, textwrap.dedent(""" + """), + ignore_failure=True, + ) + # Clean up loop device backing files (single-disk fallback path). + _pod_exec( + pod, + textwrap.dedent(""" for backing in /var/pkb_swap_backing /run/pkb_swap_backing \ /mnt/stateful_partition/pkb_swap_backing do @@ -790,154 +835,228 @@ def Cleanup(spec) -> None: done rm -f "$backing" done - """), ignore_failure=True) - _pod_exec(pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", - ignore_failure=True) + """), + ignore_failure=True, + ) + _pod_exec( + pod, + "pkill -9 'stress-ng|fio' 2>/dev/null || true", + ignore_failure=True, + ) + + _delete_daemonset() + + # Detach and delete the dedicated swap disk if one was provisioned. + cluster = spec.container_cluster + if _ADD_SWAP_DISK.value and getattr(cluster, "project", None): + _detach_and_delete_swap_disk(cluster) + - _delete_daemonset() +def _configure_eks_kubelet_swap(spec) -> None: + """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap. - # Detach and delete the dedicated swap disk if one was provisioned. - cluster = spec.container_cluster - if _ADD_SWAP_DISK.value and getattr(cluster, 'project', None): - _detach_and_delete_swap_disk(cluster) + NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm + integration) to merge. When that lands, EKS node pools should include + a preBootstrapCommands block writing nodeadm config with + memorySwapBehavior: LimitedSwap before kubelet starts. + + See also: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 + """ + logging.warning( + "[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is " + "deferred (blocked on PR #6780 — SwapConfigSpec). " + "EKS nodes will use default kubelet swap settings until that PR merges." + ) def _deploy_daemonset() -> None: - """Apply the benchmark DaemonSet manifest to the cluster.""" - manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value) - with vm_util.NamedTemporaryFile(mode='w', suffix='.yaml') as f: - f.write(manifest) - f.close() - kubectl.RunKubectlCommand(['apply', '-f', f.name]) - logging.info('[swap_encryption] DaemonSet applied') + """Apply the benchmark DaemonSet manifest to the cluster.""" + manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value) + with vm_util.NamedTemporaryFile(mode="w", suffix=".yaml") as f: + f.write(manifest) + f.close() + kubectl.RunKubectlCommand(["apply", "-f", f.name]) + logging.info("[swap_encryption] DaemonSet applied") def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: - """Wait until the DaemonSet pod is Running AND tools are installed. - - The benchmark container installs apt packages on first start and writes - /tmp/pkb_ready when done (~2-4 min on a cold node). We must wait for - that sentinel before exec-ing any commands, otherwise tools like - cryptsetup / fio may not yet be on PATH. - - Uses tab-separated name/phase output so kubectl always exits 0 regardless - of whether any pods are present, avoiding jsonpath index errors. - """ - deadline = time.time() + timeout - last_phase = '' - ready_pod = None # pod name once phase == Running - - while time.time() < deadline: - # ── Step 1: wait for Running phase ────────────────────────────────────── - if ready_pod is None: - out, _, rc = kubectl.RunKubectlCommand([ - 'get', 'pods', - '-l', f'app={_DS_LABEL}', - '-n', _DS_NAMESPACE, - '-o', - r'jsonpath={range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}', - ], raise_on_failure=False) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split('\t') - if len(parts) == 2: - pod_name, phase = parts[0].strip(), parts[1].strip() - if phase == 'Running': - logging.info('[swap_encryption] Pod %s is Running – ' - 'waiting for tool install to finish...', pod_name) - ready_pod = pod_name - break - if phase != last_phase: - logging.info('[swap_encryption] Pod %s phase: %s', pod_name, phase) - last_phase = phase - if phase in ('Pending',): - _log_pod_events(pod_name) - else: - logging.info('[swap_encryption] Waiting for DaemonSet pod to appear...') - - # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── - if ready_pod is not None: - sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand([ - 'exec', ready_pod, '-n', _DS_NAMESPACE, - '--', 'test', '-f', '/tmp/pkb_ready', - ], raise_on_failure=False) - if sentinel_rc == 0: - logging.info( - '[swap_encryption] Pod %s ready (tools installed)', ready_pod) - return ready_pod - # "container not found" means the container crashed (CrashLoopBackOff or - # exited) — treat it as a hard reset: re-check pod phase on next iteration. - if ('container not found' in sentinel_err - or 'unable to upgrade connection' in sentinel_err): - logging.warning('[swap_encryption] Pod %s: container not running (%s) ' - '— will re-check pod state', ready_pod, sentinel_err.strip()) - ready_pod = None - last_phase = '' - else: - logging.info( - '[swap_encryption] Pod %s: still installing tools...', ready_pod) - - time.sleep(15) + """Wait until the DaemonSet pod is Running AND tools are installed. + + The benchmark container installs apt packages on first start and writes + /tmp/pkb_ready when done (~2-4 min on a cold node). We must wait for + that sentinel before exec-ing any commands, otherwise tools like + cryptsetup / fio may not yet be on PATH. + + Uses tab-separated name/phase output so kubectl always exits 0 regardless + of whether any pods are present, avoiding jsonpath index errors. + """ + deadline = time.time() + timeout + last_phase = "" + ready_pod = None # pod name once phase == Running + + while time.time() < deadline: + # ── Step 1: wait for Running phase ────────────────────────────────────── + if ready_pod is None: + out, _, rc = kubectl.RunKubectlCommand( + [ + "get", + "pods", + "-l", + f"app={_DS_LABEL}", + "-n", + _DS_NAMESPACE, + "-o", + ( + r"jsonpath={range" + r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' + ), + ], + raise_on_failure=False, + ) + + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split("\t") + if len(parts) == 2: + pod_name, phase = parts[0].strip(), parts[1].strip() + if phase == "Running": + logging.info( + "[swap_encryption] Pod %s is Running – " + "waiting for tool install to finish...", + pod_name, + ) + ready_pod = pod_name + break + if phase != last_phase: + logging.info( + "[swap_encryption] Pod %s phase: %s", + pod_name, + phase, + ) + last_phase = phase + if phase in ("Pending",): + _log_pod_events(pod_name) + else: + logging.info( + "[swap_encryption] Waiting for DaemonSet pod to appear..." + ) + + # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── + if ready_pod is not None: + sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( + [ + "exec", + ready_pod, + "-n", + _DS_NAMESPACE, + "--", + "test", + "-f", + "/tmp/pkb_ready", + ], + raise_on_failure=False, + ) + if sentinel_rc == 0: + logging.info( + "[swap_encryption] Pod %s ready (tools installed)", + ready_pod, + ) + return ready_pod + # "container not found" means the container crashed (CrashLoopBackOff or + # exited) — treat it as a hard reset: re-check pod phase on next iteration. + if ( + "container not found" in sentinel_err + or "unable to upgrade connection" in sentinel_err + ): + logging.warning( + "[swap_encryption] Pod %s: container not running (%s) " + "— will re-check pod state", + ready_pod, + sentinel_err.strip(), + ) + ready_pod = None + last_phase = "" + else: + logging.info( + "[swap_encryption] Pod %s: still installing tools...", + ready_pod, + ) + + time.sleep(15) - logging.warning( - '[swap_encryption] Benchmark pod not ready after %ds', timeout) - return None + logging.warning( + "[swap_encryption] Benchmark pod not ready after %ds", timeout + ) + return None def _log_pod_events(pod_name: str) -> None: - """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" - events_out, _, _ = kubectl.RunKubectlCommand([ - 'describe', 'pod', pod_name, - '-n', _DS_NAMESPACE, - ], raise_on_failure=False) - # Only log the Events section to keep output manageable - in_events = False - lines = [] - for line in events_out.splitlines(): - if line.startswith('Events:'): - in_events = True - if in_events: - lines.append(line) - if lines: - logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])) - else: - logging.info('[swap_encryption] kubectl describe output:\n%s', - events_out[-2000:] if len(events_out) > 2000 else events_out) + """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" + events_out, _, _ = kubectl.RunKubectlCommand( + [ + "describe", + "pod", + pod_name, + "-n", + _DS_NAMESPACE, + ], + raise_on_failure=False, + ) + # Only log the Events section to keep output manageable + in_events = False + lines = [] + for line in events_out.splitlines(): + if line.startswith("Events:"): + in_events = True + if in_events: + lines.append(line) + if lines: + logging.info("[swap_encryption] Pod events:\n%s", "\n".join(lines[:30])) + else: + logging.info( + "[swap_encryption] kubectl describe output:\n%s", + events_out[-2000:] if len(events_out) > 2000 else events_out, + ) def _delete_daemonset() -> None: - """Delete the benchmark DaemonSet.""" - kubectl.RunKubectlCommand([ - 'delete', 'daemonset', _DS_NAME, - '-n', _DS_NAMESPACE, - '--ignore-not-found', - ], raise_on_failure=False) - logging.info('[swap_encryption] DaemonSet deleted') + """Delete the benchmark DaemonSet.""" + kubectl.RunKubectlCommand( + [ + "delete", + "daemonset", + _DS_NAME, + "-n", + _DS_NAMESPACE, + "--ignore-not-found", + ], + raise_on_failure=False, + ) + logging.info("[swap_encryption] DaemonSet deleted") def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str: - """Return a bash startup script for the benchmark nodepool. - - NOTE: This function is not currently used. GKE reserves the - `startup-script` node metadata key, so dm-crypt setup is performed - from within the privileged DaemonSet pod instead (see - _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference. - - Args: - enable_dmcrypt: When True, wrap the swap device in dm-crypt plain - mode (aes-xts-plain64, ephemeral random key) matching GKE's - go/node:swap-encryption implementation. - lssd: When True, build a RAID-0 array across all local SSDs before - setting up swap (matches go/gke-swap-lssd). - - Returns: - A bash script string suitable for running as root at node boot. - """ - dmcrypt_str = 'true' if enable_dmcrypt else 'false' - lssd_str = 'true' if lssd else 'false' - - return textwrap.dedent(f"""\ + """Return a bash startup script for the benchmark nodepool. + + NOTE: This function is not currently used. GKE reserves the + `startup-script` node metadata key, so dm-crypt setup is performed + from within the privileged DaemonSet pod instead (see + _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference. + + Args: + enable_dmcrypt: When True, wrap the swap device in dm-crypt plain + mode (aes-xts-plain64, ephemeral random key) matching GKE's + go/node:swap-encryption implementation. + lssd: When True, build a RAID-0 array across all local SSDs before + setting up swap (matches go/gke-swap-lssd). + + Returns: + A bash script string suitable for running as root at node boot. + """ + dmcrypt_str = "true" if enable_dmcrypt else "false" + lssd_str = "true" if lssd else "false" + + return textwrap.dedent(f"""\ #!/bin/bash # PKB swap_encryption_benchmark — nodepool startup script. # Configures swap once at node boot so all benchmark phases see a @@ -1001,355 +1120,519 @@ def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str: """) -_HYPERDISK_MAX_IOPS_PER_MBPS = 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s +_HYPERDISK_MAX_IOPS_PER_MBPS = ( + 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s +) def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: - """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. - - Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed - 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails - with "Requested provisioned throughput is too low for the provisioned iops". - Clamp throughput UP to the minimum the requested IOPS need (plus a small - margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk - creation. - """ - min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) - if throughput < min_tput: - logging.warning( - '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for ' - '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d', - throughput, iops, min_tput, min_tput) - return min_tput - return throughput + """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. + + Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed + 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails + with "Requested provisioned throughput is too low for the provisioned iops". + Clamp throughput UP to the minimum the requested IOPS need (plus a small + margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk + creation. + """ + min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) + if throughput < min_tput: + logging.warning( + "[swap_encryption] boot/swap disk throughput %d MiB/s is too low" + " for %d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s);" + " raising to %d", + throughput, + iops, + min_tput, + min_tput, + ) + return min_tput + return throughput def _create_benchmark_node_pool(cluster) -> None: - """Add the benchmark nodepool to the existing cluster (Step 2 of setup). - - Uses: - --swap_encryption_benchmark_machine_type (default n4-highmem-32) - --swap_encryption_node_image_type (default COS_CONTAINERD) - --swap_encryption_boot_disk_iops (default 80000) - --swap_encryption_enable_dmcrypt (default True) - - The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet - nodeSelector targets it exclusively. dm-crypt swap setup is performed - from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / - _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata - because GKE reserves that metadata key and rejects it at the API level. - """ - machine_type = _BENCHMARK_MACHINE_TYPE.value - # Auto-detect LSSD from machine type name; flag overrides only when True. - is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower() - - # Determine zone/region from the cluster object. - zone_flags: list[str] = [] - if getattr(cluster, 'zones', None): - zone_flags = ['--zone', cluster.zones[0]] - elif getattr(cluster, 'region', None): - zone_flags = ['--region', cluster.region] - - # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). - # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on - # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk - # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable - # headroom and matches the Config 2 spec in the Engineer Assignments table). - disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value - - disk_type = _BOOT_DISK_TYPE.value - cmd = [ - 'gcloud', 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL, - '--cluster', cluster.name, - '--project', cluster.project, - '--machine-type', machine_type, - '--image-type', _NODE_IMAGE_TYPE.value, - '--disk-type', disk_type, - '--disk-size', str(disk_size_gb), - '--num-nodes', '1', - '--node-labels', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '--no-enable-autoupgrade', - '--no-enable-autorepair', - ] + zone_flags - - # IOPS and throughput provisioning only applies to hyperdisk-* types AND - # only when the boot disk is also the swap device (non-LSSD configs). - # For LSSD machines the boot disk is OS-only; swap is on local NVMe. - # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the - # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). - if disk_type.startswith('hyperdisk') and not is_lssd: - cmd += [ - '--boot-disk-provisioned-iops', str(_BOOT_DISK_IOPS.value), - '--boot-disk-provisioned-throughput', - str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, - _BOOT_DISK_THROUGHPUT.value)), - ] + """Add the benchmark nodepool to the existing cluster (Step 2 of setup). + + Uses: + --swap_encryption_benchmark_machine_type (default n4-highmem-32) + --swap_encryption_node_image_type (default COS_CONTAINERD) + --swap_encryption_boot_disk_iops (default 80000) + --swap_encryption_enable_dmcrypt (default True) + + The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet + nodeSelector targets it exclusively. dm-crypt swap setup is performed + from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / + _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata + because GKE reserves that metadata key and rejects it at the API level. + """ + machine_type = _BENCHMARK_MACHINE_TYPE.value + # Auto-detect LSSD from machine type name; flag overrides only when True. + is_lssd = _BENCHMARK_LSSD.value or "lssd" in machine_type.lower() + + # Determine zone/region from the cluster object. + zone_flags: list[str] = [] + if getattr(cluster, "zones", None): + zone_flags = ["--zone", cluster.zones[0]] + elif getattr(cluster, "region", None): + zone_flags = ["--region", cluster.region] + + # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). + # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on + # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk + # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable + # headroom and matches the Config 2 spec in the Engineer Assignments table). + disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value + + disk_type = _BOOT_DISK_TYPE.value + cmd = [ + "gcloud", + "container", + "node-pools", + "create", + _BENCHMARK_NODEPOOL, + "--cluster", + cluster.name, + "--project", + cluster.project, + "--machine-type", + machine_type, + "--image-type", + _NODE_IMAGE_TYPE.value, + "--disk-type", + disk_type, + "--disk-size", + str(disk_size_gb), + "--num-nodes", + "1", + "--node-labels", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + "--no-enable-autoupgrade", + "--no-enable-autorepair", + ] + zone_flags + + # IOPS and throughput provisioning only applies to hyperdisk-* types AND + # only when the boot disk is also the swap device (non-LSSD configs). + # For LSSD machines the boot disk is OS-only; swap is on local NVMe. + # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the + # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). + if disk_type.startswith("hyperdisk") and not is_lssd: + cmd += [ + "--boot-disk-provisioned-iops", + str(_BOOT_DISK_IOPS.value), + "--boot-disk-provisioned-throughput", + str( + _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + ), + ] + + # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm + # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). + if is_lssd: + cmd += ["--local-nvme-ssd-block", f"count={_LSSD_COUNT.value}"] - # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm - # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). - if is_lssd: - cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'] - - logging.info('[swap_encryption] Creating benchmark nodepool: %s / %s / ' - 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' - 'add_swap_disk=%s', - _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value, - disk_size_gb, _BOOT_DISK_IOPS.value, - _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value) - - # LSSD nodepools take longer to provision than PD-only nodepools because - # GKE must also initialise the local NVMe devices before marking nodes Ready. - # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. - stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=1200, - raise_on_failure=False) - - if rc != 0: - # Idempotent prepare: if the nodepool already exists (e.g. re-running - # --run_stage=prepare,run to redeploy the DaemonSet onto an existing - # cluster), reuse it instead of failing. gcloud returns a 409 / - # "Already exists" message in this case. - low = (stderr or '').lower() - if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low: - logging.info('[swap_encryption] Benchmark nodepool already exists — ' - 'reusing it (idempotent prepare); proceeding to DaemonSet') - return - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create benchmark nodepool ' - f'(rc={rc}): {stderr}' + logging.info( + "[swap_encryption] Creating benchmark nodepool: %s / %s / " + "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / " + "add_swap_disk=%s", + _BENCHMARK_NODEPOOL, + machine_type, + _NODE_IMAGE_TYPE.value, + disk_size_gb, + _BOOT_DISK_IOPS.value, + _ENABLE_DMCRYPT.value, + is_lssd, + _ADD_SWAP_DISK.value, ) - logging.info('[swap_encryption] Benchmark nodepool ready') + + # LSSD nodepools take longer to provision than PD-only nodepools because + # GKE must also initialise the local NVMe devices before marking nodes Ready. + # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. + stdout, stderr, rc = vm_util.IssueCommand( + cmd, timeout=1200, raise_on_failure=False + ) + + if rc != 0: + # Idempotent prepare: if the nodepool already exists (e.g. re-running + # --run_stage=prepare,run to redeploy the DaemonSet onto an existing + # cluster), reuse it instead of failing. gcloud returns a 409 / + # "Already exists" message in this case. + low = (stderr or "").lower() + if ( + "already exists" in low + or "alreadyexists" in low + or "code=409" in low + ): + logging.info( + "[swap_encryption] Benchmark nodepool already exists — " + "reusing it (idempotent prepare); proceeding to DaemonSet" + ) + return + raise errors.Benchmarks.RunError( + "[swap_encryption] Failed to create benchmark nodepool " + f"(rc={rc}): {stderr}" + ) + logging.info("[swap_encryption] Benchmark nodepool ready") def _wait_for_benchmark_node(timeout: int = 900) -> None: - """Block until a node labelled pkb_nodepool=benchmark is Ready. - - gcloud container node-pools create returns as soon as the API accepts the - request — the actual node VM may take another 2-4 minutes to boot, join the - cluster, and pass its readiness checks. Deploying the DaemonSet before that - point leaves the pod Pending indefinitely because the nodeSelector finds no - eligible node. - - This function polls kubectl every 15 s until at least one node with - pkb_nodepool=benchmark has Ready=True, then returns. - """ - deadline = time.time() + timeout - logging.info('[swap_encryption] Waiting for benchmark node ' - '(pkb_nodepool=benchmark) to be Ready...') - while time.time() < deadline: - out, _, rc = kubectl.RunKubectlCommand([ - 'get', 'nodes', - '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', r'jsonpath={range .items[*]}' - r'{.metadata.name}{"\t"}' - r'{range .status.conditions[?(@.type=="Ready")]}' - r'{.status}{"\n"}{end}{end}', - ], raise_on_failure=False) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split('\t') - if len(parts) == 2 and parts[1].strip() == 'True': - logging.info('[swap_encryption] Benchmark node ready: %s', - parts[0].strip()) - return - - logging.info('[swap_encryption] Benchmark node not yet Ready — ' - 'retrying in 15 s...') - time.sleep(15) - - raise errors.Benchmarks.RunError( - '[swap_encryption] Timed out waiting for benchmark node ' - f'(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready ' - f'after {timeout}s' - ) + """Block until a node labelled pkb_nodepool=benchmark is Ready. + + gcloud container node-pools create returns as soon as the API accepts the + request — the actual node VM may take another 2-4 minutes to boot, join the + cluster, and pass its readiness checks. Deploying the DaemonSet before that + point leaves the pod Pending indefinitely because the nodeSelector finds no + eligible node. + + This function polls kubectl every 15 s until at least one node with + pkb_nodepool=benchmark has Ready=True, then returns. + """ + deadline = time.time() + timeout + logging.info( + "[swap_encryption] Waiting for benchmark node " + "(pkb_nodepool=benchmark) to be Ready..." + ) + while time.time() < deadline: + out, _, rc = kubectl.RunKubectlCommand( + [ + "get", + "nodes", + "-l", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + "-o", + r"jsonpath={range .items[*]}" + r'{.metadata.name}{"\t"}' + r'{range .status.conditions[?(@.type=="Ready")]}' + r'{.status}{"\n"}{end}{end}', + ], + raise_on_failure=False, + ) + + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split("\t") + if len(parts) == 2 and parts[1].strip() == "True": + logging.info( + "[swap_encryption] Benchmark node ready: %s", + parts[0].strip(), + ) + return + logging.info( + "[swap_encryption] Benchmark node not yet Ready — " + "retrying in 15 s..." + ) + time.sleep(15) -def _attach_swap_disk(cluster) -> None: - """Create a dedicated hyperdisk and attach it to the benchmark node. - - gcloud container node-pools create --additional-node-disk is not available - in all gcloud SDK versions, so we use gcloud compute to create the disk and - attach it after the node is ready. In GKE the Kubernetes node name is the - same as the GCE instance name, so no translation is needed. - - After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe - nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. - - The disk is named pkb-swap- to avoid name collisions across - concurrent runs. Cleanup deletes it in Cleanup() if it exists. - """ - # Resolve zone from cluster - zone = None - if getattr(cluster, 'zones', None): - zone = cluster.zones[0] - elif getattr(cluster, 'region', None): - zone = cluster.region - if not zone: raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot attach swap disk: cluster zone unknown') - - project = cluster.project - disk_name = f'pkb-swap-{cluster.name}' - disk_type = _BOOT_DISK_TYPE.value - disk_size_gb = _SWAP_DISK_SIZE_GB.value - - # ── Step 1: get the GCE instance name of the benchmark node ─────────────── - node_out, _, rc = kubectl.RunKubectlCommand([ - 'get', 'nodes', - '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', 'jsonpath={.items[0].metadata.name}', - ], raise_on_failure=False) - instance_name = node_out.strip() - if rc != 0 or not instance_name: - raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot find benchmark node for swap disk attach') - logging.info('[swap_encryption] Benchmark node instance: %s', instance_name) - - # ── Step 2: create the hyperdisk ────────────────────────────────────────── - logging.info('[swap_encryption] Creating swap disk %s (%dGiB %s)', - disk_name, disk_size_gb, disk_type) - create_cmd = [ - 'gcloud', 'compute', 'disks', 'create', disk_name, - '--project', project, - '--zone', zone, - '--type', disk_type, - '--size', f'{disk_size_gb}GB', - '--quiet', - ] - if disk_type.startswith('hyperdisk'): - create_cmd += [ - '--provisioned-iops', str(_BOOT_DISK_IOPS.value), - '--provisioned-throughput', - str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, - _BOOT_DISK_THROUGHPUT.value)), + "[swap_encryption] Timed out waiting for benchmark node " + f"(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready " + f"after {timeout}s" + ) + + +def _attach_swap_disk(cluster) -> None: + """Create a dedicated hyperdisk and attach it to the benchmark node. + + gcloud container node-pools create --additional-node-disk is not available + in all gcloud SDK versions, so we use gcloud compute to create the disk and + attach it after the node is ready. In GKE the Kubernetes node name is the + same as the GCE instance name, so no translation is needed. + + After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe + nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. + + The disk is named pkb-swap- to avoid name collisions across + concurrent runs. Cleanup deletes it in Cleanup() if it exists. + """ + # Resolve zone from cluster + zone = None + if getattr(cluster, "zones", None): + zone = cluster.zones[0] + elif getattr(cluster, "region", None): + zone = cluster.region + if not zone: + raise errors.Benchmarks.RunError( + "[swap_encryption] Cannot attach swap disk: cluster zone unknown" + ) + + project = cluster.project + disk_name = f"pkb-swap-{cluster.name}" + disk_type = _BOOT_DISK_TYPE.value + disk_size_gb = _SWAP_DISK_SIZE_GB.value + + # ── Step 1: get the GCE instance name of the benchmark node ─────────────── + node_out, _, rc = kubectl.RunKubectlCommand( + [ + "get", + "nodes", + "-l", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + "-o", + "jsonpath={.items[0].metadata.name}", + ], + raise_on_failure=False, + ) + instance_name = node_out.strip() + if rc != 0 or not instance_name: + raise errors.Benchmarks.RunError( + "[swap_encryption] Cannot find benchmark node for swap disk attach" + ) + logging.info("[swap_encryption] Benchmark node instance: %s", instance_name) + + # ── Step 2: create the hyperdisk ────────────────────────────────────────── + logging.info( + "[swap_encryption] Creating swap disk %s (%dGiB %s)", + disk_name, + disk_size_gb, + disk_type, + ) + create_cmd = [ + "gcloud", + "compute", + "disks", + "create", + disk_name, + "--project", + project, + "--zone", + zone, + "--type", + disk_type, + "--size", + f"{disk_size_gb}GB", + "--quiet", ] - _, stderr, rc = vm_util.IssueCommand(create_cmd, timeout=120, - raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}') - - # ── Step 3: attach the disk to the node VM ──────────────────────────────── - logging.info('[swap_encryption] Attaching swap disk %s to %s', - disk_name, instance_name) - attach_cmd = [ - 'gcloud', 'compute', 'instances', 'attach-disk', instance_name, - '--project', project, - '--zone', zone, - '--disk', disk_name, - '--device-name', 'pkb-swap', - '--quiet', - ] - _, stderr, rc = vm_util.IssueCommand(attach_cmd, timeout=120, - raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to attach swap disk to {instance_name}: ' - f'{stderr}') - logging.info('[swap_encryption] Swap disk attached: %s → %s', - disk_name, instance_name) + if disk_type.startswith("hyperdisk"): + create_cmd += [ + "--provisioned-iops", + str(_BOOT_DISK_IOPS.value), + "--provisioned-throughput", + str( + _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + ), + ] + _, stderr, rc = vm_util.IssueCommand( + create_cmd, timeout=120, raise_on_failure=False + ) + if rc != 0: + raise errors.Benchmarks.RunError( + f"[swap_encryption] Failed to create swap disk {disk_name}:" + f" {stderr}" + ) + + # ── Step 3: attach the disk to the node VM ──────────────────────────────── + logging.info( + "[swap_encryption] Attaching swap disk %s to %s", + disk_name, + instance_name, + ) + attach_cmd = [ + "gcloud", + "compute", + "instances", + "attach-disk", + instance_name, + "--project", + project, + "--zone", + zone, + "--disk", + disk_name, + "--device-name", + "pkb-swap", + "--quiet", + ] + _, stderr, rc = vm_util.IssueCommand( + attach_cmd, timeout=120, raise_on_failure=False + ) + if rc != 0: + raise errors.Benchmarks.RunError( + f"[swap_encryption] Failed to attach swap disk to {instance_name}: " + f"{stderr}" + ) + logging.info( + "[swap_encryption] Swap disk attached: %s → %s", + disk_name, + instance_name, + ) def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: - """Detach (if attached) and delete a GCE disk, robustly, with retries. - - Finds the attached instance from the disk's own `users` field rather than - kubectl — kubectl is often unavailable during teardown (cluster being - deleted), which previously left the disk attached and undeletable, so it - leaked. Returns True if the disk is gone (deleted or already absent). - """ - for attempt in range(1, 5): - users, _, rc = vm_util.IssueCommand( - ['gcloud', 'compute', 'disks', 'describe', disk_name, - '--project', project, '--zone', zone, '--format=value(users)'], - timeout=60, raise_on_failure=False) - if rc != 0: - logging.info('[swap_encryption] Swap disk %s not present — nothing to ' - 'delete', disk_name) - return True # already gone - user = users.strip() - if user: - inst = user.split('/')[-1] - logging.info('[swap_encryption] Detaching swap disk %s from %s', - disk_name, inst) - vm_util.IssueCommand( - ['gcloud', 'compute', 'instances', 'detach-disk', inst, - '--project', project, '--zone', zone, '--disk', disk_name, - '--quiet'], timeout=120, raise_on_failure=False) - _, derr, drc = vm_util.IssueCommand( - ['gcloud', 'compute', 'disks', 'delete', disk_name, - '--project', project, '--zone', zone, '--quiet'], - timeout=180, raise_on_failure=False) - if drc == 0: - logging.info('[swap_encryption] Swap disk deleted: %s', disk_name) - return True - logging.warning('[swap_encryption] Swap disk delete attempt %d/4 failed ' - '(%s); retrying in 10s', attempt, derr.strip()[:160]) - time.sleep(10) - logging.error('[swap_encryption] Could NOT delete swap disk %s after retries ' - '— delete it manually: gcloud compute disks delete %s ' - '--zone %s --quiet', disk_name, disk_name, zone) - return False + """Detach (if attached) and delete a GCE disk, robustly, with retries. + + Finds the attached instance from the disk's own `users` field rather than + kubectl — kubectl is often unavailable during teardown (cluster being + deleted), which previously left the disk attached and undeletable, so it + leaked. Returns True if the disk is gone (deleted or already absent). + """ + for attempt in range(1, 5): + users, _, rc = vm_util.IssueCommand( + [ + "gcloud", + "compute", + "disks", + "describe", + disk_name, + "--project", + project, + "--zone", + zone, + "--format=value(users)", + ], + timeout=60, + raise_on_failure=False, + ) + if rc != 0: + logging.info( + "[swap_encryption] Swap disk %s not present — nothing to " + "delete", + disk_name, + ) + return True # already gone + user = users.strip() + if user: + inst = user.split("/")[-1] + logging.info( + "[swap_encryption] Detaching swap disk %s from %s", + disk_name, + inst, + ) + vm_util.IssueCommand( + [ + "gcloud", + "compute", + "instances", + "detach-disk", + inst, + "--project", + project, + "--zone", + zone, + "--disk", + disk_name, + "--quiet", + ], + timeout=120, + raise_on_failure=False, + ) + _, derr, drc = vm_util.IssueCommand( + [ + "gcloud", + "compute", + "disks", + "delete", + disk_name, + "--project", + project, + "--zone", + zone, + "--quiet", + ], + timeout=180, + raise_on_failure=False, + ) + if drc == 0: + logging.info("[swap_encryption] Swap disk deleted: %s", disk_name) + return True + logging.warning( + "[swap_encryption] Swap disk delete attempt %d/4 failed " + "(%s); retrying in 10s", + attempt, + derr.strip()[:160], + ) + time.sleep(10) + logging.error( + "[swap_encryption] Could NOT delete swap disk %s after retries " + "— delete it manually: gcloud compute disks delete %s " + "--zone %s --quiet", + disk_name, + disk_name, + zone, + ) + return False def _detach_and_delete_swap_disk(cluster) -> None: - """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" - zone = None - if getattr(cluster, 'zones', None): - zone = cluster.zones[0] - elif getattr(cluster, 'region', None): - zone = cluster.region - if not zone or not getattr(cluster, 'project', None): - return - _delete_disk_by_name(f'pkb-swap-{cluster.name}', cluster.project, zone) + """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" + zone = None + if getattr(cluster, "zones", None): + zone = cluster.zones[0] + elif getattr(cluster, "region", None): + zone = cluster.region + if not zone or not getattr(cluster, "project", None): + return + _delete_disk_by_name(f"pkb-swap-{cluster.name}", cluster.project, zone) def _delete_default_node_pool(cluster) -> None: - """Delete the dummy default nodepool after the benchmark pool is ready. - - The default nodepool (e2-medium) was only needed to satisfy GKE's - requirement that a cluster must have at least one nodepool at creation time. - Removing it stops the clock on its cost immediately. - """ - zone_flags: list[str] = [] - if getattr(cluster, 'zones', None): - zone_flags = ['--zone', cluster.zones[0]] - elif getattr(cluster, 'region', None): - zone_flags = ['--region', cluster.region] - - cmd = [ - 'gcloud', 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL, - '--cluster', cluster.name, - '--project', cluster.project, - '--quiet', - ] + zone_flags - - logging.info( - '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL) - stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=300, - raise_on_failure=False) - if rc != 0: - logging.warning('[swap_encryption] Could not delete default nodepool ' - '(rc=%d): %s', rc, stderr) - else: - logging.info('[swap_encryption] Default nodepool deleted') + """Delete the dummy default nodepool after the benchmark pool is ready. + + The default nodepool (e2-medium) was only needed to satisfy GKE's + requirement that a cluster must have at least one nodepool at creation time. + Removing it stops the clock on its cost immediately. + """ + zone_flags: list[str] = [] + if getattr(cluster, "zones", None): + zone_flags = ["--zone", cluster.zones[0]] + elif getattr(cluster, "region", None): + zone_flags = ["--region", cluster.region] + + cmd = [ + "gcloud", + "container", + "node-pools", + "delete", + _DEFAULT_NODEPOOL, + "--cluster", + cluster.name, + "--project", + cluster.project, + "--quiet", + ] + zone_flags + + logging.info( + "[swap_encryption] Deleting default nodepool: %s", _DEFAULT_NODEPOOL + ) + stdout, stderr, rc = vm_util.IssueCommand( + cmd, timeout=300, raise_on_failure=False + ) + if rc != 0: + logging.warning( + "[swap_encryption] Could not delete default nodepool (rc=%d): %s", + rc, + stderr, + ) + else: + logging.info("[swap_encryption] Default nodepool deleted") def _is_pod_gone(pod: str) -> bool: - """Return True if the named pod no longer exists in the cluster. - - Used to distinguish OOM-killed container processes (pod still alive, rc=137) - from OOM-evicted pods (pod gone, DaemonSet will create a replacement). - """ - try: - _, err, rc = kubectl.RunKubectlCommand( - ['get', 'pod', pod, '-n', _DS_NAMESPACE, - '-o', 'jsonpath={.metadata.name}'], - raise_on_failure=False, timeout=15, - ) - return rc != 0 and 'not found' in (err or '').lower() - except Exception: # pylint: disable=broad-except - return False + """Return True if the named pod no longer exists in the cluster. + + Used to distinguish OOM-killed container processes (pod still alive, rc=137) + from OOM-evicted pods (pod gone, DaemonSet will create a replacement). + """ + try: + _, err, rc = kubectl.RunKubectlCommand( + [ + "get", + "pod", + pod, + "-n", + _DS_NAMESPACE, + "-o", + "jsonpath={.metadata.name}", + ], + raise_on_failure=False, + timeout=15, + ) + return rc != 0 and "not found" in (err or "").lower() + except Exception: # pylint: disable=broad-except + return False def _pod_exec( @@ -1359,364 +1642,447 @@ def _pod_exec( timeout: int = 300, _retries: int = 2, ) -> tuple[str, str]: - """Run a shell command inside the benchmark pod via kubectl exec. - - Args: - pod: Pod name returned by _wait_for_benchmark_pod. - cmd: Shell command string passed to bash -c. - ignore_failure: When True, non-zero exit codes are logged but not - raised. - timeout: Seconds before PKB kills the kubectl exec process. Default - 300 s matches PKB's IssueCommand default. Pass a larger value for - long-running jobs (fio, stress-ng, kernel build). - _retries: Number of automatic retries on transient GKE websocket - resets ("connection reset by peer"). Set to 0 to disable retries - for idempotent-sensitive commands. - - Returns: - Tuple of (stdout, stderr) strings. - """ - _TRANSIENT_ERRORS = ('connection reset by peer', 'websocket: close') - # Errors that indicate the container/pod is gone and needs recovery. - # 'not found' covers "Error from server (NotFound): pods ... not found" - # which occurs when the DaemonSet pod was evicted and recreated under a - # new name (e.g. after OOM-triggered node pressure eviction). - # 'deleted state' covers "cannot exec in a deleted state" — the container - # was OOM-killed and is mid-termination (not yet recreated). - _CONTAINER_GONE_ERRORS = ('container not found', 'procReady not received', - 'unable to upgrade connection', 'not found', - 'deleted state') - # Use the globally-tracked active pod name — it may have been updated by - # a previous _recover_pod call when eviction replaced the pod. - active = _active_pod[0] if _active_pod else pod - - for attempt in range(_retries + 1): - out, err, rc = kubectl.RunKubectlCommand( - ['exec', active, '-n', _DS_NAMESPACE, - '--', 'bash', '-c', cmd], - raise_on_failure=False, - raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets - timeout=timeout, - ) - is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS) - if is_transient and attempt < _retries: - logging.warning( - '[swap_encryption] kubectl exec connection reset (attempt %d/%d); ' - 'retrying in 10 s', attempt + 1, _retries + 1) - time.sleep(10) - continue - # rc=137 (SIGKILL): the OOM killer terminated the container process. - # Two sub-cases: - # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. - # B) Container OOM restart: pod still exists, container restarts in place. - # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, - # tools must be re-installed before subsequent commands can run.) - # In both cases we call _recover_pod to wait for tools + sentinel, and - # we do NOT retry the OOM-triggering command itself. - if rc == 137: - # Record the OOM so the run-level gate can flag it even if the container - # restarts in place under the same pod name (which leaves both the - # "pod replaced" and "pod NotFound" checks silent). - if active not in _oom_events: - _oom_events.append(active) - # CRITICAL: sleep before checking pod state. Kubernetes takes a few - # seconds to mark a just-evicted pod as Terminating / NotFound. Without - # this delay _recover_pod sees the pod still in "Running" phase, returns - # the old pod name immediately, and every subsequent command fails with - # "Error from server (NotFound): pods … not found". - logging.warning( - '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update ' - 'pod state before recovery check') - time.sleep(15) - pod_gone = _is_pod_gone(active) - if pod_gone: - logging.warning( - '[swap_encryption] OOM-eviction detected (rc=137, pod gone) — ' - 'recovering pod name for subsequent commands (not retrying this cmd)') - else: - logging.warning( - '[swap_encryption] Container OOM-killed (rc=137, pod still exists) — ' - 'waiting for container restart and tool re-install before continuing') - new_pod = _recover_pod(active) - if new_pod != active: - logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. - - is_container_gone = (rc != 0 and - any(e in err.lower() for e in _CONTAINER_GONE_ERRORS)) - if is_container_gone: - # Record the loss for the run-level degradation gate REGARDLESS of retry - # budget or ignore_failure. A "pods … not found" on a best-effort command - # (kernel build, opensearch, cleanup of a dead pod) still means the pod - # died; without this the gate stays blind because _active_pod is only - # renamed on the retry path below, which _retries=0 callers never reach. - if active and active not in _pod_lost: - _pod_lost.append(active) - logging.error( - '[swap_encryption] Benchmark pod %s is gone (%s) — recording run ' - 'as degraded', active, (err or '').strip()[:160]) - if attempt < _retries: - logging.warning( - '[swap_encryption] Container gone/restarting (attempt %d/%d) — ' - 'waiting for pod to recover...', attempt + 1, _retries + 1) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - continue - break - - if rc != 0 and not ignore_failure: - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] _pod_exec failed (rc={rc}): {err}') - return out, err + """Run a shell command inside the benchmark pod via kubectl exec. + + Args: + pod: Pod name returned by _wait_for_benchmark_pod. + cmd: Shell command string passed to bash -c. + ignore_failure: When True, non-zero exit codes are logged but not + raised. + timeout: Seconds before PKB kills the kubectl exec process. Default + 300 s matches PKB's IssueCommand default. Pass a larger value for + long-running jobs (fio, stress-ng, kernel build). + _retries: Number of automatic retries on transient GKE websocket + resets ("connection reset by peer"). Set to 0 to disable retries + for idempotent-sensitive commands. + + Returns: + Tuple of (stdout, stderr) strings. + """ + _TRANSIENT_ERRORS = ("connection reset by peer", "websocket: close") + # Errors that indicate the container/pod is gone and needs recovery. + # 'not found' covers "Error from server (NotFound): pods ... not found" + # which occurs when the DaemonSet pod was evicted and recreated under a + # new name (e.g. after OOM-triggered node pressure eviction). + # 'deleted state' covers "cannot exec in a deleted state" — the container + # was OOM-killed and is mid-termination (not yet recreated). + _CONTAINER_GONE_ERRORS = ( + "container not found", + "procReady not received", + "unable to upgrade connection", + "not found", + "deleted state", + ) + # Use the globally-tracked active pod name — it may have been updated by + # a previous _recover_pod call when eviction replaced the pod. + active = _active_pod[0] if _active_pod else pod + + for attempt in range(_retries + 1): + out, err, rc = kubectl.RunKubectlCommand( + ["exec", active, "-n", _DS_NAMESPACE, "--", "bash", "-c", cmd], + raise_on_failure=False, + raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets + timeout=timeout, + ) + is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS) + if is_transient and attempt < _retries: + logging.warning( + "[swap_encryption] kubectl exec connection reset (attempt" + " %d/%d); retrying in 10 s", + attempt + 1, + _retries + 1, + ) + time.sleep(10) + continue + # rc=137 (SIGKILL): the OOM killer terminated the container process. + # Two sub-cases: + # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. + # B) Container OOM restart: pod still exists, container restarts in place. + # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, + # tools must be re-installed before subsequent commands can run.) + # In both cases we call _recover_pod to wait for tools + sentinel, and + # we do NOT retry the OOM-triggering command itself. + if rc == 137: + # Record the OOM so the run-level gate can flag it even if the container + # restarts in place under the same pod name (which leaves both the + # "pod replaced" and "pod NotFound" checks silent). + if active not in _oom_events: + _oom_events.append(active) + # CRITICAL: sleep before checking pod state. Kubernetes takes a few + # seconds to mark a just-evicted pod as Terminating / NotFound. Without + # this delay _recover_pod sees the pod still in "Running" phase, returns + # the old pod name immediately, and every subsequent command fails with + # "Error from server (NotFound): pods … not found". + logging.warning( + "[swap_encryption] rc=137 — sleeping 15s for Kubernetes to" + " update pod state before recovery check" + ) + time.sleep(15) + pod_gone = _is_pod_gone(active) + if pod_gone: + logging.warning( + "[swap_encryption] OOM-eviction detected (rc=137, pod gone)" + " — recovering pod name for subsequent commands (not" + " retrying this cmd)" + ) + else: + logging.warning( + "[swap_encryption] Container OOM-killed (rc=137, pod still" + " exists) — waiting for container restart and tool" + " re-install before continuing" + ) + new_pod = _recover_pod(active) + if new_pod != active: + logging.info( + "[swap_encryption] Pod name updated: %s → %s", + active, + new_pod, + ) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. + + is_container_gone = rc != 0 and any( + e in err.lower() for e in _CONTAINER_GONE_ERRORS + ) + if is_container_gone: + # Record the loss for the run-level degradation gate REGARDLESS of retry + # budget or ignore_failure. A "pods … not found" on a best-effort command + # (kernel build, opensearch, cleanup of a dead pod) still means the pod + # died; without this the gate stays blind because _active_pod is only + # renamed on the retry path below, which _retries=0 callers never reach. + if active and active not in _pod_lost: + _pod_lost.append(active) + logging.error( + "[swap_encryption] Benchmark pod %s is gone (%s) —" + " recording run as degraded", + active, + (err or "").strip()[:160], + ) + if attempt < _retries: + logging.warning( + "[swap_encryption] Container gone/restarting (attempt" + " %d/%d) — waiting for pod to recover...", + attempt + 1, + _retries + 1, + ) + new_pod = _recover_pod(active) + if new_pod != active: + logging.info( + "[swap_encryption] Pod name updated: %s → %s", + active, + new_pod, + ) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + continue + break + + if rc != 0 and not ignore_failure: + raise errors.VmUtil.IssueCommandError( + f"[swap_encryption] _pod_exec failed (rc={rc}): {err}" + ) + return out, err def _recover_pod(pod: str, timeout_sec: int = 600) -> str: - """Wait for a DaemonSet container to recover after OOM kill or eviction. - - Handles two scenarios: - 1. Container OOM restart: same pod name, container restarting in place. - DaemonSet restartPolicy=Always brings it back under the same pod name. - 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates - a new pod with a DIFFERENT name. We detect this by checking whether - the named pod still exists; if not, we search by the DaemonSet label - selector for a Running pod. - - Returns the (possibly new) pod name once it is Running and ready. - """ - deadline = time.time() + timeout_sec - logging.info('[swap_encryption] Waiting for pod %s to recover ' - '(up to %ds)...', pod, timeout_sec) - - # Phase 1: wait for a Running pod — either the named one (container - # restart) or a replacement pod found via label selector (eviction). - # - # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a - # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp - # (the pod is "Terminating") while status.phase may still read "Running" for - # several seconds. Checking only status.phase causes a false-positive: we - # return the old pod name immediately and every subsequent command fails with - # "Error from server (NotFound)". Checking deletionTimestamp catches this. - recovered_pod = pod - while time.time() < deadline: - # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not - # stdout. When the pod is gone, status_out is empty and the error text - # lives entirely in status_err. Discarding stderr (using _) means the - # 'not found' check below never fires and we spin until deadline. - status_out, status_err, status_rc = kubectl.RunKubectlCommand( - ['get', 'pod', pod, '-n', _DS_NAMESPACE, - '-o', 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'], - raise_on_failure=False, timeout=30, - ) - # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) - fields = status_out.strip().split('|') - phase = fields[0].strip() if fields else '' - is_terminating = len(fields) > 1 and bool(fields[1].strip()) - - # Pod is genuinely Running and NOT being deleted — recovery complete. - if status_rc == 0 and phase == 'Running' and not is_terminating: - break - - # Pod no longer exists, OR it exists but is being terminated (Terminating - # state or deletionTimestamp set) — look for a replacement pod by label. - pod_gone_or_terminating = ( - (status_rc != 0 and 'not found' in (status_out + status_err).lower()) - or is_terminating - ) - if pod_gone_or_terminating: - label_out, _, label_rc = kubectl.RunKubectlCommand( - ['get', 'pods', '-n', _DS_NAMESPACE, - '-l', f'app={_DS_LABEL}', - '-o', 'jsonpath={range .items[?(@.status.phase=="Running")]}' - '{.metadata.name}{"\\n"}{end}'], - raise_on_failure=False, timeout=30, - ) - new_pods = [p.strip() for p in label_out.strip().splitlines() if p.strip() - and p.strip() != pod] # exclude the dying pod - if label_rc == 0 and new_pods: - recovered_pod = new_pods[0] - logging.info('[swap_encryption] Original pod %s gone/terminating; ' - 'found replacement %s', pod, recovered_pod) - break + """Wait for a DaemonSet container to recover after OOM kill or eviction. + + Handles two scenarios: + 1. Container OOM restart: same pod name, container restarting in place. + DaemonSet restartPolicy=Always brings it back under the same pod name. + 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates + a new pod with a DIFFERENT name. We detect this by checking whether + the named pod still exists; if not, we search by the DaemonSet label + selector for a Running pod. + + Returns the (possibly new) pod name once it is Running and ready. + """ + deadline = time.time() + timeout_sec + logging.info( + "[swap_encryption] Waiting for pod %s to recover (up to %ds)...", + pod, + timeout_sec, + ) - time.sleep(10) - else: - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] No Running pod found (original: {pod}) ' - f'within {timeout_sec}s after OOM kill / eviction') + # Phase 1: wait for a Running pod — either the named one (container + # restart) or a replacement pod found via label selector (eviction). + # + # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a + # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp + # (the pod is "Terminating") while status.phase may still read "Running" for + # several seconds. Checking only status.phase causes a false-positive: we + # return the old pod name immediately and every subsequent command fails with + # "Error from server (NotFound)". Checking deletionTimestamp catches this. + recovered_pod = pod + while time.time() < deadline: + # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not + # stdout. When the pod is gone, status_out is empty and the error text + # lives entirely in status_err. Discarding stderr (using _) means the + # 'not found' check below never fires and we spin until deadline. + status_out, status_err, status_rc = kubectl.RunKubectlCommand( + [ + "get", + "pod", + pod, + "-n", + _DS_NAMESPACE, + "-o", + "jsonpath={.status.phase}|{.metadata.deletionTimestamp}", + ], + raise_on_failure=False, + timeout=30, + ) + # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) + fields = status_out.strip().split("|") + phase = fields[0].strip() if fields else "" + is_terminating = len(fields) > 1 and bool(fields[1].strip()) + + # Pod is genuinely Running and NOT being deleted — recovery complete. + if status_rc == 0 and phase == "Running" and not is_terminating: + break + + # Pod no longer exists, OR it exists but is being terminated (Terminating + # state or deletionTimestamp set) — look for a replacement pod by label. + pod_gone_or_terminating = ( + status_rc != 0 and "not found" in (status_out + status_err).lower() + ) or is_terminating + if pod_gone_or_terminating: + label_out, _, label_rc = kubectl.RunKubectlCommand( + [ + "get", + "pods", + "-n", + _DS_NAMESPACE, + "-l", + f"app={_DS_LABEL}", + "-o", + ( + 'jsonpath={range .items[?(@.status.phase=="Running")]}' + '{.metadata.name}{"\\n"}{end}' + ), + ], + raise_on_failure=False, + timeout=30, + ) + new_pods = [ + p.strip() + for p in label_out.strip().splitlines() + if p.strip() and p.strip() != pod + ] # exclude the dying pod + if label_rc == 0 and new_pods: + recovered_pod = new_pods[0] + logging.info( + "[swap_encryption] Original pod %s gone/terminating; " + "found replacement %s", + pod, + recovered_pod, + ) + break + + time.sleep(10) + else: + raise errors.VmUtil.IssueCommandError( + f"[swap_encryption] No Running pod found (original: {pod}) " + f"within {timeout_sec}s after OOM kill / eviction" + ) + + # Phase 2: wait for init script to finish (sentinel written last). + while time.time() < deadline: + ready_out, _, ready_rc = kubectl.RunKubectlCommand( + [ + "exec", + recovered_pod, + "-n", + _DS_NAMESPACE, + "--", + "bash", + "-c", + "test -f /tmp/pkb_ready && echo READY", + ], + raise_on_failure=False, + timeout=30, + ) + if ready_rc == 0 and "READY" in ready_out: + logging.info( + "[swap_encryption] Pod %s recovered and ready", recovered_pod + ) + return recovered_pod + time.sleep(15) - # Phase 2: wait for init script to finish (sentinel written last). - while time.time() < deadline: - ready_out, _, ready_rc = kubectl.RunKubectlCommand( - ['exec', recovered_pod, '-n', _DS_NAMESPACE, - '--', 'bash', '-c', 'test -f /tmp/pkb_ready && echo READY'], - raise_on_failure=False, timeout=30, + raise errors.VmUtil.IssueCommandError( + f"[swap_encryption] Pod {recovered_pod} did not become ready " + f"within {timeout_sec}s after OOM kill / eviction" ) - if ready_rc == 0 and 'READY' in ready_out: - logging.info('[swap_encryption] Pod %s recovered and ready', recovered_pod) - return recovered_pod - time.sleep(15) - - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] Pod {recovered_pod} did not become ready ' - f'within {timeout_sec}s after OOM kill / eviction') def _detect_cloud(pod: str) -> str: - """Detect GCP vs AWS from DMI product info exposed via /sys hostPath mount. - - DMI is the most reliable in-container detection method because it reads - directly from the host kernel's SMBIOS table via /sys (already mounted). - It avoids HTTP metadata endpoint quoting issues and network timeouts. - - Falls back to metadata HTTP endpoints if DMI is inconclusive. - """ - # Primary: DMI product name / vendor (available via /sys hostPath mount) - dmi_out, _ = _pod_exec( - pod, - 'cat /sys/class/dmi/id/sys_vendor /sys/class/dmi/id/product_name ' - '/sys/class/dmi/id/bios_vendor 2>/dev/null || echo ""', - ignore_failure=True, - ) - dmi = dmi_out.strip().lower() - if 'google' in dmi: - logging.info( - '[swap_encryption] Cloud detected via DMI: gcp (%s)', dmi_out.strip()) - return 'gcp' - if any(k in dmi for k in ('amazon', 'ec2', 'aws')): - logging.info( - '[swap_encryption] Cloud detected via DMI: aws (%s)', dmi_out.strip()) - return 'aws' - - # Secondary: GCP metadata endpoint. - # Use -H with no space after colon to avoid shell-quoting issues through - # the kubectl exec → bash -c pipeline. - gcp_out, _ = _pod_exec( - pod, - 'curl -s -m 3 ' - 'http://metadata.google.internal/computeMetadata/v1/instance/zone ' - '-H Metadata-Flavor:Google 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_out.strip(): - logging.info('[swap_encryption] Cloud detected via metadata: gcp') - return 'gcp' - - # Tertiary: AWS IMDS (IMDSv2 token-based; IMDSv1 is often disabled). - aws_out, _ = _pod_exec( - pod, - 'T=$(curl -s -m 3 -X PUT ' - 'http://169.254.169.254/latest/api/token ' - '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null); ' - 'curl -s -m 3 -H "X-aws-ec2-metadata-token: $T" ' - 'http://169.254.169.254/latest/meta-data/instance-id ' - '2>/dev/null || echo ""', - ignore_failure=True, - ) - if aws_out.strip(): - logging.info('[swap_encryption] Cloud detected via IMDS: aws') - return 'aws' - - logging.warning( - '[swap_encryption] Could not detect cloud from DMI or metadata') - return 'unknown' + """Detect GCP vs AWS from DMI product info exposed via /sys hostPath mount. + DMI is the most reliable in-container detection method because it reads + directly from the host kernel's SMBIOS table via /sys (already mounted). + It avoids HTTP metadata endpoint quoting issues and network timeouts. -def _setup_gke_swap(pod: str) -> None: - """Configure dm-crypt swap on the GKE node, mirroring go/node:swap-encryption. - - GKE nodes use dm-crypt with an ephemeral random key so that swap contents - are encrypted at rest without requiring persistent key management. - We replicate this exactly using cryptsetup in plain mode (no LUKS header). - """ - swap_type = _SWAP_TYPE.value - if swap_type == 'auto': - # Check whether Local SSDs are present - lssd_out, _ = _pod_exec( + Falls back to metadata HTTP endpoints if DMI is inconclusive. + """ + # Primary: DMI product name / vendor (available via /sys hostPath mount) + dmi_out, _ = _pod_exec( pod, - "lsblk -d -o NAME,MODEL | grep -i 'local\\|nvme' | " - "grep -v 'nvme0' | awk '{print $1}' | head -1", + "cat /sys/class/dmi/id/sys_vendor /sys/class/dmi/id/product_name " + '/sys/class/dmi/id/bios_vendor 2>/dev/null || echo ""', ignore_failure=True, ) - swap_type = 'lssd' if lssd_out.strip() else 'hyperdisk' + dmi = dmi_out.strip().lower() + if "google" in dmi: + logging.info( + "[swap_encryption] Cloud detected via DMI: gcp (%s)", + dmi_out.strip(), + ) + return "gcp" + if any(k in dmi for k in ("amazon", "ec2", "aws")): + logging.info( + "[swap_encryption] Cloud detected via DMI: aws (%s)", + dmi_out.strip(), + ) + return "aws" + + # Secondary: GCP metadata endpoint. + # Use -H with no space after colon to avoid shell-quoting issues through + # the kubectl exec → bash -c pipeline. + gcp_out, _ = _pod_exec( + pod, + "curl -s -m 3 " + "http://metadata.google.internal/computeMetadata/v1/instance/zone " + '-H Metadata-Flavor:Google 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_out.strip(): + logging.info("[swap_encryption] Cloud detected via metadata: gcp") + return "gcp" - if swap_type == 'lssd': - _setup_gke_lssd_swap(pod) - elif swap_type == 'boot_disk': - _setup_gke_bootdisk_swap(pod) - else: - _setup_gke_hyperdisk_swap(pod) + # Tertiary: AWS IMDS (IMDSv2 token-based; IMDSv1 is often disabled). + aws_out, _ = _pod_exec( + pod, + "T=$(curl -s -m 3 -X PUT " + "http://169.254.169.254/latest/api/token " + '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null); ' + 'curl -s -m 3 -H "X-aws-ec2-metadata-token: $T" ' + "http://169.254.169.254/latest/meta-data/instance-id " + '2>/dev/null || echo ""', + ignore_failure=True, + ) + if aws_out.strip(): + logging.info("[swap_encryption] Cloud detected via IMDS: aws") + return "aws" + + logging.warning( + "[swap_encryption] Could not detect cloud from DMI or metadata" + ) + return "unknown" + + +def _setup_gke_swap(pod: str) -> None: + """Configure dm-crypt swap on the GKE node, mirroring go/node:swap-encryption. + + GKE nodes use dm-crypt with an ephemeral random key so that swap contents + are encrypted at rest without requiring persistent key management. + We replicate this exactly using cryptsetup in plain mode (no LUKS header). + """ + swap_type = _SWAP_TYPE.value + if swap_type == "auto": + # Check whether Local SSDs are present + lssd_out, _ = _pod_exec( + pod, + "lsblk -d -o NAME,MODEL | grep -i 'local\\|nvme' | " + "grep -v 'nvme0' | awk '{print $1}' | head -1", + ignore_failure=True, + ) + swap_type = "lssd" if lssd_out.strip() else "hyperdisk" + + if swap_type == "lssd": + _setup_gke_lssd_swap(pod) + elif swap_type == "boot_disk": + _setup_gke_bootdisk_swap(pod) + else: + _setup_gke_hyperdisk_swap(pod) def _setup_gke_hyperdisk_swap(pod: str) -> None: - """Configure dm-crypt swap on hyperdisk-balanced (GKE default). - - Disk detection is split into two separate commands so that the boot-device - name is resolved first and then substituted as a literal string — nested - $() expansions inside a kubectl exec bash -c argument are unreliable. - - If no dedicated data disk is attached (single-disk node) dm-crypt is set up - over a loop device backed by a file on the boot hyperdisk, which still - exercises the full encryption path on the same storage tier. - """ - logging.info('[swap_encryption] GKE: setting up dm-crypt on hyperdisk') - - # Step 1: identify the boot device name (e.g. "nvme0n1", "sda") - boot_out, _ = _pod_exec( - pod, - 'lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1', - ignore_failure=True, - ) - boot_base = boot_out.strip() or 'nvme0n1' - logging.info('[swap_encryption] GKE: boot device: %s', boot_base) - - # Step 2: find a non-boot disk using the literal name from step 1 - disk_out, _ = _pod_exec( - pod, - f"lsblk -d -o NAME,TYPE | awk '$2==\"disk\"{{print $1}}' " - f"| grep -v '^{boot_base}$' | head -1", - ignore_failure=True, - ) - disk_name = disk_out.strip() - - if not disk_name: + """Configure dm-crypt swap on hyperdisk-balanced (GKE default). + + Disk detection is split into two separate commands so that the boot-device + name is resolved first and then substituted as a literal string — nested + $() expansions inside a kubectl exec bash -c argument are unreliable. + + If no dedicated data disk is attached (single-disk node) dm-crypt is set up + over a loop device backed by a file on the boot hyperdisk, which still + exercises the full encryption path on the same storage tier. + """ + logging.info("[swap_encryption] GKE: setting up dm-crypt on hyperdisk") + + # Step 1: identify the boot device name (e.g. "nvme0n1", "sda") + boot_out, _ = _pod_exec( + pod, + 'lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1', + ignore_failure=True, + ) + boot_base = boot_out.strip() or "nvme0n1" + logging.info("[swap_encryption] GKE: boot device: %s", boot_base) + + # Step 2: find a non-boot disk using the literal name from step 1 + disk_out, _ = _pod_exec( + pod, + "lsblk -d -o NAME,TYPE | awk '$2==\"disk\"{print $1}' " + f"| grep -v '^{boot_base}$' | head -1", + ignore_failure=True, + ) + disk_name = disk_out.strip() + + if not disk_name: + logging.info( + "[swap_encryption] No dedicated data disk found – " + "falling back to loop device on /mnt/stateful_partition " + "(direct-io=on, dm-crypt=%s)", + _ENABLE_DMCRYPT.value, + ) + _setup_gke_loop_device_swap(pod) + return + + disk = f"/dev/{disk_name}" logging.info( - '[swap_encryption] No dedicated data disk found – ' - 'falling back to loop device on /mnt/stateful_partition ' - '(direct-io=on, dm-crypt=%s)', _ENABLE_DMCRYPT.value) - _setup_gke_loop_device_swap(pod) - return - - disk = f'/dev/{disk_name}' - logging.info('[swap_encryption] GKE: swap target disk: %s dmcrypt=%s', - disk, _ENABLE_DMCRYPT.value) - - # Clean up any stale mapping from a previous failed run. - _pod_exec(pod, textwrap.dedent(f""" + "[swap_encryption] GKE: swap target disk: %s dmcrypt=%s", + disk, + _ENABLE_DMCRYPT.value, + ) + + # Clean up any stale mapping from a previous failed run. + _pod_exec( + pod, + textwrap.dedent(f""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true wipefs -a {disk} 2>/dev/null || true - """), ignore_failure=True) + """), + ignore_failure=True, + ) - if _ENABLE_DMCRYPT.value: - # We cannot use cryptsetup open from inside a container because - # libdevmapper calls dm_udev_wait() after creating the target, which - # blocks on /run/udev/control. That socket belongs to udevd which is - # not running inside the container — so cryptsetup hangs forever. - # - # Instead we drive dmsetup directly with --noudevrules --noudevsync, - # which skips all udev synchronisation, and call dmsetup mknodes to - # ensure /dev/mapper/swap_encrypted appears without udev. - # - # insmod (not modprobe) loads the kernel module: modprobe also talks to - # systemd-udevd and can deadlock from a container for the same reason. - _pod_exec(pod, textwrap.dedent(f""" + if _ENABLE_DMCRYPT.value: + # We cannot use cryptsetup open from inside a container because + # libdevmapper calls dm_udev_wait() after creating the target, which + # blocks on /run/udev/control. That socket belongs to udevd which is + # not running inside the container — so cryptsetup hangs forever. + # + # Instead we drive dmsetup directly with --noudevrules --noudevsync, + # which skips all udev synchronisation, and call dmsetup mknodes to + # ensure /dev/mapper/swap_encrypted appears without udev. + # + # insmod (not modprobe) loads the kernel module: modprobe also talks to + # systemd-udevd and can deadlock from a container for the same reason. + _pod_exec( + pod, + textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true @@ -1729,63 +2095,72 @@ def _setup_gke_hyperdisk_swap(pod: str) -> None: dmsetup mknodes swap_encrypted 2>/dev/null || true mkswap /dev/mapper/swap_encrypted swapon /dev/mapper/swap_encrypted - """)) - logging.info('[swap_encryption] GKE: dm-crypt swap active on ' - '/dev/mapper/swap_encrypted') - else: - # Encryption-disabled column of the test matrix - _pod_exec(pod, textwrap.dedent(f""" + """), + ) + logging.info( + "[swap_encryption] GKE: dm-crypt swap active on " + "/dev/mapper/swap_encrypted" + ) + else: + # Encryption-disabled column of the test matrix + _pod_exec( + pod, + textwrap.dedent(f""" mkswap {disk} && \\ swapon {disk} - """)) - logging.info('[swap_encryption] GKE: plain (unencrypted) swap active ' - 'on %s', disk) + """), + ) + logging.info( + "[swap_encryption] GKE: plain (unencrypted) swap active on %s", disk + ) def _setup_gke_loop_device_swap(pod: str) -> None: - """Plain loop-device swap for single-disk GKE nodes (no dedicated swap disk). - - Used when _setup_gke_hyperdisk_swap finds no dedicated second disk (e.g. - n2-highmem-32 / n4-highmem-32 single-boot-disk nodes, regardless of image - type). - - dm-crypt is skipped on this path for two reasons: - 1. On COS (Container-Optimised OS): the device-mapper kernel subsystem is - inaccessible from inside a Kubernetes pod (even privileged). Calls to - cryptsetup/dmsetup block indefinitely and are killed by the PKB timeout. - This is a deliberate COS security restriction, not a permissions issue. - 2. On UBUNTU_CONTAINERD: the loop device is created in the container - namespace; its behaviour under nsenter (needed for dm-crypt on dedicated - disks) is untested, so plain loop swap is used for safety. - For dedicated block devices (hyperdisk, LSSD) nsenter into the host mount - namespace works around the COS restriction (see _setup_gke_hyperdisk_swap). - The loop device path skips dm-crypt on all image types; plain loop swap is - used instead. - - Therefore this path uses a plain loop device as swap without dm-crypt. - Phase 1 (fio) is skipped for plain loop devices — the goal is enc-on vs - enc-off comparison, and fio on a plain loop device measures the backing - filesystem rather than the swap stack. Tiers 2–6 (stress-ng, Redis, - kernel build, OpenSearch) run normally. - - For dm-crypt measurement on GCP use a machine type with local NVMe (LSSD) - or provision a dedicated hyperdisk on a second disk slot (n4-highmem-32+). - - Improvements over the old /var path: - - Backing file on /mnt/stateful_partition (ext4), not the container - overlayfs — avoids overlayfs O_DIRECT limitation. - - losetup --direct-io=on passes I/O through to the host ext4, reducing - double-buffering for Tiers 2–6 workloads. - """ - size_gb = _SWAP_SIZE_GB.value - # /mnt/stateful_partition is ext4 on COS (mounted from the stateful - # partition of the node's persistent disk). It is NOT the container - # overlay filesystem and is mounted into the pod via the DaemonSet - # hostPath volume. - backing = '/mnt/stateful_partition/pkb_swap_backing' - - # ── Step 0: detach any stale loop device from a previous failed run ─────── - _pod_exec(pod, textwrap.dedent(f""" + """Plain loop-device swap for single-disk GKE nodes (no dedicated swap disk). + + Used when _setup_gke_hyperdisk_swap finds no dedicated second disk (e.g. + n2-highmem-32 / n4-highmem-32 single-boot-disk nodes, regardless of image + type). + + dm-crypt is skipped on this path for two reasons: + 1. On COS (Container-Optimised OS): the device-mapper kernel subsystem is + inaccessible from inside a Kubernetes pod (even privileged). Calls to + cryptsetup/dmsetup block indefinitely and are killed by the PKB timeout. + This is a deliberate COS security restriction, not a permissions issue. + 2. On UBUNTU_CONTAINERD: the loop device is created in the container + namespace; its behaviour under nsenter (needed for dm-crypt on dedicated + disks) is untested, so plain loop swap is used for safety. + For dedicated block devices (hyperdisk, LSSD) nsenter into the host mount + namespace works around the COS restriction (see _setup_gke_hyperdisk_swap). + The loop device path skips dm-crypt on all image types; plain loop swap is + used instead. + + Therefore this path uses a plain loop device as swap without dm-crypt. + Phase 1 (fio) is skipped for plain loop devices — the goal is enc-on vs + enc-off comparison, and fio on a plain loop device measures the backing + filesystem rather than the swap stack. Tiers 2–6 (stress-ng, Redis, + kernel build, OpenSearch) run normally. + + For dm-crypt measurement on GCP use a machine type with local NVMe (LSSD) + or provision a dedicated hyperdisk on a second disk slot (n4-highmem-32+). + + Improvements over the old /var path: + - Backing file on /mnt/stateful_partition (ext4), not the container + overlayfs — avoids overlayfs O_DIRECT limitation. + - losetup --direct-io=on passes I/O through to the host ext4, reducing + double-buffering for Tiers 2–6 workloads. + """ + size_gb = _SWAP_SIZE_GB.value + # /mnt/stateful_partition is ext4 on COS (mounted from the stateful + # partition of the node's persistent disk). It is NOT the container + # overlay filesystem and is mounted into the pod via the DaemonSet + # hostPath volume. + backing = "/mnt/stateful_partition/pkb_swap_backing" + + # ── Step 0: detach any stale loop device from a previous failed run ─────── + _pod_exec( + pod, + textwrap.dedent(f""" losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | \ while read dev do @@ -1793,71 +2168,88 @@ def _setup_gke_loop_device_swap(pod: str) -> None: losetup -d "$dev" 2>/dev/null || true done rm -f {backing} - """), ignore_failure=True) - - # ── Step 1: allocate backing file on stateful partition (ext4) ─────────── - logging.info( - '[swap_encryption] GKE: creating %dG backing file on stateful_partition', - size_gb) - # fallocate preallocates real ext4 blocks (avoids fragmentation during swap - # I/O); truncate is the sparse fallback for filesystems where fallocate - # fails. - _pod_exec(pod, textwrap.dedent(f""" + """), + ignore_failure=True, + ) + + # ── Step 1: allocate backing file on stateful partition (ext4) ─────────── + logging.info( + "[swap_encryption] GKE: creating %dG backing file on" + " stateful_partition", + size_gb, + ) + # fallocate preallocates real ext4 blocks (avoids fragmentation during swap + # I/O); truncate is the sparse fallback for filesystems where fallocate + # fails. + _pod_exec( + pod, + textwrap.dedent(f""" fallocate -l {size_gb}G {backing} 2>/dev/null || \\ truncate -s {size_gb}G {backing} - """)) + """), + ) - # ── Step 2: loop device with direct-io passthrough ─────────────────────── - # --direct-io=on lets the loop driver pass O_DIRECT to the host ext4, - # reducing double-buffering for workload I/O (kernel 5.x+, present on - # GKE COS ≥ 1.29). - loop_out, _ = _pod_exec(pod, textwrap.dedent(f""" + # ── Step 2: loop device with direct-io passthrough ─────────────────────── + # --direct-io=on lets the loop driver pass O_DIRECT to the host ext4, + # reducing double-buffering for workload I/O (kernel 5.x+, present on + # GKE COS ≥ 1.29). + loop_out, _ = _pod_exec( + pod, + textwrap.dedent(f""" LOOP=$(losetup -f) && \\ losetup --direct-io=on "$LOOP" {backing} && \\ echo "$LOOP" - """)) - loop_dev = loop_out.strip() - if not loop_dev.startswith('/dev/loop'): - raise RuntimeError( - f'[swap_encryption] losetup failed – output: {loop_out!r}' - ) - logging.info('[swap_encryption] GKE: loop device: %s direct-io=on', loop_dev) - - # ── Step 3: plain mkswap + swapon (dm-crypt skipped on loop devices) ──────── - _pod_exec(pod, f'mkswap {loop_dev}') - _pod_exec(pod, f'swapon {loop_dev}') - logging.warning( - '[swap_encryption] GKE: plain loop swap active on %s ' - '(dm-crypt unavailable from COS pod — device-mapper is blocked by ' - 'COS kernel namespace restrictions). ' - 'Phase 1 (fio) will be skipped. ' - 'Use a machine with LSSD (c4-*-lssd) or attach a dedicated second ' - 'hyperdisk for dm-crypt measurement.', - loop_dev, - ) + """), + ) + loop_dev = loop_out.strip() + if not loop_dev.startswith("/dev/loop"): + raise RuntimeError( + f"[swap_encryption] losetup failed – output: {loop_out!r}" + ) + logging.info( + "[swap_encryption] GKE: loop device: %s direct-io=on", loop_dev + ) + + # ── Step 3: plain mkswap + swapon (dm-crypt skipped on loop devices) ──────── + _pod_exec(pod, f"mkswap {loop_dev}") + _pod_exec(pod, f"swapon {loop_dev}") + logging.warning( + "[swap_encryption] GKE: plain loop swap active on %s " + "(dm-crypt unavailable from COS pod — device-mapper is blocked by " + "COS kernel namespace restrictions). " + "Phase 1 (fio) will be skipped. " + "Use a machine with LSSD (c4-*-lssd) or attach a dedicated second " + "hyperdisk for dm-crypt measurement.", + loop_dev, + ) def _setup_gke_bootdisk_swap(pod: str) -> None: - """Swap on the OS BOOT disk — methodology Table 0 rows 1-4. - - Creates a loop-backed swap file on /mnt/stateful_partition (the node's boot - disk, whose type — pd-balanced or hyperdisk-balanced — is chosen at - nodepool-creation time via --swap_encryption_boot_disk_type). dm-crypt is - layered on the loop device when --swap_encryption_enable_dmcrypt is set - (encryption-on rows 2/4); otherwise plain swap is used (encryption-off rows - 1/3). - - Reuses the same loop-creation and dmsetup patterns as the LSSD/hyperdisk - paths — no shared provider module is touched. Requires an Ubuntu node image - (dm-crypt from a pod is blocked on COS). - """ - size_gb = _SWAP_SIZE_GB.value - backing = '/mnt/stateful_partition/pkb_swap_backing' - logging.info('[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)', - size_gb, _ENABLE_DMCRYPT.value) - - # Clean up any stale loop/mapping from a previous run. - _pod_exec(pod, textwrap.dedent(f""" + """Swap on the OS BOOT disk — methodology Table 0 rows 1-4. + + Creates a loop-backed swap file on /mnt/stateful_partition (the node's boot + disk, whose type — pd-balanced or hyperdisk-balanced — is chosen at + nodepool-creation time via --swap_encryption_boot_disk_type). dm-crypt is + layered on the loop device when --swap_encryption_enable_dmcrypt is set + (encryption-on rows 2/4); otherwise plain swap is used (encryption-off rows + 1/3). + + Reuses the same loop-creation and dmsetup patterns as the LSSD/hyperdisk + paths — no shared provider module is touched. Requires an Ubuntu node image + (dm-crypt from a pod is blocked on COS). + """ + size_gb = _SWAP_SIZE_GB.value + backing = "/mnt/stateful_partition/pkb_swap_backing" + logging.info( + "[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)", + size_gb, + _ENABLE_DMCRYPT.value, + ) + + # Clean up any stale loop/mapping from a previous run. + _pod_exec( + pod, + textwrap.dedent(f""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | while read d @@ -1866,24 +2258,37 @@ def _setup_gke_bootdisk_swap(pod: str) -> None: losetup -d "$d" 2>/dev/null || true done rm -f {backing} - """), ignore_failure=True) + """), + ignore_failure=True, + ) - # Allocate the backing file on the boot-disk ext4 stateful partition. - _pod_exec(pod, textwrap.dedent(f""" + # Allocate the backing file on the boot-disk ext4 stateful partition. + _pod_exec( + pod, + textwrap.dedent(f""" fallocate -l {size_gb}G {backing} 2>/dev/null || truncate -s {size_gb}G {backing} - """)) + """), + ) - loop_out, _ = _pod_exec(pod, textwrap.dedent(f""" + loop_out, _ = _pod_exec( + pod, + textwrap.dedent(f""" LOOP=$(losetup -f) && losetup --direct-io=on "$LOOP" {backing} && echo "$LOOP" - """)) - loop_dev = loop_out.strip().splitlines()[-1].strip() if loop_out.strip() else '' - if not loop_dev.startswith('/dev/loop'): - raise RuntimeError( - f'[swap_encryption] boot-disk losetup failed: {loop_out!r}') - logging.info('[swap_encryption] GKE: boot-disk loop device: %s', loop_dev) - - if _ENABLE_DMCRYPT.value: - _pod_exec(pod, textwrap.dedent(f""" + """), + ) + loop_dev = ( + loop_out.strip().splitlines()[-1].strip() if loop_out.strip() else "" + ) + if not loop_dev.startswith("/dev/loop"): + raise RuntimeError( + f"[swap_encryption] boot-disk losetup failed: {loop_out!r}" + ) + logging.info("[swap_encryption] GKE: boot-disk loop device: %s", loop_dev) + + if _ENABLE_DMCRYPT.value: + _pod_exec( + pod, + textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true @@ -1896,52 +2301,66 @@ def _setup_gke_bootdisk_swap(pod: str) -> None: dmsetup mknodes swap_encrypted 2>/dev/null || true mkswap /dev/mapper/swap_encrypted swapon /dev/mapper/swap_encrypted - """)) - logging.info('[swap_encryption] GKE: boot-disk dm-crypt swap active on ' - '/dev/mapper/swap_encrypted') - else: - _pod_exec(pod, textwrap.dedent(f""" + """), + ) + logging.info( + "[swap_encryption] GKE: boot-disk dm-crypt swap active on " + "/dev/mapper/swap_encrypted" + ) + else: + _pod_exec( + pod, + textwrap.dedent(f""" mkswap {loop_dev} && swapon {loop_dev} - """)) - logging.info('[swap_encryption] GKE: boot-disk plain swap active on %s', - loop_dev) + """), + ) + logging.info( + "[swap_encryption] GKE: boot-disk plain swap active on %s", loop_dev + ) def _setup_gke_lssd_swap(pod: str) -> None: - """Configure dm-crypt on LSSD RAID-0 array (go/gke-swap-lssd).""" - logging.info('[swap_encryption] GKE: setting up LSSD RAID-0 swap') - - # Reused-node hygiene: a previous run on this node may have left an ACTIVE - # dm-crypt swap (e.g. /dev/nvme0n1 └─swap_encrypted [SWAP]). That makes the - # LSSD look "unclean/busy" to the device selector below, which then wrongly - # falls back to the hyperdisk path and tries the boot disk. Tear down any - # prior PKB swap mapping FIRST so the underlying LSSD is freed and selectable. - _pod_exec(pod, textwrap.dedent(""" + """Configure dm-crypt on LSSD RAID-0 array (go/gke-swap-lssd).""" + logging.info("[swap_encryption] GKE: setting up LSSD RAID-0 swap") + + # Reused-node hygiene: a previous run on this node may have left an ACTIVE + # dm-crypt swap (e.g. /dev/nvme0n1 └─swap_encrypted [SWAP]). That makes the + # LSSD look "unclean/busy" to the device selector below, which then wrongly + # falls back to the hyperdisk path and tries the boot disk. Tear down any + # prior PKB swap mapping FIRST so the underlying LSSD is freed and selectable. + _pod_exec( + pod, + textwrap.dedent(""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true swapoff -a 2>/dev/null || true dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true - """), ignore_failure=True) - - # Log the full block-device topology up front for diagnosis (every prior - # swap failure traced back to picking the wrong device). - topo, _ = _pod_exec( - pod, 'lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null', - ignore_failure=True) - logging.info('[swap_encryption] block device topology:\n%s', - (topo or '').strip()) - - # Identify candidate swap devices = whole disks that are NOT the boot/OS - # disk. We must NOT rely on a device name (boot disk enumerates as nvme0n1 - # on some nodes, nvme1n1 on others) and we cannot use `findmnt /` because the - # container root is an overlay. Instead we EXCLUDE any disk that: - # * has partition children (boot disk has p1/p14/p15/p16), or - # * has any mounted filesystem (itself or a child). - # A raw local SSD intended for swap has neither. This robustly prevents the - # catastrophic bug where the 100 GB boot disk (root mounted) was RAIDed into - # the swap device, yielding a non-functional swap (fio empty + stress OOM). - lssd_out, _ = _pod_exec( - pod, - textwrap.dedent(""" + """), + ignore_failure=True, + ) + + # Log the full block-device topology up front for diagnosis (every prior + # swap failure traced back to picking the wrong device). + topo, _ = _pod_exec( + pod, + "lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null", + ignore_failure=True, + ) + logging.info( + "[swap_encryption] block device topology:\n%s", (topo or "").strip() + ) + + # Identify candidate swap devices = whole disks that are NOT the boot/OS + # disk. We must NOT rely on a device name (boot disk enumerates as nvme0n1 + # on some nodes, nvme1n1 on others) and we cannot use `findmnt /` because the + # container root is an overlay. Instead we EXCLUDE any disk that: + # * has partition children (boot disk has p1/p14/p15/p16), or + # * has any mounted filesystem (itself or a child). + # A raw local SSD intended for swap has neither. This robustly prevents the + # catastrophic bug where the 100 GB boot disk (root mounted) was RAIDed into + # the swap device, yielding a non-functional swap (fio empty + stress OOM). + lssd_out, _ = _pod_exec( + pod, + textwrap.dedent(""" for d in $(lsblk -dno NAME,ROTA | awk '$2==0{print $1}') do if lsblk -no TYPE "/dev/$d" 2>/dev/null | grep -q '^part$'; then @@ -1953,33 +2372,41 @@ def _setup_gke_lssd_swap(pod: str) -> None: echo "/dev/$d" done """), - ignore_failure=True, - ) - devices = [d.strip() for d in lssd_out.strip().splitlines() if d.strip()] - if not devices: - logging.warning( - '[swap_encryption] No clean (unpartitioned, unmounted) local SSD found ' - '— falling back to hyperdisk swap path') - _setup_gke_hyperdisk_swap(pod) - return - - device_list = ' '.join(devices) - n = len(devices) - logging.info('[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): ' - '%s dmcrypt=%s', n, device_list, _ENABLE_DMCRYPT.value) - - # Clean up stale mappings, RAID arrays, and GKE-managed mounts. - # - # GKE UBUNTU nodes run google-ssd-startup.service at boot which formats - # local NVMe SSDs as ext4 and mounts them at /mnt/disks/ssd0 etc. even - # when --local-nvme-ssd-block is set. The mount makes the block device - # busy so mdadm/wipefs fail silently (we had || true). We must unmount - # those paths first. /proc-host/mounts reflects the host mount table - # (hostPID:true + privileged gives us access). - # - # pkb_swap is the dm-crypt device created by the node startup script (for - # single-LSSD nodes it holds /dev/nvme1n1 directly without an md0 layer). - _pod_exec(pod, textwrap.dedent(f""" + ignore_failure=True, + ) + devices = [d.strip() for d in lssd_out.strip().splitlines() if d.strip()] + if not devices: + logging.warning( + "[swap_encryption] No clean (unpartitioned, unmounted) local SSD" + " found — falling back to hyperdisk swap path" + ) + _setup_gke_hyperdisk_swap(pod) + return + + device_list = " ".join(devices) + n = len(devices) + logging.info( + "[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): " + "%s dmcrypt=%s", + n, + device_list, + _ENABLE_DMCRYPT.value, + ) + + # Clean up stale mappings, RAID arrays, and GKE-managed mounts. + # + # GKE UBUNTU nodes run google-ssd-startup.service at boot which formats + # local NVMe SSDs as ext4 and mounts them at /mnt/disks/ssd0 etc. even + # when --local-nvme-ssd-block is set. The mount makes the block device + # busy so mdadm/wipefs fail silently (we had || true). We must unmount + # those paths first. /proc-host/mounts reflects the host mount table + # (hostPID:true + privileged gives us access). + # + # pkb_swap is the dm-crypt device created by the node startup script (for + # single-LSSD nodes it holds /dev/nvme1n1 directly without an md0 layer). + _pod_exec( + pod, + textwrap.dedent(f""" echo "[pkb-lssd-cleanup] /proc/mdstat:" >&2 cat /proc/mdstat 2>/dev/null || true echo "[pkb-lssd-cleanup] dmsetup ls:" >&2 @@ -2035,18 +2462,20 @@ def _setup_gke_lssd_swap(pod: str) -> None: losetup -D 2>/dev/null || true rm -f /mnt/stateful_partition/pkb_swap.img 2>/dev/null || true sleep 2 - """), ignore_failure=True) - - # Step 3: verify the devices are truly raw (unpartitioned). On GKE Ubuntu - # nodes the local NVMe device may be partitioned by node startup scripts - # even when --local-nvme-ssd-block is specified. The kernel refuses a - # whole-disk exclusive open (DM_TABLE_LOAD → EBUSY) when any partition of - # the disk is open by another process (e.g. the container overlay FS is - # backed by nvme1n1p1). Detect this and fall back to a loop device backed - # by a file on /mnt/stateful_partition (which IS the SSD partition). - raw_check_out, _ = _pod_exec( - pod, - textwrap.dedent(f""" + """), + ignore_failure=True, + ) + + # Step 3: verify the devices are truly raw (unpartitioned). On GKE Ubuntu + # nodes the local NVMe device may be partitioned by node startup scripts + # even when --local-nvme-ssd-block is specified. The kernel refuses a + # whole-disk exclusive open (DM_TABLE_LOAD → EBUSY) when any partition of + # the disk is open by another process (e.g. the container overlay FS is + # backed by nvme1n1p1). Detect this and fall back to a loop device backed + # by a file on /mnt/stateful_partition (which IS the SSD partition). + raw_check_out, _ = _pod_exec( + pod, + textwrap.dedent(f""" for dev in {device_list} do if lsblk -ln -o TYPE "$dev" 2>/dev/null | grep -q '^part$' @@ -2057,43 +2486,57 @@ def _setup_gke_lssd_swap(pod: str) -> None: fi done """), - ignore_failure=True, - ) - raw_devices = [d.strip() for d in raw_check_out.strip().splitlines() if d.strip()] + ignore_failure=True, + ) + raw_devices = [ + d.strip() for d in raw_check_out.strip().splitlines() if d.strip() + ] - if not raw_devices: + if not raw_devices: + logging.info( + "[swap_encryption] GKE: all LSSD devices are partitioned — " + "falling back to loop device on /mnt/stateful_partition" + ) + _setup_gke_lssd_stateful_loop_swap(pod) + return + + # Use only raw (unpartitioned) devices going forward. + devices = raw_devices + device_list = " ".join(devices) + n = len(devices) logging.info( - '[swap_encryption] GKE: all LSSD devices are partitioned — ' - 'falling back to loop device on /mnt/stateful_partition' - ) - _setup_gke_lssd_stateful_loop_swap(pod) - return - - # Use only raw (unpartitioned) devices going forward. - devices = raw_devices - device_list = ' '.join(devices) - n = len(devices) - logging.info('[swap_encryption] GKE: using %d raw LSSD device(s): %s ' - 'dmcrypt=%s', n, device_list, _ENABLE_DMCRYPT.value) - - # For N=1 LSSD, skip mdadm entirely and target the raw device directly. - # For N>1 we stripe across multiple NVMe devices. - if n > 1: - _pod_exec(pod, textwrap.dedent(f""" + "[swap_encryption] GKE: using %d raw LSSD device(s): %s dmcrypt=%s", + n, + device_list, + _ENABLE_DMCRYPT.value, + ) + + # For N=1 LSSD, skip mdadm entirely and target the raw device directly. + # For N>1 we stripe across multiple NVMe devices. + if n > 1: + _pod_exec( + pod, + textwrap.dedent(f""" mdadm --create /dev/md0 --force \\ --level=0 --raid-devices={n} \\ {device_list} test -b /dev/md0 || {{ echo "mdadm: /dev/md0 not created" >&2; exit 1; }} - """)) - swap_block_dev = '/dev/md0' - else: - swap_block_dev = devices[0] - logging.info('[swap_encryption] GKE: single LSSD — skipping mdadm, ' - 'using %s directly', swap_block_dev) - - if _ENABLE_DMCRYPT.value: - # Same dmsetup --noudevrules --noudevsync approach as _setup_gke_hyperdisk_swap. - _pod_exec(pod, textwrap.dedent(f""" + """), + ) + swap_block_dev = "/dev/md0" + else: + swap_block_dev = devices[0] + logging.info( + "[swap_encryption] GKE: single LSSD — skipping mdadm, " + "using %s directly", + swap_block_dev, + ) + + if _ENABLE_DMCRYPT.value: + # Same dmsetup --noudevrules --noudevsync approach as _setup_gke_hyperdisk_swap. + _pod_exec( + pod, + textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true @@ -2108,70 +2551,94 @@ def _setup_gke_lssd_swap(pod: str) -> None: dmsetup mknodes swap_encrypted 2>/dev/null || true mkswap /dev/mapper/swap_encrypted swapon /dev/mapper/swap_encrypted - """)) - logging.info('[swap_encryption] GKE: LSSD dm-crypt swap active on %s', - swap_block_dev) - else: - _pod_exec(pod, textwrap.dedent(f""" + """), + ) + logging.info( + "[swap_encryption] GKE: LSSD dm-crypt swap active on %s", + swap_block_dev, + ) + else: + _pod_exec( + pod, + textwrap.dedent(f""" mkswap {swap_block_dev} swapon {swap_block_dev} - """)) - logging.info('[swap_encryption] GKE: LSSD plain swap active on %s', - swap_block_dev) + """), + ) + logging.info( + "[swap_encryption] GKE: LSSD plain swap active on %s", + swap_block_dev, + ) def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: - """Set up swap on the LSSD partition via a loop device. - - Used when the local NVMe device is partitioned by GKE startup scripts - and cannot be opened as a whole raw block device (DM_TABLE_LOAD EBUSY). - The DaemonSet mounts /mnt/stateful_partition (hostPath) from the host's - nvme1n1p1 — which is still local SSD storage. We create a large file - there and layer loop → dm-crypt → swap on top of it. - """ - img_path = '/mnt/stateful_partition/pkb_swap.img' - - # Clean up any previous run artifacts. - _pod_exec(pod, textwrap.dedent(f""" + """Set up swap on the LSSD partition via a loop device. + + Used when the local NVMe device is partitioned by GKE startup scripts + and cannot be opened as a whole raw block device (DM_TABLE_LOAD EBUSY). + The DaemonSet mounts /mnt/stateful_partition (hostPath) from the host's + nvme1n1p1 — which is still local SSD storage. We create a large file + there and layer loop → dm-crypt → swap on top of it. + """ + img_path = "/mnt/stateful_partition/pkb_swap.img" + + # Clean up any previous run artifacts. + _pod_exec( + pod, + textwrap.dedent(f""" swapoff -a 2>/dev/null || true dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true losetup -D 2>/dev/null || true rm -f {img_path} 2>/dev/null || true - """), ignore_failure=True) - - # Determine file size: 80% of available space, at least 16 GB. - size_out, _ = _pod_exec( - pod, - f"df -P /mnt/stateful_partition | awk 'NR==2{{print $4}}'", - ignore_failure=True, - ) - avail_kb = int(size_out.strip() or '0') - swap_gb = max(16, int(avail_kb * 0.8 / 1024 / 1024)) - logging.info('[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s', - swap_gb, img_path) - - # Allocate file (fallocate is instant on ext4; dd fallback for others). - _pod_exec(pod, textwrap.dedent(f""" + """), + ignore_failure=True, + ) + + # Determine file size: 80% of available space, at least 16 GB. + size_out, _ = _pod_exec( + pod, + f"df -P /mnt/stateful_partition | awk 'NR==2{{print $4}}'", + ignore_failure=True, + ) + avail_kb = int(size_out.strip() or "0") + swap_gb = max(16, int(avail_kb * 0.8 / 1024 / 1024)) + logging.info( + "[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s", + swap_gb, + img_path, + ) + + # Allocate file (fallocate is instant on ext4; dd fallback for others). + _pod_exec( + pod, + textwrap.dedent(f""" fallocate -l {swap_gb}G {img_path} 2>/dev/null || \\ dd if=/dev/zero of={img_path} bs=1G count={swap_gb} chmod 600 {img_path} losetup --direct-io=on -f {img_path} - """), timeout=300) - - loop_out, _ = _pod_exec( - pod, - f"losetup -j {img_path} | awk -F: '{{print $1}}' | head -1", - ignore_failure=True, - ) - loop_dev = loop_out.strip() - if not loop_dev.startswith('/dev/loop'): - raise RuntimeError( - f'[swap_encryption] losetup failed for {img_path} — got: {loop_out!r}' - ) - logging.info('[swap_encryption] GKE: LSSD stateful-loop device: %s', loop_dev) - - if _ENABLE_DMCRYPT.value: - _pod_exec(pod, textwrap.dedent(f""" + """), + timeout=300, + ) + + loop_out, _ = _pod_exec( + pod, + f"losetup -j {img_path} | awk -F: '{{print $1}}' | head -1", + ignore_failure=True, + ) + loop_dev = loop_out.strip() + if not loop_dev.startswith("/dev/loop"): + raise RuntimeError( + f"[swap_encryption] losetup failed for {img_path} — got:" + f" {loop_out!r}" + ) + logging.info( + "[swap_encryption] GKE: LSSD stateful-loop device: %s", loop_dev + ) + + if _ENABLE_DMCRYPT.value: + _pod_exec( + pod, + textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true @@ -2186,259 +2653,313 @@ def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: dmsetup mknodes swap_encrypted 2>/dev/null || true mkswap /dev/mapper/swap_encrypted swapon /dev/mapper/swap_encrypted - """)) - logging.info('[swap_encryption] GKE: LSSD stateful-loop dm-crypt swap active ' - 'on %s → %s', img_path, loop_dev) - else: - _pod_exec(pod, textwrap.dedent(f""" + """), + ) + logging.info( + "[swap_encryption] GKE: LSSD stateful-loop dm-crypt swap active " + "on %s → %s", + img_path, + loop_dev, + ) + else: + _pod_exec( + pod, + textwrap.dedent(f""" mkswap {loop_dev} swapon {loop_dev} - """)) - logging.info('[swap_encryption] GKE: LSSD stateful-loop plain swap active ' - 'on %s → %s', img_path, loop_dev) + """), + ) + logging.info( + "[swap_encryption] GKE: LSSD stateful-loop plain swap active " + "on %s → %s", + img_path, + loop_dev, + ) -_IO2_VOLUME_ID = '' # set by _ensure_io2_volume; serial-based detection +_IO2_VOLUME_ID = "" # set by _ensure_io2_volume; serial-based detection def _ensure_io2_volume() -> None: - """Create + attach a dedicated io2 EBS volume to the benchmark node so the - io2 test-matrix row swaps on real io2 hardware-encrypted storage. - - No-op unless --swap_encryption_swap_type=io2 on an AWS/EKS cluster. - Best-effort: logs and returns on failure. Stashes the created volume id in - _IO2_VOLUME_ID for serial-based device detection in _setup_eks_io2_swap. - """ - global _IO2_VOLUME_ID - if _SWAP_TYPE.value != 'io2': - return - out, _, rc = kubectl.RunKubectlCommand( - ['get', 'nodes', '-o', 'jsonpath={.items[0].spec.providerID}'], - raise_on_failure=False, - ) - provider = (out or '').strip() # aws:///us-east-1a/i-0abc... - if rc != 0 or 'aws://' not in provider: - logging.warning( - '[swap_encryption] io2 attach skipped: could not resolve ' - 'EC2 instance from providerID=%r', provider) - return - parts = [p for p in provider.split('/') if p] - instance_id, az = parts[-1], parts[-2] - region = az[:-1] - base = ['aws', 'ec2', '--region', region] - try: - create_args = [ - 'create-volume', - '--volume-type', 'io2', - '--size', '500', - '--iops', '16000', - '--availability-zone', az, - '--tag-specifications', - 'ResourceType=volume,Tags=[{Key=pkb,Value=swap_encryption}]', - ] - if _IO2_ENCRYPTED.value: - create_args.append('--encrypted') - if _IO2_KMS_KEY_ID.value: - create_args += ['--kms-key-id', _IO2_KMS_KEY_ID.value] - logging.info( - '[swap_encryption] io2 volume will be EBS-encrypted ' - '(row: hardware encryption)') - else: - logging.info('[swap_encryption] io2 volume UNENCRYPTED (baseline row)') - create_args += ['--query', 'VolumeId', '--output', 'text'] - vol_id, _, vrc = vm_util.IssueCommand( - base + create_args, raise_on_failure=False) - vol_id = (vol_id or '').strip() - if vrc != 0 or not vol_id.startswith('vol-'): - logging.warning('[swap_encryption] io2 create-volume failed: %r', vol_id) - return - vm_util.IssueCommand( - base + ['wait', 'volume-available', '--volume-ids', vol_id], - raise_on_failure=False) - vm_util.IssueCommand( - base + [ - 'attach-volume', - '--volume-id', vol_id, - '--instance-id', instance_id, - '--device', '/dev/sdf', - ], - raise_on_failure=False) - vm_util.IssueCommand( - base + ['wait', 'volume-in-use', '--volume-ids', vol_id], - raise_on_failure=False) - _IO2_VOLUME_ID = vol_id - logging.info( - '[swap_encryption] Attached io2 volume %s to %s as /dev/sdf', - vol_id, instance_id) - time.sleep(15) # allow the NVMe device node to appear - except Exception as e: # pylint: disable=broad-except - logging.warning('[swap_encryption] io2 attach error (continuing): %s', e) + """Create + attach a dedicated io2 EBS volume to the benchmark node so the + io2 test-matrix row swaps on real io2 hardware-encrypted storage. + + No-op unless --swap_encryption_swap_type=io2 on an AWS/EKS cluster. + Best-effort: logs and returns on failure. Stashes the created volume id in + _IO2_VOLUME_ID for serial-based device detection in _setup_eks_io2_swap. + """ + global _IO2_VOLUME_ID + if _SWAP_TYPE.value != "io2": + return + out, _, rc = kubectl.RunKubectlCommand( + ["get", "nodes", "-o", "jsonpath={.items[0].spec.providerID}"], + raise_on_failure=False, + ) + provider = (out or "").strip() # aws:///us-east-1a/i-0abc... + if rc != 0 or "aws://" not in provider: + logging.warning( + "[swap_encryption] io2 attach skipped: could not resolve " + "EC2 instance from providerID=%r", + provider, + ) + return + parts = [p for p in provider.split("/") if p] + instance_id, az = parts[-1], parts[-2] + region = az[:-1] + base = ["aws", "ec2", "--region", region] + try: + create_args = [ + "create-volume", + "--volume-type", + "io2", + "--size", + "500", + "--iops", + "16000", + "--availability-zone", + az, + "--tag-specifications", + "ResourceType=volume,Tags=[{Key=pkb,Value=swap_encryption}]", + ] + if _IO2_ENCRYPTED.value: + create_args.append("--encrypted") + if _IO2_KMS_KEY_ID.value: + create_args += ["--kms-key-id", _IO2_KMS_KEY_ID.value] + logging.info( + "[swap_encryption] io2 volume will be EBS-encrypted " + "(row: hardware encryption)" + ) + else: + logging.info( + "[swap_encryption] io2 volume UNENCRYPTED (baseline row)" + ) + create_args += ["--query", "VolumeId", "--output", "text"] + vol_id, _, vrc = vm_util.IssueCommand( + base + create_args, raise_on_failure=False + ) + vol_id = (vol_id or "").strip() + if vrc != 0 or not vol_id.startswith("vol-"): + logging.warning( + "[swap_encryption] io2 create-volume failed: %r", vol_id + ) + return + vm_util.IssueCommand( + base + ["wait", "volume-available", "--volume-ids", vol_id], + raise_on_failure=False, + ) + vm_util.IssueCommand( + base + + [ + "attach-volume", + "--volume-id", + vol_id, + "--instance-id", + instance_id, + "--device", + "/dev/sdf", + ], + raise_on_failure=False, + ) + vm_util.IssueCommand( + base + ["wait", "volume-in-use", "--volume-ids", vol_id], + raise_on_failure=False, + ) + _IO2_VOLUME_ID = vol_id + logging.info( + "[swap_encryption] Attached io2 volume %s to %s as /dev/sdf", + vol_id, + instance_id, + ) + time.sleep(15) # allow the NVMe device node to appear + except Exception as e: # pylint: disable=broad-except + logging.warning( + "[swap_encryption] io2 attach error (continuing): %s", e + ) def _setup_eks_swap(pod: str) -> None: - """Configure swap on EKS nodes — Instance Store OR io2 root disk. - - Swap type is selected by --swap_encryption_swap_type: - instance_store (default) – NVMe SSD attached by Nitro (i4i, m6id, c6id). - Nitro encrypts all block-device writes at hardware level; no extra - cryptsetup needed. - io2 – EBS io2 volume provisioned as the node root/data disk. - Used for apples-to-apples comparison against GKE hyperdisk-balanced. - """ - swap_type = _SWAP_TYPE.value - if swap_type in ('auto', 'instance_store'): - _setup_eks_instance_store_swap(pod) - elif swap_type == 'io2': - _setup_eks_io2_swap(pod) - else: - logging.warning( - '[swap_encryption] Unknown EKS swap type %s – fallback', swap_type) - _setup_eks_instance_store_swap(pod) + """Configure swap on EKS nodes — Instance Store OR io2 root disk. + + Swap type is selected by --swap_encryption_swap_type: + instance_store (default) – NVMe SSD attached by Nitro (i4i, m6id, c6id). + Nitro encrypts all block-device writes at hardware level; no extra + cryptsetup needed. + io2 – EBS io2 volume provisioned as the node root/data disk. + Used for apples-to-apples comparison against GKE hyperdisk-balanced. + """ + swap_type = _SWAP_TYPE.value + if swap_type in ("auto", "instance_store"): + _setup_eks_instance_store_swap(pod) + elif swap_type == "io2": + _setup_eks_io2_swap(pod) + else: + logging.warning( + "[swap_encryption] Unknown EKS swap type %s – fallback", swap_type + ) + _setup_eks_instance_store_swap(pod) def _setup_eks_instance_store_swap(pod: str) -> None: - """Swap on AWS NVMe Instance Store (Nitro hardware-offloaded encryption).""" - logging.info('[swap_encryption] EKS: setting up Instance Store swap') - - # Find the Instance Store NVMe device (not the root EBS volume) - nvme_out, _ = _pod_exec( - pod, - "nvme list 2>/dev/null | awk '/Instance Storage/{print $1}' | head -1 || " - "lsblk -d -o NAME,MODEL | grep -i 'instance\\|nvme' | " - "grep -v 'nvme0' | awk '{print \"/dev/\"$1}' | head -1", - ignore_failure=True, - ) - device = nvme_out.strip() - if not device: - # Common Instance Store device paths on AWS - for candidate in ['/dev/nvme1n1', '/dev/nvme2n1', '/dev/xvdb']: - exists_out, _ = _pod_exec( - pod, f'test -b {candidate} && echo yes || echo no', - ignore_failure=True, - ) - if exists_out.strip() == 'yes': - device = candidate - break + """Swap on AWS NVMe Instance Store (Nitro hardware-offloaded encryption).""" + logging.info("[swap_encryption] EKS: setting up Instance Store swap") - if not device: - logging.warning( - '[swap_encryption] No Instance Store NVMe found – creating swapfile' + # Find the Instance Store NVMe device (not the root EBS volume) + nvme_out, _ = _pod_exec( + pod, + "nvme list 2>/dev/null | awk '/Instance Storage/{print $1}' | head -1" + " || lsblk -d -o NAME,MODEL | grep -i 'instance\\|nvme' | grep -v" + " 'nvme0' | awk '{print \"/dev/\"$1}' | head -1", + ignore_failure=True, ) - _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) - return + device = nvme_out.strip() + if not device: + # Common Instance Store device paths on AWS + for candidate in ["/dev/nvme1n1", "/dev/nvme2n1", "/dev/xvdb"]: + exists_out, _ = _pod_exec( + pod, + f"test -b {candidate} && echo yes || echo no", + ignore_failure=True, + ) + if exists_out.strip() == "yes": + device = candidate + break + + if not device: + logging.warning( + "[swap_encryption] No Instance Store NVMe found – creating swapfile" + ) + _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + return - logging.info('[swap_encryption] EKS: Instance Store device: %s', device) + logging.info("[swap_encryption] EKS: Instance Store device: %s", device) - # Nitro encrypts all Instance Store writes automatically. - # No additional cryptsetup required. - _pod_exec(pod, textwrap.dedent(f""" + # Nitro encrypts all Instance Store writes automatically. + # No additional cryptsetup required. + _pod_exec( + pod, + textwrap.dedent(f""" mkswap {device} && \\ swapon {device} - """)) - logging.info( - '[swap_encryption] EKS: Instance Store swap active on %s', device) + """), + ) + logging.info( + "[swap_encryption] EKS: Instance Store swap active on %s", device + ) def _setup_eks_io2_swap(pod: str) -> None: - """Swap on AWS EBS io2 volume – apples-to-apples comparison vs GKE hyperdisk. - - EBS io2 volumes on Nitro instances are encrypted at rest by AWS KMS (if - enabled on the volume) or via Nitro-level hardware encryption. No additional - cryptsetup is needed here; we simply format the attached data disk as swap. - - Device discovery order: - 1. Match the io2 volume created by _ensure_io2_volume() by its NVMe serial - (serial == volume id without the dash). This is unambiguous and never - picks the root disk or the instance store regardless of nvmeXn1 - enumeration order on Nitro. - 2. First non-root EBS ("Elastic Block Store") block device that is not - currently mounted. - """ - logging.info('[swap_encryption] EKS: setting up io2 EBS swap') - - # Identify root device so we can exclude it. - root_out, _ = _pod_exec( - pod, - 'lsblk -no pkname $(findmnt -n -o SOURCE /) 2>/dev/null || echo nvme0n1', - ignore_failure=True, - ) - root_base = root_out.strip() or 'nvme0n1' - - # Identify the io2 volume UNAMBIGUOUSLY by its NVMe serial == volume id. - # An EBS NVMe device's serial equals the volume id minus the dash - # (vol-0abc... -> serial vol0abc...). - device = '' - target = _IO2_VOLUME_ID.replace('-', '') - if target: - ser_out, _ = _pod_exec( - pod, - 'for d in /sys/block/nvme*n1; do ' - '[ -e "$d" ] || continue; ' - 's=$(cat "$d/device/serial" 2>/dev/null | tr -d "-" | tr -d " "); ' - f'[ "$s" = "{target}" ] && {{ echo "/dev/$(basename "$d")"; break; }}; ' - 'done', - ignore_failure=True, - ) - device = ser_out.strip() - if device: - logging.info( - '[swap_encryption] EKS: io2 matched by serial %s -> %s', - target, device) - - if not device: - # Fallback: first non-root EBS device, excluding any device that is - # currently mounted (root) or already active swap. - disk_out, _ = _pod_exec( + """Swap on AWS EBS io2 volume – apples-to-apples comparison vs GKE hyperdisk. + + EBS io2 volumes on Nitro instances are encrypted at rest by AWS KMS (if + enabled on the volume) or via Nitro-level hardware encryption. No additional + cryptsetup is needed here; we simply format the attached data disk as swap. + + Device discovery order: + 1. Match the io2 volume created by _ensure_io2_volume() by its NVMe serial + (serial == volume id without the dash). This is unambiguous and never + picks the root disk or the instance store regardless of nvmeXn1 + enumeration order on Nitro. + 2. First non-root EBS ("Elastic Block Store") block device that is not + currently mounted. + """ + logging.info("[swap_encryption] EKS: setting up io2 EBS swap") + + # Identify root device so we can exclude it. + root_out, _ = _pod_exec( pod, - 'for d in /sys/block/nvme*n1 /sys/block/xvd[b-z] /sys/block/sd[b-z];' - ' do [ -e "$d" ] || continue; n=$(basename "$d"); [ "$n" =' - f' "{root_base}" ] && continue; m=$(cat "$d/device/model" 2>/dev/null);' - ' echo "$m" | grep -qi "Elastic Block Store" || continue; mnt=$(lsblk' - ' -no MOUNTPOINT "/dev/$n" 2>/dev/null | tr -d " "); [ -n "$mnt" ] &&' - ' continue; echo "/dev/$n"; break; done', + "lsblk -no pkname $(findmnt -n -o SOURCE /) 2>/dev/null || echo" + " nvme0n1", ignore_failure=True, ) - device = disk_out.strip() - if device: - logging.info( - '[swap_encryption] EKS: io2 fallback EBS device: %s', device) - - if not device: - logging.warning( - '[swap_encryption] No io2 EBS disk found – creating plain swapfile') - _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) - return + root_base = root_out.strip() or "nvme0n1" + + # Identify the io2 volume UNAMBIGUOUSLY by its NVMe serial == volume id. + # An EBS NVMe device's serial equals the volume id minus the dash + # (vol-0abc... -> serial vol0abc...). + device = "" + target = _IO2_VOLUME_ID.replace("-", "") + if target: + ser_out, _ = _pod_exec( + pod, + "for d in /sys/block/nvme*n1; do " + '[ -e "$d" ] || continue; ' + 's=$(cat "$d/device/serial" 2>/dev/null | tr -d "-" | tr -d " "); ' + f'[ "$s" = "{target}" ] && {{ echo "/dev/$(basename "$d")"; break;' + " }; " + "done", + ignore_failure=True, + ) + device = ser_out.strip() + if device: + logging.info( + "[swap_encryption] EKS: io2 matched by serial %s -> %s", + target, + device, + ) + + if not device: + # Fallback: first non-root EBS device, excluding any device that is + # currently mounted (root) or already active swap. + disk_out, _ = _pod_exec( + pod, + "for d in /sys/block/nvme*n1 /sys/block/xvd[b-z]" + " /sys/block/sd[b-z];" + ' do [ -e "$d" ] || continue; n=$(basename "$d"); [ "$n" =' + f' "{root_base}" ] && continue; m=$(cat "$d/device/model"' + " 2>/dev/null);" + ' echo "$m" | grep -qi "Elastic Block Store" || continue;' + " mnt=$(lsblk" + ' -no MOUNTPOINT "/dev/$n" 2>/dev/null | tr -d " "); [ -n "$mnt"' + " ] &&" + ' continue; echo "/dev/$n"; break; done', + ignore_failure=True, + ) + device = disk_out.strip() + if device: + logging.info( + "[swap_encryption] EKS: io2 fallback EBS device: %s", device + ) + + if not device: + logging.warning( + "[swap_encryption] No io2 EBS disk found – creating plain swapfile" + ) + _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + return - logging.info('[swap_encryption] EKS: io2 EBS device: %s', device) + logging.info("[swap_encryption] EKS: io2 EBS device: %s", device) - # EBS io2 encryption is handled at the AWS level (Nitro / KMS). - out, _ = _pod_exec( - pod, - textwrap.dedent(f""" + # EBS io2 encryption is handled at the AWS level (Nitro / KMS). + out, _ = _pod_exec( + pod, + textwrap.dedent(f""" swapoff {device} 2>/dev/null || true wipefs -a {device} 2>/dev/null || true mkswap -f {device} && swapon {device} swapon --show """), - ignore_failure=True, - ) - if device not in out: - raise RuntimeError( - f'[swap_encryption] io2 swap did not activate on {device}; ' - f'swapon --show output: {out!r}. The device may be busy/mounted ' - '(wrong device picked) or mkswap failed.') - logging.info('[swap_encryption] EKS: io2 EBS swap active on %s', device) + ignore_failure=True, + ) + if device not in out: + raise RuntimeError( + f"[swap_encryption] io2 swap did not activate on {device}; " + f"swapon --show output: {out!r}. The device may be busy/mounted " + "(wrong device picked) or mkswap failed." + ) + logging.info("[swap_encryption] EKS: io2 EBS swap active on %s", device) def _setup_plain_swap_file(pod: str, size_gb: int) -> None: - """Fallback: create a loop-device-backed swapfile. - - A plain file on overlayfs (the container root) cannot be used as swap — - the kernel rejects it with EINVAL. Routing it through a loop device - presents a proper block device to the mm subsystem and succeeds. - """ - logging.info('[swap_encryption] Creating %dGB loop-device swap', size_gb) - _pod_exec(pod, textwrap.dedent(f""" + """Fallback: create a loop-device-backed swapfile. + + A plain file on overlayfs (the container root) cannot be used as swap — + the kernel rejects it with EINVAL. Routing it through a loop device + presents a proper block device to the mm subsystem and succeeds. + """ + logging.info("[swap_encryption] Creating %dGB loop-device swap", size_gb) + _pod_exec( + pod, + textwrap.dedent(f""" fallocate -l {size_gb}G /tmp/pkb_swapfile && \\ chmod 600 /tmp/pkb_swapfile && \\ LOOP=$(losetup -f) && \\ @@ -2446,231 +2967,267 @@ def _setup_plain_swap_file(pod: str, size_gb: int) -> None: mkswap "$LOOP" && \\ swapon "$LOOP" && \\ echo "swap loop device: $LOOP" - """)) + """), + ) def _enable_zswap(pod: str) -> None: - """Enable zswap with lz4 compressor and 20% pool limit inside the pod.""" - logging.info('[swap_encryption] Enabling zswap (lz4, 20%% pool)') - for cmd in [ - 'echo 1 > /sys/module/zswap/parameters/enabled', - 'echo lz4 > /sys/module/zswap/parameters/compressor', - 'echo 20 > /sys/module/zswap/parameters/max_pool_percent', - 'echo z3fold > /sys/module/zswap/parameters/zpool', - ]: - _pod_exec(pod, cmd, ignore_failure=True) + """Enable zswap with lz4 compressor and 20% pool limit inside the pod.""" + logging.info("[swap_encryption] Enabling zswap (lz4, 20%% pool)") + for cmd in [ + "echo 1 > /sys/module/zswap/parameters/enabled", + "echo lz4 > /sys/module/zswap/parameters/compressor", + "echo 20 > /sys/module/zswap/parameters/max_pool_percent", + "echo z3fold > /sys/module/zswap/parameters/zpool", + ]: + _pod_exec(pod, cmd, ignore_failure=True) def _phase1_fio( pod: str, swap_dev: str, base_meta: dict ) -> list[sample.Sample]: - """Run fio directly on the swap block device for raw I/O characterisation. - - Skipped only for an UNINTENTIONAL loop fallback (a single-disk node with no - dedicated swap disk, where fio on the loop would measure the boot ext4 - filesystem rather than the swap stack). When the user explicitly selects the - boot_disk target (--swap_encryption_swap_type=boot_disk, methodology rows - 1-4), the loop over the boot disk IS the device under test, so fio runs and - characterises it. - - For dedicated second disks (hyperdisk, LSSD, NVMe) direct I/O is always - used and swap is restored (mkswap + swapon) after the fio run. - To get fio results use c4-*-lssd (local NVMe) or - --swap_encryption_add_swap_disk to provision a dedicated second disk. - """ - if swap_dev.startswith('/dev/loop') and _SWAP_TYPE.value != 'boot_disk': - logging.warning( - '[swap_encryption] Phase 1 (fio) SKIPPED for plain loop device %s ' - '(unintentional single-disk fallback). ' - 'fio on a loop-backed device measures the underlying ext4 filesystem ' - '(stateful_partition), not the swap stack. ' - 'Use c4-*-lssd, --swap_encryption_add_swap_disk, or ' - '--swap_encryption_swap_type=boot_disk for fio data.', - swap_dev, - ) - return [] - - results = [] - - _pod_exec(pod, f'swapoff {swap_dev}', ignore_failure=True) - - # Pre-fill device so read tests have real data (avoids zero-block optimisation - # by the storage controller skewing read latency measurements). - # Cap at 20 GiB — enough to warm up the dm-crypt pipeline and cover the fio - # runtime window. Writing 100% of a 500 GiB hyperdisk takes ~500+ seconds - # at provisioned throughput, which exceeds the PKB command timeout. - # Timeout: 20 GiB / ~150 MB/s (conservative dm-crypt write rate) + 60 s buffer. - _PREFILL_GIB = 20 - prefill_timeout = _PREFILL_GIB * 1024 // 150 + 60 # ~197 s, rounds up to ~200 s - prefill_timeout = max(prefill_timeout, 300) # floor at 5 min - logging.info('[swap_encryption] Pre-filling %d GiB of %s', _PREFILL_GIB, swap_dev) - # No --output-format=json for prefill; we only care that it completes. - # Still use --output to avoid streaming large stdout over the websocket. - _pod_exec(pod, ( - f'fio --name=prefill --filename={swap_dev} ' - f'--ioengine=libaio --direct=1 --rw=write --bs=1m ' - f'--size={_PREFILL_GIB}g --verify=0 --output=/tmp/pkb_fio_prefill.log' - ), timeout=prefill_timeout, ignore_failure=True) - - # Each fio job: runtime + 90 s buffer (run + JSON write + file read). - # We write fio output to a file inside the pod and retrieve it in a second - # short-lived kubectl exec, because: - # - A single 120 s kubectl exec session over GKE websocket can be reset - # by the control-plane load balancer mid-stream ("connection reset by - # peer"), losing the output. - # - Separating the long run from the short file-read gives each exec a - # much shorter window, avoiding the keepalive timeout. - fio_run_timeout = _FIO_RUNTIME_SEC.value + 90 - fio_read_timeout = 60 # just a cat of the JSON file - - for name, rw, bs, depth, label in _FIO_JOBS: - logging.info('[swap_encryption] fio: %s', name) - out_file = f'/tmp/pkb_fio_{name}.json' - # Remove any stale output first so a parse can never silently reuse a - # previous job's/run's result (rules out byte-identical results between - # runs being a caching artifact rather than a true device ceiling). - _pod_exec(pod, f'rm -f {out_file}', ignore_failure=True, _retries=0, - timeout=15) - run_cmd = ( - f'fio --name={name} --filename={swap_dev} ' - f'--ioengine=libaio --direct=1 --verify=0 --randrepeat=0 ' - f'--bs={bs} --iodepth={depth} --rw={rw} ' - f'--time_based --runtime={_FIO_RUNTIME_SEC.value}s ' - f'--output-format=json --output={out_file}' - ) - _, err = _pod_exec(pod, run_cmd, timeout=fio_run_timeout, - ignore_failure=True, _retries=0) - if 'connection reset by peer' in err: - logging.warning('[swap_encryption] fio %s: kubectl exec connection ' - 'reset; result may be incomplete', name) - out, _ = _pod_exec(pod, f'cat {out_file} 2>/dev/null || echo ""', - timeout=fio_read_timeout, ignore_failure=True) - results += _parse_fio_json(out, name, label, base_meta) - - # fio prefill overwrites the entire device, destroying the mkswap header. - # Re-stamp and re-enable before the remaining phases need active swap. - _pod_exec(pod, f'mkswap {swap_dev} && swapon {swap_dev}', - ignore_failure=True, timeout=120) - return results + """Run fio directly on the swap block device for raw I/O characterisation. + + Skipped only for an UNINTENTIONAL loop fallback (a single-disk node with no + dedicated swap disk, where fio on the loop would measure the boot ext4 + filesystem rather than the swap stack). When the user explicitly selects the + boot_disk target (--swap_encryption_swap_type=boot_disk, methodology rows + 1-4), the loop over the boot disk IS the device under test, so fio runs and + characterises it. + + For dedicated second disks (hyperdisk, LSSD, NVMe) direct I/O is always + used and swap is restored (mkswap + swapon) after the fio run. + To get fio results use c4-*-lssd (local NVMe) or + --swap_encryption_add_swap_disk to provision a dedicated second disk. + """ + if swap_dev.startswith("/dev/loop") and _SWAP_TYPE.value != "boot_disk": + logging.warning( + "[swap_encryption] Phase 1 (fio) SKIPPED for plain loop device %s" + " (unintentional single-disk fallback). fio on a loop-backed device" + " measures the underlying ext4 filesystem (stateful_partition), not" + " the swap stack. Use c4-*-lssd, --swap_encryption_add_swap_disk," + " or --swap_encryption_swap_type=boot_disk for fio data.", + swap_dev, + ) + return [] + + results = [] + + _pod_exec(pod, f"swapoff {swap_dev}", ignore_failure=True) + + # Pre-fill device so read tests have real data (avoids zero-block optimisation + # by the storage controller skewing read latency measurements). + # Cap at 20 GiB — enough to warm up the dm-crypt pipeline and cover the fio + # runtime window. Writing 100% of a 500 GiB hyperdisk takes ~500+ seconds + # at provisioned throughput, which exceeds the PKB command timeout. + # Timeout: 20 GiB / ~150 MB/s (conservative dm-crypt write rate) + 60 s buffer. + _PREFILL_GIB = 20 + prefill_timeout = ( + _PREFILL_GIB * 1024 // 150 + 60 + ) # ~197 s, rounds up to ~200 s + prefill_timeout = max(prefill_timeout, 300) # floor at 5 min + logging.info( + "[swap_encryption] Pre-filling %d GiB of %s", _PREFILL_GIB, swap_dev + ) + # No --output-format=json for prefill; we only care that it completes. + # Still use --output to avoid streaming large stdout over the websocket. + _pod_exec( + pod, + ( + f"fio --name=prefill --filename={swap_dev} --ioengine=libaio" + f" --direct=1 --rw=write --bs=1m --size={_PREFILL_GIB}g --verify=0" + " --output=/tmp/pkb_fio_prefill.log" + ), + timeout=prefill_timeout, + ignore_failure=True, + ) + + # Each fio job: runtime + 90 s buffer (run + JSON write + file read). + # We write fio output to a file inside the pod and retrieve it in a second + # short-lived kubectl exec, because: + # - A single 120 s kubectl exec session over GKE websocket can be reset + # by the control-plane load balancer mid-stream ("connection reset by + # peer"), losing the output. + # - Separating the long run from the short file-read gives each exec a + # much shorter window, avoiding the keepalive timeout. + fio_run_timeout = _FIO_RUNTIME_SEC.value + 90 + fio_read_timeout = 60 # just a cat of the JSON file + + for name, rw, bs, depth, label in _FIO_JOBS: + logging.info("[swap_encryption] fio: %s", name) + out_file = f"/tmp/pkb_fio_{name}.json" + # Remove any stale output first so a parse can never silently reuse a + # previous job's/run's result (rules out byte-identical results between + # runs being a caching artifact rather than a true device ceiling). + _pod_exec( + pod, + f"rm -f {out_file}", + ignore_failure=True, + _retries=0, + timeout=15, + ) + run_cmd = ( + f"fio --name={name} --filename={swap_dev} " + "--ioengine=libaio --direct=1 --verify=0 --randrepeat=0 " + f"--bs={bs} --iodepth={depth} --rw={rw} " + f"--time_based --runtime={_FIO_RUNTIME_SEC.value}s " + f"--output-format=json --output={out_file}" + ) + _, err = _pod_exec( + pod, + run_cmd, + timeout=fio_run_timeout, + ignore_failure=True, + _retries=0, + ) + if "connection reset by peer" in err: + logging.warning( + "[swap_encryption] fio %s: kubectl exec connection " + "reset; result may be incomplete", + name, + ) + out, _ = _pod_exec( + pod, + f'cat {out_file} 2>/dev/null || echo ""', + timeout=fio_read_timeout, + ignore_failure=True, + ) + results += _parse_fio_json(out, name, label, base_meta) + + # fio prefill overwrites the entire device, destroying the mkswap header. + # Re-stamp and re-enable before the remaining phases need active swap. + _pod_exec( + pod, + f"mkswap {swap_dev} && swapon {swap_dev}", + ignore_failure=True, + timeout=120, + ) + return results def _parse_fio_json( stdout: str, job_name: str, label: str, base_meta: dict ) -> list[sample.Sample]: - """Parse fio JSON output into PKB Samples.""" - results = [] - try: - data = json.loads(stdout) - except (json.JSONDecodeError, ValueError): - logging.warning('[swap_encryption] fio JSON parse failed for %s', job_name) + """Parse fio JSON output into PKB Samples.""" + results = [] + try: + data = json.loads(stdout) + except (json.JSONDecodeError, ValueError): + logging.warning( + "[swap_encryption] fio JSON parse failed for %s", job_name + ) + return results + + meta = dict(base_meta, fio_job=job_name, fio_label=label) + for job in data.get("jobs", []): + for direction in ("read", "write"): + d = job.get(direction, {}) + if not d or d.get("io_bytes", 0) == 0: + continue + iops = float(d.get("iops", 0)) + bw_kib = float(d.get("bw", 0)) + clat = d.get("clat_ns", {}) + pct = clat.get("percentile", {}) + lat_mean = float(clat.get("mean", 0)) / 1000.0 + lat_p50 = float(pct.get("50.000000", 0)) / 1000.0 + lat_p99 = float(pct.get("99.000000", 0)) / 1000.0 + lat_p999 = float(pct.get("99.900000", 0)) / 1000.0 + m = dict(meta, direction=direction) + results += [ + sample.Sample(f"{job_name}_{direction}_iops", iops, "iops", m), + sample.Sample( + f"{job_name}_{direction}_bw_mbps", bw_kib / 1024, "MB/s", m + ), + sample.Sample( + f"{job_name}_{direction}_lat_mean", lat_mean, "us", m + ), + sample.Sample( + f"{job_name}_{direction}_lat_p50", lat_p50, "us", m + ), + sample.Sample( + f"{job_name}_{direction}_lat_p99", lat_p99, "us", m + ), + sample.Sample( + f"{job_name}_{direction}_lat_p999", lat_p999, "us", m + ), + ] return results - meta = dict(base_meta, fio_job=job_name, fio_label=label) - for job in data.get('jobs', []): - for direction in ('read', 'write'): - d = job.get(direction, {}) - if not d or d.get('io_bytes', 0) == 0: - continue - iops = float(d.get('iops', 0)) - bw_kib = float(d.get('bw', 0)) - clat = d.get('clat_ns', {}) - pct = clat.get('percentile', {}) - lat_mean = float(clat.get('mean', 0)) / 1000.0 - lat_p50 = float(pct.get('50.000000', 0)) / 1000.0 - lat_p99 = float(pct.get('99.000000', 0)) / 1000.0 - lat_p999 = float(pct.get('99.900000', 0)) / 1000.0 - m = dict(meta, direction=direction) - results += [ - sample.Sample( - f'{job_name}_{direction}_iops', iops, 'iops', m), - sample.Sample( - f'{job_name}_{direction}_bw_mbps', bw_kib / 1024, 'MB/s', m), - sample.Sample( - f'{job_name}_{direction}_lat_mean', lat_mean, 'us', m), - sample.Sample( - f'{job_name}_{direction}_lat_p50', lat_p50, 'us', m), - sample.Sample( - f'{job_name}_{direction}_lat_p99', lat_p99, 'us', m), - sample.Sample( - f'{job_name}_{direction}_lat_p999', lat_p999, 'us', m), - ] - return results - def _parse_vm_bytes_to_mb(vm_bytes: str) -> float: - """Parse a vm-bytes string like '28G', '512M', '1024k' into megabytes.""" - vm_bytes = vm_bytes.strip() - if not vm_bytes: - return 0.0 - suffix = vm_bytes[-1].upper() - try: - value = float(vm_bytes[:-1]) - except ValueError: - return 0.0 - if suffix == 'G': - return value * 1024.0 - elif suffix == 'M': - return value - elif suffix == 'K': - return value / 1024.0 - elif suffix == 'T': - return value * 1024.0 * 1024.0 - else: - # Assume bytes + """Parse a vm-bytes string like '28G', '512M', '1024k' into megabytes.""" + vm_bytes = vm_bytes.strip() + if not vm_bytes: + return 0.0 + suffix = vm_bytes[-1].upper() try: - return float(vm_bytes) / (1024.0 * 1024.0) + value = float(vm_bytes[:-1]) except ValueError: - return 0.0 + return 0.0 + if suffix == "G": + return value * 1024.0 + elif suffix == "M": + return value + elif suffix == "K": + return value / 1024.0 + elif suffix == "T": + return value * 1024.0 * 1024.0 + else: + # Assume bytes + try: + return float(vm_bytes) / (1024.0 * 1024.0) + except ValueError: + return 0.0 def _per_worker_vm_bytes(total_vm_bytes: str, workers: int) -> str: - """Split a *total* vm-bytes target across N stress-ng --vm workers. - - stress-ng allocates ``--vm-bytes`` PER worker, so ``--vm N --vm-bytes B`` - touches ``N * B`` of memory. Every vm_bytes value in this benchmark (the - --swap_encryption_stress_vm_bytes flag and the _autoscale_vm_bytes result) - represents the intended *combined* footprint, as documented on - --swap_encryption_stress_vm_workers ("workers divide vm_bytes equally ... - the combined in-flight footprint equals vm_bytes"). We therefore divide by - the worker count before handing the value to stress-ng; otherwise N>1 - workers allocate N x the target and the kernel OOM-kills the whole pod - (observed as stress-ng rc=137, after which all later phases fail with - "pods not found"). - - Returns a stress-ng-friendly ``M`` string (megabytes), floored to at - least 1M. - """ - workers = max(1, int(workers)) - total_mb = _parse_vm_bytes_to_mb(total_vm_bytes) - if total_mb <= 0: - # Unparseable — fall back to letting stress-ng divide nothing rather than - # silently changing behaviour; the caller's value is passed through. - return total_vm_bytes - per_worker_mb = max(1, int(total_mb / workers)) - return f'{per_worker_mb}M' + """Split a *total* vm-bytes target across N stress-ng --vm workers. + + stress-ng allocates ``--vm-bytes`` PER worker, so ``--vm N --vm-bytes B`` + touches ``N * B`` of memory. Every vm_bytes value in this benchmark (the + --swap_encryption_stress_vm_bytes flag and the _autoscale_vm_bytes result) + represents the intended *combined* footprint, as documented on + --swap_encryption_stress_vm_workers ("workers divide vm_bytes equally ... + the combined in-flight footprint equals vm_bytes"). We therefore divide by + the worker count before handing the value to stress-ng; otherwise N>1 + workers allocate N x the target and the kernel OOM-kills the whole pod + (observed as stress-ng rc=137, after which all later phases fail with + "pods not found"). + + Returns a stress-ng-friendly ``M`` string (megabytes), floored to at + least 1M. + """ + workers = max(1, int(workers)) + total_mb = _parse_vm_bytes_to_mb(total_vm_bytes) + if total_mb <= 0: + # Unparseable — fall back to letting stress-ng divide nothing rather than + # silently changing behaviour; the caller's value is passed through. + return total_vm_bytes + per_worker_mb = max(1, int(total_mb / workers)) + return f"{per_worker_mb}M" def _cgroup_swap_limit_mb(pod: str) -> float: - """Return the swap budget (in MB) that the benchmark cgroup can actually use. - - GKE sets the per-container cgroup v2 ``memory.swap.max`` to 0, so even though - the node advertises a large swap device the container cannot page anything - out. Sizing stress-ng against the *node* swap total in that case guarantees - an OOM kill. This probe finds the swap budget of *our* cgroup so the caller - can size against reality. - - We locate our own cgroup from the host-mounted /sys by finding the - ``cgroup.procs`` file that lists this shell's PID — ``hostPID: true`` means - ``$$`` is a host-namespace PID that appears in those files, and the - kubectl-exec'd shell shares the container's cgroup with stress-ng. - - Returns: - ``float('inf')`` when swap is uncapped (``max``); the limit in MB when - capped to a finite value; ``0.0`` when swap is fully locked - (``memory.swap.max == 0``); ``-1.0`` when the limit could not be read (the - caller then falls back to the legacy node-total behaviour). - """ - probe = textwrap.dedent(""" + """Return the swap budget (in MB) that the benchmark cgroup can actually use. + + GKE sets the per-container cgroup v2 ``memory.swap.max`` to 0, so even though + the node advertises a large swap device the container cannot page anything + out. Sizing stress-ng against the *node* swap total in that case guarantees + an OOM kill. This probe finds the swap budget of *our* cgroup so the caller + can size against reality. + + We locate our own cgroup from the host-mounted /sys by finding the + ``cgroup.procs`` file that lists this shell's PID — ``hostPID: true`` means + ``$$`` is a host-namespace PID that appears in those files, and the + kubectl-exec'd shell shares the container's cgroup with stress-ng. + + Returns: + ``float('inf')`` when swap is uncapped (``max``); the limit in MB when + capped to a finite value; ``0.0`` when swap is fully locked + (``memory.swap.max == 0``); ``-1.0`` when the limit could not be read (the + caller then falls back to the legacy node-total behaviour). + """ + probe = textwrap.dedent(""" mypid=$$ for procf in $(find /sys/fs/cgroup -path '*kubepods*' -name cgroup.procs 2>/dev/null) do @@ -2688,289 +3245,333 @@ def _cgroup_swap_limit_mb(pod: str) -> float: fi done """) - try: - out, _ = _pod_exec(pod, probe, timeout=20, ignore_failure=True) - except Exception as e: # pylint: disable=broad-except - logging.warning('[swap_encryption] cgroup swap-limit probe failed: %s', e) - return -1.0 - - text = (out or '').strip() - m = re.search(r'V2=(\S+)', text) - if m: - val = m.group(1) - if val == 'max': - return float('inf') - try: - return int(val) / (1024.0 * 1024.0) - except ValueError: - return -1.0 - # cgroup v1: the combined RAM+swap ceiling is memsw; swap budget = memsw-mem. - m = re.search(r'MEMSW=(\S+)\s+MEM=(\S+)', text) - if m: try: - memsw = int(m.group(1)) - mem = int(m.group(2)) - except ValueError: - return -1.0 - # A near-2^63 sentinel means "unlimited" in cgroup v1. - if memsw >= (1 << 62): - return float('inf') - return max(0.0, (memsw - mem) / (1024.0 * 1024.0)) - return -1.0 + out, _ = _pod_exec(pod, probe, timeout=20, ignore_failure=True) + except Exception as e: # pylint: disable=broad-except + logging.warning( + "[swap_encryption] cgroup swap-limit probe failed: %s", e + ) + return -1.0 + + text = (out or "").strip() + m = re.search(r"V2=(\S+)", text) + if m: + val = m.group(1) + if val == "max": + return float("inf") + try: + return int(val) / (1024.0 * 1024.0) + except ValueError: + return -1.0 + # cgroup v1: the combined RAM+swap ceiling is memsw; swap budget = memsw-mem. + m = re.search(r"MEMSW=(\S+)\s+MEM=(\S+)", text) + if m: + try: + memsw = int(m.group(1)) + mem = int(m.group(2)) + except ValueError: + return -1.0 + # A near-2^63 sentinel means "unlimited" in cgroup v1. + if memsw >= (1 << 62): + return float("inf") + return max(0.0, (memsw - mem) / (1024.0 * 1024.0)) + return -1.0 def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: - """Ensure vm_bytes forces real swap I/O without hard-crashing the container. - - Strategy - -------- - We want stress-ng to overflow into swap so that dm-crypt / Nitro encryption - overhead is actually measured. Two competing constraints apply: - - 1. vm_bytes must exceed available RAM so that anonymous pages are paged out - to the swap device. A value below ~95 % of RAM fits entirely in memory - and produces swap_out_pages_per_sec = 0 (benchmark defeats itself). - - 2. vm_bytes must not be so large that the kernel OOM-kills the whole - container before any meaningful swap activity is recorded. - - Target formula - -------------- - target = RAM + min(swap_size × 0.25, 64 GB) - - This guarantees at least 25 % of the swap device is actively exercised - (measured swap I/O) while keeping the allocation safely within what the - kernel can page out given the available swap space. The 64 GB cap prevents - extremely large targets on machines with huge swap devices. - - On large-RAM machines (e.g. n4-highmem-32, 252 GB) the old 110%-of-RAM - formula only overflowed by ~25 GB; with sequential write64 patterns the - kernel handled that via LRU page eviction without actually hitting the swap - device, yielding swap_out = 0. The new formula forces a much larger working - set into swap. - - Hard ceiling - ------------ - Regardless of the formula, cap at RAM + swap_size - 4 GB (4 GB headroom) - to avoid exhausting the swap device and triggering kernel panics. - """ - try: - meminfo_out, _ = _pod_exec(pod, 'cat /proc/meminfo', timeout=15) - node_ram_kb = 0 - swap_total_kb = 0 - for line in meminfo_out.splitlines(): - if line.startswith('MemTotal:'): - parts = line.split() - if len(parts) >= 2: - node_ram_kb = int(parts[1]) - elif line.startswith('SwapTotal:'): - parts = line.split() - if len(parts) >= 2: - swap_total_kb = int(parts[1]) - if node_ram_kb and swap_total_kb: - break - - if node_ram_kb <= 0: - logging.warning('[swap_encryption] Could not read MemTotal; using vm_bytes=%s', vm_bytes) - return vm_bytes - - node_ram_mb = node_ram_kb / 1024.0 - swap_total_mb = swap_total_kb / 1024.0 - requested_mb = _parse_vm_bytes_to_mb(vm_bytes) - if requested_mb <= 0: - return vm_bytes - - # The node may advertise a large SwapTotal while THIS cgroup is forbidden - # from using it (GKE sets memory.swap.max=0 per container). Size against - # the swap the cgroup can actually reach, not the node total — otherwise a - # value like 32G OOM-kills the pod the instant it exceeds RAM. - cgroup_swap_mb = _cgroup_swap_limit_mb(pod) - usable_swap_mb = swap_total_mb # default / legacy when probe is inconclusive - if cgroup_swap_mb == 0.0: - # Swap is fully locked. Cap the working set just under RAM so the pod - # survives, and mark the run degraded: swap-encryption overhead cannot be - # measured when the cgroup cannot page out. - safe_gb = max(1, int(node_ram_mb * 0.9 / 1024)) - msg = (f'cgroup swap is locked (memory.swap.max=0); the ' - f'{swap_total_mb/1024:.0f} GB node swap device is unreachable. ' - f'Capping stress-ng vm_bytes {vm_bytes} → {safe_gb}G (0.9 x RAM) ' - f'to keep the pod alive — swap-encryption overhead will NOT be ' - f'measured this run') - logging.error('[swap_encryption] %s', msg) - _degraded_reasons.append(msg) - return f'{safe_gb}G' - if 0.0 < cgroup_swap_mb < float('inf'): - # cgroup permits a finite swap budget smaller than the device. - usable_swap_mb = min(swap_total_mb, cgroup_swap_mb) - # cgroup_swap_mb == inf -> swap fully usable (node total stands) - # cgroup_swap_mb == -1 -> undetermined; fall back to node total (legacy) - - # Desired overflow: 25% of usable swap capped at 64 GB, minimum 4 GB. - overflow_mb = max(min(usable_swap_mb * 0.25, 64.0 * 1024), 4.0 * 1024) - target_mb = node_ram_mb + overflow_mb - - # Hard ceiling: never exceed RAM + usable swap − 4 GB headroom. - if usable_swap_mb > 0: - ceiling_mb = node_ram_mb + usable_swap_mb - 4096.0 - target_mb = min(target_mb, ceiling_mb) - else: - # No usable swap at all (and not the locked-at-0 case handled above): - # keep the working set just under RAM. - target_mb = min(target_mb, node_ram_mb * 0.9) - - target_gb = max(1, int(target_mb / 1024)) # floor to GB for a clean flag - - if requested_mb < node_ram_mb * 0.95: - new_vm_bytes = f'{target_gb}G' - logging.warning( - '[swap_encryption] Auto-scaling vm_bytes UP: %s → %s ' - '(RAM %.0f GB, swap %.0f GB; original value would not trigger swap)', - vm_bytes, new_vm_bytes, node_ram_mb / 1024, swap_total_mb / 1024, - ) - return new_vm_bytes - - if requested_mb > target_mb: - new_vm_bytes = f'{target_gb}G' - logging.warning( - '[swap_encryption] Capping vm_bytes DOWN: %s → %s ' - '(RAM %.0f GB, swap %.0f GB; original value risks swap exhaustion)', - vm_bytes, new_vm_bytes, node_ram_mb / 1024, swap_total_mb / 1024, - ) - return new_vm_bytes - - return vm_bytes - except Exception as e: # pylint: disable=broad-except - logging.warning('[swap_encryption] _autoscale_vm_bytes failed (%s); using %s', e, vm_bytes) - return vm_bytes + """Ensure vm_bytes forces real swap I/O without hard-crashing the container. + + Strategy + -------- + We want stress-ng to overflow into swap so that dm-crypt / Nitro encryption + overhead is actually measured. Two competing constraints apply: + + 1. vm_bytes must exceed available RAM so that anonymous pages are paged out + to the swap device. A value below ~95 % of RAM fits entirely in memory + and produces swap_out_pages_per_sec = 0 (benchmark defeats itself). + + 2. vm_bytes must not be so large that the kernel OOM-kills the whole + container before any meaningful swap activity is recorded. + + Target formula + -------------- + target = RAM + min(swap_size × 0.25, 64 GB) + + This guarantees at least 25 % of the swap device is actively exercised + (measured swap I/O) while keeping the allocation safely within what the + kernel can page out given the available swap space. The 64 GB cap prevents + extremely large targets on machines with huge swap devices. + + On large-RAM machines (e.g. n4-highmem-32, 252 GB) the old 110%-of-RAM + formula only overflowed by ~25 GB; with sequential write64 patterns the + kernel handled that via LRU page eviction without actually hitting the swap + device, yielding swap_out = 0. The new formula forces a much larger working + set into swap. + + Hard ceiling + ------------ + Regardless of the formula, cap at RAM + swap_size - 4 GB (4 GB headroom) + to avoid exhausting the swap device and triggering kernel panics. + """ + try: + meminfo_out, _ = _pod_exec(pod, "cat /proc/meminfo", timeout=15) + node_ram_kb = 0 + swap_total_kb = 0 + for line in meminfo_out.splitlines(): + if line.startswith("MemTotal:"): + parts = line.split() + if len(parts) >= 2: + node_ram_kb = int(parts[1]) + elif line.startswith("SwapTotal:"): + parts = line.split() + if len(parts) >= 2: + swap_total_kb = int(parts[1]) + if node_ram_kb and swap_total_kb: + break + + if node_ram_kb <= 0: + logging.warning( + "[swap_encryption] Could not read MemTotal; using vm_bytes=%s", + vm_bytes, + ) + return vm_bytes + + node_ram_mb = node_ram_kb / 1024.0 + swap_total_mb = swap_total_kb / 1024.0 + requested_mb = _parse_vm_bytes_to_mb(vm_bytes) + if requested_mb <= 0: + return vm_bytes + + # The node may advertise a large SwapTotal while THIS cgroup is forbidden + # from using it (GKE sets memory.swap.max=0 per container). Size against + # the swap the cgroup can actually reach, not the node total — otherwise a + # value like 32G OOM-kills the pod the instant it exceeds RAM. + cgroup_swap_mb = _cgroup_swap_limit_mb(pod) + usable_swap_mb = ( + swap_total_mb # default / legacy when probe is inconclusive + ) + if cgroup_swap_mb == 0.0: + # Swap is fully locked. Cap the working set just under RAM so the pod + # survives, and mark the run degraded: swap-encryption overhead cannot be + # measured when the cgroup cannot page out. + safe_gb = max(1, int(node_ram_mb * 0.9 / 1024)) + msg = ( + "cgroup swap is locked (memory.swap.max=0); the" + f" {swap_total_mb/1024:.0f} GB node swap device is unreachable." + f" Capping stress-ng vm_bytes {vm_bytes} → {safe_gb}G (0.9 x" + " RAM) to keep the pod alive — swap-encryption overhead will" + " NOT be measured this run" + ) + logging.error("[swap_encryption] %s", msg) + _degraded_reasons.append(msg) + return f"{safe_gb}G" + if 0.0 < cgroup_swap_mb < float("inf"): + # cgroup permits a finite swap budget smaller than the device. + usable_swap_mb = min(swap_total_mb, cgroup_swap_mb) + # cgroup_swap_mb == inf -> swap fully usable (node total stands) + # cgroup_swap_mb == -1 -> undetermined; fall back to node total (legacy) + + # Desired overflow: 25% of usable swap capped at 64 GB, minimum 4 GB. + overflow_mb = max(min(usable_swap_mb * 0.25, 64.0 * 1024), 4.0 * 1024) + target_mb = node_ram_mb + overflow_mb + + # Hard ceiling: never exceed RAM + usable swap − 4 GB headroom. + if usable_swap_mb > 0: + ceiling_mb = node_ram_mb + usable_swap_mb - 4096.0 + target_mb = min(target_mb, ceiling_mb) + else: + # No usable swap at all (and not the locked-at-0 case handled above): + # keep the working set just under RAM. + target_mb = min(target_mb, node_ram_mb * 0.9) + + target_gb = max( + 1, int(target_mb / 1024) + ) # floor to GB for a clean flag + + if requested_mb < node_ram_mb * 0.95: + new_vm_bytes = f"{target_gb}G" + logging.warning( + "[swap_encryption] Auto-scaling vm_bytes UP: %s → %s (RAM %.0f" + " GB, swap %.0f GB; original value would not trigger swap)", + vm_bytes, + new_vm_bytes, + node_ram_mb / 1024, + swap_total_mb / 1024, + ) + return new_vm_bytes + + if requested_mb > target_mb: + new_vm_bytes = f"{target_gb}G" + logging.warning( + "[swap_encryption] Capping vm_bytes DOWN: %s → %s (RAM %.0f GB," + " swap %.0f GB; original value risks swap exhaustion)", + vm_bytes, + new_vm_bytes, + node_ram_mb / 1024, + swap_total_mb / 1024, + ) + return new_vm_bytes + + return vm_bytes + except Exception as e: # pylint: disable=broad-except + logging.warning( + "[swap_encryption] _autoscale_vm_bytes failed (%s); using %s", + e, + vm_bytes, + ) + return vm_bytes def _get_stress_vm_method(pod: str) -> str: - """Detect the best --vm-method argument for stress-ng on this node. - - stress-ng vm-method support varies by version and distro: - - Older Ubuntu / some GKE images: supports 'mmap' - - Newer Ubuntu on n4-highmem-32 (kernel 6.8+ GKE): 'mmap' removed; supports - 'write64', 'rand-set', etc. - - We prefer 'mmap' (lowest overhead, no kernel structure cycling), fall back to - 'write64' (simple sequential writes, universally available), then 'rand-set', - and if none are listed we return '' so callers omit the --vm-method flag - entirely (stress-ng then uses its compiled-in default). - - NOTE on forcing swap (two independent requirements): - (a) The working set must exceed RAM. Without --vm-keep each worker re-mmaps - and re-touches its full slice every iteration, so all - --swap_encryption_stress_vm_workers slices are simultaneously resident and - the combined footprint exceeds RAM (run 910c8da5 swapped ~10k pages/s with - write64 and no --vm-keep). Adding --vm-keep made stress-ng reuse one - quiescent mapping, the resident set plateaued below RAM, and the gate - fired — so we must NOT pass --vm-keep. - (b) The workers must stay BUSY for the whole phase. Do NOT pass --vm-hang 0: - stress-ng documents "--vm-hang 0" as "sleep for an INFINITE time before - unmapping", so each worker wrote its slice once and then slept for the - rest of the run — usr+sys CPU was ~10 s out of 300 s and si/so stayed 0 - (runs 14907cff, config1/111, even with KSM disabled and rand-set). - Omitting --vm-hang entirely lets the workers loop continuously, keeping - the slices hot so the over-RAM remainder pages to swap throughout. - - Result is cached in _stress_vm_method so the detection kubectl exec only runs - once per benchmark run. - """ - if _stress_vm_method: - return _stress_vm_method[0] - - try: - # stress-ng prints its valid vm-methods to stdout when given an invalid one. - out, _, _ = kubectl.RunKubectlCommand( - ['exec', (_active_pod[0] if _active_pod else pod), - '-n', _DS_NAMESPACE, - '--', 'bash', '-c', - 'stress-ng --vm 1 --vm-bytes 1M --vm-method __invalid__ --timeout 1s 2>&1 || true'], - raise_on_failure=False, timeout=15, - ) - combined = out.lower() - # Prefer rand-set: random access keeps every page of each worker's slice - # hot (no cold pages behind a sequential write pointer to reclaim) and - # writes non-identical data (so KSM cannot merge the workers' regions). - # write64 is sequential and was empirically reclaimed / merged, leaving the - # resident set below RAM and swap_out ~0. - if 'rand-set' in combined: - method = 'rand-set' - elif 'mmap' in combined: - method = 'mmap' - elif 'write64' in combined: - method = 'write64' - else: - method = '' # omit flag; use stress-ng default - logging.info('[swap_encryption] stress-ng vm-method detected: %r', method or '(default)') - except Exception as e: # pylint: disable=broad-except - logging.warning('[swap_encryption] vm-method detection failed (%s); using rand-set', e) - method = 'rand-set' + """Detect the best --vm-method argument for stress-ng on this node. + + stress-ng vm-method support varies by version and distro: + - Older Ubuntu / some GKE images: supports 'mmap' + - Newer Ubuntu on n4-highmem-32 (kernel 6.8+ GKE): 'mmap' removed; supports + 'write64', 'rand-set', etc. + + We prefer 'mmap' (lowest overhead, no kernel structure cycling), fall back to + 'write64' (simple sequential writes, universally available), then 'rand-set', + and if none are listed we return '' so callers omit the --vm-method flag + entirely (stress-ng then uses its compiled-in default). + + NOTE on forcing swap (two independent requirements): + (a) The working set must exceed RAM. Without --vm-keep each worker re-mmaps + and re-touches its full slice every iteration, so all + --swap_encryption_stress_vm_workers slices are simultaneously resident and + the combined footprint exceeds RAM (run 910c8da5 swapped ~10k pages/s with + write64 and no --vm-keep). Adding --vm-keep made stress-ng reuse one + quiescent mapping, the resident set plateaued below RAM, and the gate + fired — so we must NOT pass --vm-keep. + (b) The workers must stay BUSY for the whole phase. Do NOT pass --vm-hang 0: + stress-ng documents "--vm-hang 0" as "sleep for an INFINITE time before + unmapping", so each worker wrote its slice once and then slept for the + rest of the run — usr+sys CPU was ~10 s out of 300 s and si/so stayed 0 + (runs 14907cff, config1/111, even with KSM disabled and rand-set). + Omitting --vm-hang entirely lets the workers loop continuously, keeping + the slices hot so the over-RAM remainder pages to swap throughout. + + Result is cached in _stress_vm_method so the detection kubectl exec only runs + once per benchmark run. + """ + if _stress_vm_method: + return _stress_vm_method[0] - _stress_vm_method.append(method) - return method + try: + # stress-ng prints its valid vm-methods to stdout when given an invalid one. + out, _, _ = kubectl.RunKubectlCommand( + [ + "exec", + (_active_pod[0] if _active_pod else pod), + "-n", + _DS_NAMESPACE, + "--", + "bash", + "-c", + ( + "stress-ng --vm 1 --vm-bytes 1M --vm-method __invalid__" + " --timeout 1s 2>&1 || true" + ), + ], + raise_on_failure=False, + timeout=15, + ) + combined = out.lower() + # Prefer rand-set: random access keeps every page of each worker's slice + # hot (no cold pages behind a sequential write pointer to reclaim) and + # writes non-identical data (so KSM cannot merge the workers' regions). + # write64 is sequential and was empirically reclaimed / merged, leaving the + # resident set below RAM and swap_out ~0. + if "rand-set" in combined: + method = "rand-set" + elif "mmap" in combined: + method = "mmap" + elif "write64" in combined: + method = "write64" + else: + method = "" # omit flag; use stress-ng default + logging.info( + "[swap_encryption] stress-ng vm-method detected: %r", + method or "(default)", + ) + except Exception as e: # pylint: disable=broad-except + logging.warning( + "[swap_encryption] vm-method detection failed (%s); using rand-set", + e, + ) + method = "rand-set" + + _stress_vm_method.append(method) + return method def _stress_vm_method_flag(pod: str) -> str: - """Return the --vm-method flag string, or empty string if none.""" - method = _get_stress_vm_method(pod) - return f'--vm-method {method}' if method else '' + """Return the --vm-method flag string, or empty string if none.""" + method = _get_stress_vm_method(pod) + return f"--vm-method {method}" if method else "" def _phase2a_cpu_overhead(pod: str, base_meta: dict) -> list[sample.Sample]: - """Measure CPU cost of dm-crypt / Nitro while stress-ng drives swap I/O. - - If --swap_encryption_stress_vm_bytes_list is set the phase is run once per - listed intensity value so that a full pressure-curve is captured (gap 5). - Otherwise the single value from --swap_encryption_stress_vm_bytes is used. - - Auto-scaling: if the requested vm_bytes is less than 95% of node RAM, it is - automatically increased to 110% of node RAM so that swap is actually - triggered on large-RAM machines (e.g. n4-highmem-32 with 256 GB). - """ - # Build the list of vm-bytes intensities to sweep (gap 5) - if _STRESS_VM_BYTES_LIST.value.strip(): - intensities = [v.strip() for v in _STRESS_VM_BYTES_LIST.value.split(',') - if v.strip()] - else: - intensities = [_STRESS_VM_BYTES.value] - - results = [] - for vm_bytes in intensities: - scaled = _autoscale_vm_bytes(pod, vm_bytes) - logging.info('[swap_encryption] Phase 2a: stress-ng intensity %s', scaled) - results += _run_cpu_overhead_sweep(pod, base_meta, scaled) - return results + """Measure CPU cost of dm-crypt / Nitro while stress-ng drives swap I/O. + + If --swap_encryption_stress_vm_bytes_list is set the phase is run once per + listed intensity value so that a full pressure-curve is captured (gap 5). + Otherwise the single value from --swap_encryption_stress_vm_bytes is used. + + Auto-scaling: if the requested vm_bytes is less than 95% of node RAM, it is + automatically increased to 110% of node RAM so that swap is actually + triggered on large-RAM machines (e.g. n4-highmem-32 with 256 GB). + """ + # Build the list of vm-bytes intensities to sweep (gap 5) + if _STRESS_VM_BYTES_LIST.value.strip(): + intensities = [ + v.strip() + for v in _STRESS_VM_BYTES_LIST.value.split(",") + if v.strip() + ] + else: + intensities = [_STRESS_VM_BYTES.value] + + results = [] + for vm_bytes in intensities: + scaled = _autoscale_vm_bytes(pod, vm_bytes) + logging.info( + "[swap_encryption] Phase 2a: stress-ng intensity %s", scaled + ) + results += _run_cpu_overhead_sweep(pod, base_meta, scaled) + return results def _run_cpu_overhead_sweep( pod: str, base_meta: dict, vm_bytes: str ) -> list[sample.Sample]: - """Phase 2a stressor sweep, WITH RETRY for flaky swap. - - Driving the multi-worker rand-set working set past RAM into swap is - empirically non-deterministic on these nodes: the SAME config produced - ~670k pages/s on some runs and <300 on others. So we retry: if an attempt - completes but peak swap-out is below the threshold (and it did not OOM), - reclaim memory and re-run, keeping the BEST attempt. An OOM, or a peak - at/above threshold, ends the retries immediately. - """ - meta = dict(base_meta, phase='cpu_overhead', stress_vm_bytes=vm_bytes) - timeout = _STRESS_TIMEOUT_SEC.value - interval = 2 - n_samples = timeout // interval + 10 - vmstat_log = f'/tmp/pkb_vmstat_{vm_bytes}.log' - pidstat_log = f'/tmp/pkb_pidstat_{vm_bytes}.log' - workers = max(1, _STRESS_VM_WORKERS.value) - per_worker = _per_worker_vm_bytes(vm_bytes, workers) - min_so = _MIN_SWAP_OUT_PAGES.value - method_flag = _stress_vm_method_flag(pod) - max_attempts = 3 - best = None - - for attempt in range(1, max_attempts + 1): - t0 = time.time() - stress_out, _ = _pod_exec(pod, textwrap.dedent(f""" + """Phase 2a stressor sweep, WITH RETRY for flaky swap. + + Driving the multi-worker rand-set working set past RAM into swap is + empirically non-deterministic on these nodes: the SAME config produced + ~670k pages/s on some runs and <300 on others. So we retry: if an attempt + completes but peak swap-out is below the threshold (and it did not OOM), + reclaim memory and re-run, keeping the BEST attempt. An OOM, or a peak + at/above threshold, ends the retries immediately. + """ + meta = dict(base_meta, phase="cpu_overhead", stress_vm_bytes=vm_bytes) + timeout = _STRESS_TIMEOUT_SEC.value + interval = 2 + n_samples = timeout // interval + 10 + vmstat_log = f"/tmp/pkb_vmstat_{vm_bytes}.log" + pidstat_log = f"/tmp/pkb_pidstat_{vm_bytes}.log" + workers = max(1, _STRESS_VM_WORKERS.value) + per_worker = _per_worker_vm_bytes(vm_bytes, workers) + min_so = _MIN_SWAP_OUT_PAGES.value + method_flag = _stress_vm_method_flag(pod) + max_attempts = 3 + best = None + + for attempt in range(1, max_attempts + 1): + t0 = time.time() + stress_out, _ = _pod_exec( + pod, + textwrap.dedent(f""" echo 2 > /sys/kernel/mm/ksm/run 2>/dev/null || true echo 0 > /sys/kernel/mm/ksm/run 2>/dev/null || true sysctl -w vm.swappiness=100 >/dev/null 2>&1 || true @@ -2986,190 +3587,234 @@ def _run_cpu_overhead_sweep( --timeout {timeout}s \\ --metrics-brief 2>&1 || true kill $VMSTAT_PID $PISTAT_PID 2>/dev/null || true - """), timeout=timeout + 60, ignore_failure=True) - elapsed = time.time() - t0 - - completed_cleanly = ('successful run completed' in stress_out.lower() - or 'metrics-brief' in stress_out.lower() - or 'bogo-ops' in stress_out.lower()) - oom_killed = (not completed_cleanly) and elapsed < timeout * 0.8 - vmstat_out, _ = _pod_exec(pod, f'cat {vmstat_log}', ignore_failure=True) - pidstat_out, _ = _pod_exec(pod, f'cat {pidstat_log}', ignore_failure=True) - vmstat_samples = _parse_vmstat(vmstat_out, meta) - swap_out_max = max( - (s.value for s in vmstat_samples - if s.metric in ('swap_out_pages_per_sec', - 'swap_out_pages_per_sec_max')), default=0.0) - bogo = None - for line in stress_out.splitlines(): - mm = re.search(r'vm\s+\d+\s+(\d+)\s+\S+\s+bogo-ops', line) - if mm: - bogo = float(mm.group(1)) - break - logging.info('[swap_encryption] Phase 2a attempt %d/%d: peak swap-out ' - '%.0f pages/s (completed=%s, oom=%s)', attempt, max_attempts, - swap_out_max, completed_cleanly, oom_killed) - if best is None or swap_out_max > best['swap_out_max']: - best = dict(elapsed=elapsed, oom_killed=oom_killed, - swap_out_max=swap_out_max, vmstat_samples=vmstat_samples, - pidstat_out=pidstat_out, bogo=bogo) - if oom_killed or swap_out_max >= min_so: - break - if attempt < max_attempts: - logging.warning('[swap_encryption] Phase 2a swap-out %.0f < %d threshold ' - '— reclaiming and retrying (%d/%d)', swap_out_max, min_so, - attempt + 1, max_attempts) - _pod_exec(pod, textwrap.dedent(""" + """), + timeout=timeout + 60, + ignore_failure=True, + ) + elapsed = time.time() - t0 + + completed_cleanly = ( + "successful run completed" in stress_out.lower() + or "metrics-brief" in stress_out.lower() + or "bogo-ops" in stress_out.lower() + ) + oom_killed = (not completed_cleanly) and elapsed < timeout * 0.8 + vmstat_out, _ = _pod_exec(pod, f"cat {vmstat_log}", ignore_failure=True) + pidstat_out, _ = _pod_exec( + pod, f"cat {pidstat_log}", ignore_failure=True + ) + vmstat_samples = _parse_vmstat(vmstat_out, meta) + swap_out_max = max( + ( + s.value + for s in vmstat_samples + if s.metric + in ("swap_out_pages_per_sec", "swap_out_pages_per_sec_max") + ), + default=0.0, + ) + bogo = None + for line in stress_out.splitlines(): + mm = re.search(r"vm\s+\d+\s+(\d+)\s+\S+\s+bogo-ops", line) + if mm: + bogo = float(mm.group(1)) + break + logging.info( + "[swap_encryption] Phase 2a attempt %d/%d: peak swap-out " + "%.0f pages/s (completed=%s, oom=%s)", + attempt, + max_attempts, + swap_out_max, + completed_cleanly, + oom_killed, + ) + if best is None or swap_out_max > best["swap_out_max"]: + best = dict( + elapsed=elapsed, + oom_killed=oom_killed, + swap_out_max=swap_out_max, + vmstat_samples=vmstat_samples, + pidstat_out=pidstat_out, + bogo=bogo, + ) + if oom_killed or swap_out_max >= min_so: + break + if attempt < max_attempts: + logging.warning( + "[swap_encryption] Phase 2a swap-out %.0f < %d threshold " + "— reclaiming and retrying (%d/%d)", + swap_out_max, + min_so, + attempt + 1, + max_attempts, + ) + _pod_exec( + pod, + textwrap.dedent(""" echo -1000 > /proc/self/oom_score_adj 2>/dev/null || true pkill -9 stress-ng 2>/dev/null || true sleep 3; sync; echo 1 > /proc/sys/vm/drop_caches 2>/dev/null || true - """), ignore_failure=True, timeout=60) - - # Emit samples from the BEST attempt. - results = [ - sample.Sample('stress_ng_duration_sec', best['elapsed'], 's', meta), - sample.Sample('stress_ng_completed', - 0.0 if best['oom_killed'] else 1.0, 'status', meta), - ] - if best['bogo'] is not None: - results.append(sample.Sample('stress_ng_bogo_ops', best['bogo'], 'ops', - meta)) - results += best['vmstat_samples'] - results += _parse_pidstat(best['pidstat_out'], meta) - - # Swap-activity gate: a completed run that moved ~no pages to swap never - # exercised the encrypted swap path (the headline numbers would be hollow). - if best['oom_killed']: - msg = (f'stress-ng (vm_bytes={vm_bytes}) was OOM-killed — the cgroup could ' - f'not page anonymous memory out to swap; swap-encryption overhead ' - f'was not measured') - logging.error('[swap_encryption] %s', msg) - _degraded_reasons.append(msg) - elif best['swap_out_max'] < min_so: - msg = (f'stress-ng (vm_bytes={vm_bytes}) peak swap-out was only ' - f'{best["swap_out_max"]:.0f} pages/s (< {min_so} threshold) after ' - f'{max_attempts} attempts — the working set never meaningfully ' - f'paged to swap. Check vm_bytes vs RAM and the swap device') - logging.error('[swap_encryption] %s', msg) - _degraded_reasons.append(msg) - - return results + """), + ignore_failure=True, + timeout=60, + ) + + # Emit samples from the BEST attempt. + results = [ + sample.Sample("stress_ng_duration_sec", best["elapsed"], "s", meta), + sample.Sample( + "stress_ng_completed", + 0.0 if best["oom_killed"] else 1.0, + "status", + meta, + ), + ] + if best["bogo"] is not None: + results.append( + sample.Sample("stress_ng_bogo_ops", best["bogo"], "ops", meta) + ) + results += best["vmstat_samples"] + results += _parse_pidstat(best["pidstat_out"], meta) + + # Swap-activity gate: a completed run that moved ~no pages to swap never + # exercised the encrypted swap path (the headline numbers would be hollow). + if best["oom_killed"]: + msg = ( + f"stress-ng (vm_bytes={vm_bytes}) was OOM-killed — the cgroup could" + " not page anonymous memory out to swap; swap-encryption overhead" + " was not measured" + ) + logging.error("[swap_encryption] %s", msg) + _degraded_reasons.append(msg) + elif best["swap_out_max"] < min_so: + msg = ( + f"stress-ng (vm_bytes={vm_bytes}) peak swap-out was only " + f'{best["swap_out_max"]:.0f} pages/s (< {min_so} threshold) after ' + f"{max_attempts} attempts — the working set never meaningfully " + f"paged to swap. Check vm_bytes vs RAM and the swap device" + ) + logging.error("[swap_encryption] %s", msg) + _degraded_reasons.append(msg) + + return results def _parse_vmstat(output: str, base_meta: dict) -> list[sample.Sample]: - """Parse vmstat output for swap rates AND CPU utilisation. - - Standard vmstat column layout (non-header data lines, 0-indexed): - r b swpd free buff cache si so bi bo in cs us sy id wa st - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - - si=6, so=7 – swap-in / swap-out pages/s - us=12 – user CPU % - sy=13 – system (kernel) CPU % ← gap 2: system time % - id=14 – idle CPU % - wa=15 – I/O wait CPU % - total_active = us + sy + wa ← gap 1: total CPU utilisation - """ - si_vals, so_vals = [], [] - us_vals, sy_vals, wa_vals = [], [], [] - - for line in output.splitlines(): - parts = line.split() - if len(parts) < 17 or not parts[0].isdigit(): - continue - try: - si_vals.append(float(parts[6])) - so_vals.append(float(parts[7])) - us_vals.append(float(parts[12])) - sy_vals.append(float(parts[13])) - wa_vals.append(float(parts[15])) - except (ValueError, IndexError): - pass - - if not si_vals: - return [] - - meta = dict(base_meta, metric_source='vmstat') - - def _mean(lst): - return sum(lst) / len(lst) if lst else 0.0 - - def _peak(lst): - return max(lst) if lst else 0.0 - - total_active = [u + s + w for u, s, w in zip(us_vals, sy_vals, wa_vals)] - - return [ - # Swap rates - sample.Sample( - 'swap_in_pages_per_sec', _mean(si_vals), 'pages/s', meta), - sample.Sample( - 'swap_in_pages_per_sec_max', _peak(si_vals), 'pages/s', meta), - sample.Sample( - 'swap_out_pages_per_sec', _mean(so_vals), 'pages/s', meta), - sample.Sample( - 'swap_out_pages_per_sec_max', _peak(so_vals), 'pages/s', meta), - # Total CPU utilisation (gap 1) - sample.Sample( - 'total_cpu_pct_avg', _mean(total_active), '%', meta), - sample.Sample( - 'total_cpu_pct_max', _peak(total_active), '%', meta), - # System (kernel) time % – encryption overhead signal (gap 2) - sample.Sample('system_time_pct_avg', _mean(sy_vals), '%', meta), - sample.Sample('system_time_pct_max', _peak(sy_vals), '%', meta), - # User and I/O-wait for completeness - sample.Sample('user_cpu_pct_avg', _mean(us_vals), '%', meta), - sample.Sample('iowait_cpu_pct_avg', _mean(wa_vals), '%', meta), - ] + """Parse vmstat output for swap rates AND CPU utilisation. + + Standard vmstat column layout (non-header data lines, 0-indexed): + r b swpd free buff cache si so bi bo in cs us sy id wa st + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + + si=6, so=7 – swap-in / swap-out pages/s + us=12 – user CPU % + sy=13 – system (kernel) CPU % ← gap 2: system time % + id=14 – idle CPU % + wa=15 – I/O wait CPU % + total_active = us + sy + wa ← gap 1: total CPU utilisation + """ + si_vals, so_vals = [], [] + us_vals, sy_vals, wa_vals = [], [], [] + + for line in output.splitlines(): + parts = line.split() + if len(parts) < 17 or not parts[0].isdigit(): + continue + try: + si_vals.append(float(parts[6])) + so_vals.append(float(parts[7])) + us_vals.append(float(parts[12])) + sy_vals.append(float(parts[13])) + wa_vals.append(float(parts[15])) + except (ValueError, IndexError): + pass + + if not si_vals: + return [] + + meta = dict(base_meta, metric_source="vmstat") + + def _mean(lst): + return sum(lst) / len(lst) if lst else 0.0 + + def _peak(lst): + return max(lst) if lst else 0.0 + + total_active = [u + s + w for u, s, w in zip(us_vals, sy_vals, wa_vals)] + + return [ + # Swap rates + sample.Sample("swap_in_pages_per_sec", _mean(si_vals), "pages/s", meta), + sample.Sample( + "swap_in_pages_per_sec_max", _peak(si_vals), "pages/s", meta + ), + sample.Sample( + "swap_out_pages_per_sec", _mean(so_vals), "pages/s", meta + ), + sample.Sample( + "swap_out_pages_per_sec_max", _peak(so_vals), "pages/s", meta + ), + # Total CPU utilisation (gap 1) + sample.Sample("total_cpu_pct_avg", _mean(total_active), "%", meta), + sample.Sample("total_cpu_pct_max", _peak(total_active), "%", meta), + # System (kernel) time % – encryption overhead signal (gap 2) + sample.Sample("system_time_pct_avg", _mean(sy_vals), "%", meta), + sample.Sample("system_time_pct_max", _peak(sy_vals), "%", meta), + # User and I/O-wait for completeness + sample.Sample("user_cpu_pct_avg", _mean(us_vals), "%", meta), + sample.Sample("iowait_cpu_pct_avg", _mean(wa_vals), "%", meta), + ] def _parse_pidstat(output: str, base_meta: dict) -> list[sample.Sample]: - """Parse CPU % for swap/encryption-related kernel threads from pidstat.""" - cpu_by_proc: dict[str, list[float]] = {} - for line in output.splitlines(): - parts = line.split() - if len(parts) < 9: - continue - proc = parts[-1] - if not any(t in proc for t in _CRYPTO_PROCS): - continue - try: - cpu_by_proc.setdefault(proc, []).append(float(parts[7])) - except (ValueError, IndexError): - pass - results = [] - meta = dict(base_meta, metric_source='pidstat') - for proc, vals in cpu_by_proc.items(): - m = dict(meta, process=proc) - results += [ - sample.Sample(f'cpu_pct_avg_{proc}', sum(vals) / len(vals), '%', m), - sample.Sample(f'cpu_pct_max_{proc}', max(vals), '%', m), - ] - return results + """Parse CPU % for swap/encryption-related kernel threads from pidstat.""" + cpu_by_proc: dict[str, list[float]] = {} + for line in output.splitlines(): + parts = line.split() + if len(parts) < 9: + continue + proc = parts[-1] + if not any(t in proc for t in _CRYPTO_PROCS): + continue + try: + cpu_by_proc.setdefault(proc, []).append(float(parts[7])) + except (ValueError, IndexError): + pass + results = [] + meta = dict(base_meta, metric_source="pidstat") + for proc, vals in cpu_by_proc.items(): + m = dict(meta, process=proc) + results += [ + sample.Sample(f"cpu_pct_avg_{proc}", sum(vals) / len(vals), "%", m), + sample.Sample(f"cpu_pct_max_{proc}", max(vals), "%", m), + ] + return results def _launch_confined_bg_stress(pod: str, timeout_s: int, logfile: str) -> None: - """Launch the Phase 2b/3a background swap stressor confined to its OWN - memory-capped cgroup, so it drives swap pressure WITHOUT starving the - concurrent foreground workload (fio / Redis) or OOM-killing the pod. - - On a small node (config1, 30 GB) a flat 32 GB stressor plus a concurrent - workload exhausts RAM faster than the kernel pages out, and the OOM killer - takes the foreground process (the under-pressure app_io fio died with - rc=137). Confining the stressor to memory.max = 60% of RAM (with unlimited - swap) makes it page within its own budget; the other ~40% of RAM stays free - for the workload, and if the stressor overruns its cap only IT is killed — - never the pod or the workload. - - Config-2 safety: on a 256 GB node, 60% = ~150 GB, far above the 32 GB - stressor, so the cap is never reached and behaviour is unchanged. - Best-effort: if the cgroup can't be created the stressor still runs in the - main cgroup (degrades to prior behaviour, not worse). MemTotal is read with - grep/cut (no awk) to keep this clear of f-string brace escaping. - """ - method = _stress_vm_method_flag(pod) - vm_bytes = _STRESS_VM_BYTES.value - _pod_exec(pod, textwrap.dedent(f""" + """Launch the Phase 2b/3a background swap stressor confined to its OWN + memory-capped cgroup, so it drives swap pressure WITHOUT starving the + concurrent foreground workload (fio / Redis) or OOM-killing the pod. + + On a small node (config1, 30 GB) a flat 32 GB stressor plus a concurrent + workload exhausts RAM faster than the kernel pages out, and the OOM killer + takes the foreground process (the under-pressure app_io fio died with + rc=137). Confining the stressor to memory.max = 60% of RAM (with unlimited + swap) makes it page within its own budget; the other ~40% of RAM stays free + for the workload, and if the stressor overruns its cap only IT is killed — + never the pod or the workload. + + Config-2 safety: on a 256 GB node, 60% = ~150 GB, far above the 32 GB + stressor, so the cap is never reached and behaviour is unchanged. + Best-effort: if the cgroup can't be created the stressor still runs in the + main cgroup (degrades to prior behaviour, not worse). MemTotal is read with + grep/cut (no awk) to keep this clear of f-string brace escaping. + """ + method = _stress_vm_method_flag(pod) + vm_bytes = _STRESS_VM_BYTES.value + _pod_exec( + pod, + textwrap.dedent(f""" nohup bash -c ' BG=/sys/fs/cgroup/pkb_bgstress mkdir -p "$BG" 2>/dev/null || true @@ -3182,27 +3827,31 @@ def _launch_confined_bg_stress(pod: str, timeout_s: int, logfile: str) -> None: ' >{logfile} 2>&1 & disown echo STRESS_STARTED - """), timeout=30) + """), + timeout=30, + ) def _set_memory_high_guard(pod: str, fraction: float = 0.9) -> None: - """Cap the container cgroup ``memory.high`` at `fraction` x RAM. - - Phases 2b (I/O interference) and 3a (Redis) run a background stressor *and* a - concurrent foreground workload (an 8 GB fio file / a Redis dataset). On a - small-RAM node (config1, 30 GB) their combined footprint exceeds RAM and the - hard OOM killer (``memory.max``) terminates the pod (rc=137), wiping out both - phases. ``memory.high`` is a soft limit: when the cgroup crosses it the - kernel reclaims and *swaps* aggressively (throttling the cgroup) instead of - killing it — which is exactly the swap pressure these phases want to create. - - Config-2 safety: this is a no-op in effect on large-RAM nodes. On - n4-highmem-32 (256 GB) the 32 GB background workload never approaches 0.9 x - 256 GB = 230 GB, so the soft limit is never crossed and behaviour is - unchanged. Phase 2a is deliberately NOT guarded (it works on both configs). - Best-effort; any failure is ignored. - """ - _pod_exec(pod, textwrap.dedent(f""" + """Cap the container cgroup ``memory.high`` at `fraction` x RAM. + + Phases 2b (I/O interference) and 3a (Redis) run a background stressor *and* a + concurrent foreground workload (an 8 GB fio file / a Redis dataset). On a + small-RAM node (config1, 30 GB) their combined footprint exceeds RAM and the + hard OOM killer (``memory.max``) terminates the pod (rc=137), wiping out both + phases. ``memory.high`` is a soft limit: when the cgroup crosses it the + kernel reclaims and *swaps* aggressively (throttling the cgroup) instead of + killing it — which is exactly the swap pressure these phases want to create. + + Config-2 safety: this is a no-op in effect on large-RAM nodes. On + n4-highmem-32 (256 GB) the 32 GB background workload never approaches 0.9 x + 256 GB = 230 GB, so the soft limit is never crossed and behaviour is + unchanged. Phase 2a is deliberately NOT guarded (it works on both configs). + Best-effort; any failure is ignored. + """ + _pod_exec( + pod, + textwrap.dedent(f""" PKB_MCG=$(awk -F: '/^0::/{{print $3}}' /proc/self/cgroup 2>/dev/null) MT_KB=$(awk '/MemTotal/{{print $2}}' /proc/meminfo) HIGH=$(( MT_KB * 1024 / 100 * {int(fraction * 100)} )) @@ -3211,51 +3860,68 @@ def _set_memory_high_guard(pod: str, fraction: float = 0.9) -> None: && echo "[pkb] memory.high set to $HIGH bytes ({int(fraction * 100)}% RAM) — pod will swap, not OOM" \ || echo "[pkb] WARNING: could not set memory.high" >&2 fi - """), ignore_failure=True, timeout=30, _retries=0) + """), + ignore_failure=True, + timeout=30, + _retries=0, + ) def _reset_memory_high_guard(pod: str) -> None: - """Restore ``memory.high`` to ``max`` after a guarded phase.""" - _pod_exec(pod, textwrap.dedent(""" + """Restore ``memory.high`` to ``max`` after a guarded phase.""" + _pod_exec( + pod, + textwrap.dedent(""" PKB_MCG=$(awk -F: '/^0::/{print $3}' /proc/self/cgroup 2>/dev/null) if [ -n "$PKB_MCG" ] && [ -f "/sys/fs/cgroup$PKB_MCG/memory.high" ]; then echo max > "/sys/fs/cgroup$PKB_MCG/memory.high" 2>/dev/null || true fi - """), ignore_failure=True, timeout=30, _retries=0) + """), + ignore_failure=True, + timeout=30, + _retries=0, + ) def _phase2b_io_interference(pod: str, base_meta: dict) -> list[sample.Sample]: - """Quantify drop in application I/O when swap is under simultaneous pressure.""" - results = [] - # IMPORTANT: keep this OFF tmpfs. /tmp is RAM-backed (tmpfs/overlay), so an - # 8 GB fio file there consumes 8 GB of RAM and OOM-kills the pod on a small - # node (config1, rc=137 at "Laying out IO file") before any swap pressure is - # even applied. /mnt/stateful_partition is the node's persistent boot disk - # (hostPath mount) — the file lives on disk, not RAM, and the fio results - # then measure real disk I/O under swap pressure, which is the intent. - app_file = '/mnt/stateful_partition/pkb_app_io' - timeout = _STRESS_TIMEOUT_SEC.value - meta = dict(base_meta, phase='io_interference') - - # Relieve memory pressure via swap rather than the OOM killer (see helper). - # No-op on large-RAM nodes; prevents the config1 Phase 2b OOM (rc=137). - _set_memory_high_guard(pod) - - # Ensure fio is available — apt-get may have failed during DaemonSet init. - _pod_exec(pod, textwrap.dedent(""" + """Quantify drop in application I/O when swap is under simultaneous pressure.""" + results = [] + # IMPORTANT: keep this OFF tmpfs. /tmp is RAM-backed (tmpfs/overlay), so an + # 8 GB fio file there consumes 8 GB of RAM and OOM-kills the pod on a small + # node (config1, rc=137 at "Laying out IO file") before any swap pressure is + # even applied. /mnt/stateful_partition is the node's persistent boot disk + # (hostPath mount) — the file lives on disk, not RAM, and the fio results + # then measure real disk I/O under swap pressure, which is the intent. + app_file = "/mnt/stateful_partition/pkb_app_io" + timeout = _STRESS_TIMEOUT_SEC.value + meta = dict(base_meta, phase="io_interference") + + # Relieve memory pressure via swap rather than the OOM killer (see helper). + # No-op on large-RAM nodes; prevents the config1 Phase 2b OOM (rc=137). + _set_memory_high_guard(pod) + + # Ensure fio is available — apt-get may have failed during DaemonSet init. + _pod_exec( + pod, + textwrap.dedent(""" command -v fio >/dev/null 2>&1 || { apt-get install -y -qq fio 2>/dev/null || true } - """), ignore_failure=True, timeout=120) - - # Reclaim node memory BEFORE creating the test file. By this point Phase 2a - # has hard-swapped the node and Phase 3c's OpenSearch (which runs first) may - # have left a multi-GB JVM footprint; on a 30 GB node the file create then - # gets OOM-killed (rc=137) at the NODE level — which neither --direct=1 nor - # the cgroup memory.high guard can prevent (those are cgroup/page-cache - # tools, not node-eviction controls). Kill any leftover stressors/servers, - # flush dirty pages, and drop caches so the node starts Phase 2b clean. - _pod_exec(pod, textwrap.dedent(""" + """), + ignore_failure=True, + timeout=120, + ) + + # Reclaim node memory BEFORE creating the test file. By this point Phase 2a + # has hard-swapped the node and Phase 3c's OpenSearch (which runs first) may + # have left a multi-GB JVM footprint; on a 30 GB node the file create then + # gets OOM-killed (rc=137) at the NODE level — which neither --direct=1 nor + # the cgroup memory.high guard can prevent (those are cgroup/page-cache + # tools, not node-eviction controls). Kill any leftover stressors/servers, + # flush dirty pages, and drop caches so the node starts Phase 2b clean. + _pod_exec( + pod, + textwrap.dedent(""" pkill -9 stress-ng 2>/dev/null || true pkill -9 -f 'opensearch|elasticsearch' 2>/dev/null || true pkill -9 redis-server 2>/dev/null || true @@ -3263,171 +3929,187 @@ def _phase2b_io_interference(pod: str, base_meta: dict) -> list[sample.Sample]: echo 3 > /proc/sys/vm/drop_caches 2>/dev/null || true sleep 2 echo "[pkb] pre-2b MemAvailable_kB=$(awk '/MemAvailable/{print $2}' /proc/meminfo) SwapFree_kB=$(awk '/SwapFree/{print $2}' /proc/meminfo)" - """), ignore_failure=True, timeout=60) - - # Create the test file on the persistent disk (see app_file note above). - # --direct=1 (O_DIRECT, ext4 supports it) bypasses the page cache. Size is - # kept at 4 GB (not 8) so the create + the concurrent background stressor - # cannot exhaust a 30 GB node even with swap already in use. - _pod_exec(pod, ( - f'fio --name=create --filename={app_file} ' - f'--rw=write --bs=1m --size=4G --verify=0 --direct=1' - ), timeout=600, ignore_failure=True) - - def _run_app_fio(pressure_label: str) -> list[sample.Sample]: - # --direct=1 (O_DIRECT) avoids page-cache buildup; ext4 on the persistent - # disk supports it. --size=4G matches the file created above. This - # measures the disk's I/O under swap pressure directly. - cmd = ( - f'fio --name=app_io --filename={app_file} ' - f'--ioengine=libaio --direct=1 ' - f'--rw=randrw --bs=4k --iodepth=32 --size=4G --verify=0 ' - f'--time_based --runtime=60s --output-format=json' - ) - # ignore_failure=True: fio rc=137 is expected when the pod is OOM-evicted - # under heavy swap pressure. _pod_exec handles recovery; callers rely on - # _parse_fio_json returning [] on empty/bad output rather than an exception. - out, _ = _pod_exec(pod, cmd, ignore_failure=True) - return _parse_fio_json( - out, 'app_io', f'App I/O ({pressure_label})', - dict(meta, pressure=pressure_label), - ) - - # 1. Baseline – no swap pressure - logging.info('[swap_encryption] I/O interference: baseline (no pressure)') - results += _run_app_fio('no_pressure') - - # 2. Under swap pressure - # Use nohup + disown so bash exits immediately after launching stress-ng; - # otherwise kubectl exec keeps the session alive until stress-ng finishes - # (300 s) and PKB's IssueCommand times out. - logging.info('[swap_encryption] I/O interference: under swap pressure') - # Confined background stressor: pages within a 60%-RAM cgroup so it can't - # OOM the concurrent app_io fio on a small node (see helper). - _launch_confined_bg_stress(pod, timeout, '/tmp/pkb_stress_io.log') - time.sleep(10) # let swap pressure build - results += _run_app_fio('with_swap_pressure') - - # Stop background stress-ng. If the pod was OOM-evicted while fio ran, - # stress-ng is already dead — kill is a no-op and we skip the long wait. - # _retries=0: no recovery here; the first Phase 3a command will recover - # the pod properly if needed (and it already waits for /tmp/pkb_ready). - _pod_exec(pod, 'pkill -9 stress-ng 2>/dev/null || true', - ignore_failure=True, _retries=0, timeout=15) - _reset_memory_high_guard(pod) - return results + """), + ignore_failure=True, + timeout=60, + ) + + # Create the test file on the persistent disk (see app_file note above). + # --direct=1 (O_DIRECT, ext4 supports it) bypasses the page cache. Size is + # kept at 4 GB (not 8) so the create + the concurrent background stressor + # cannot exhaust a 30 GB node even with swap already in use. + _pod_exec( + pod, + ( + f"fio --name=create --filename={app_file} " + "--rw=write --bs=1m --size=4G --verify=0 --direct=1" + ), + timeout=600, + ignore_failure=True, + ) + + def _run_app_fio(pressure_label: str) -> list[sample.Sample]: + # --direct=1 (O_DIRECT) avoids page-cache buildup; ext4 on the persistent + # disk supports it. --size=4G matches the file created above. This + # measures the disk's I/O under swap pressure directly. + cmd = ( + f"fio --name=app_io --filename={app_file} " + "--ioengine=libaio --direct=1 " + "--rw=randrw --bs=4k --iodepth=32 --size=4G --verify=0 " + "--time_based --runtime=60s --output-format=json" + ) + # ignore_failure=True: fio rc=137 is expected when the pod is OOM-evicted + # under heavy swap pressure. _pod_exec handles recovery; callers rely on + # _parse_fio_json returning [] on empty/bad output rather than an exception. + out, _ = _pod_exec(pod, cmd, ignore_failure=True) + return _parse_fio_json( + out, + "app_io", + f"App I/O ({pressure_label})", + dict(meta, pressure=pressure_label), + ) + + # 1. Baseline – no swap pressure + logging.info("[swap_encryption] I/O interference: baseline (no pressure)") + results += _run_app_fio("no_pressure") + + # 2. Under swap pressure + # Use nohup + disown so bash exits immediately after launching stress-ng; + # otherwise kubectl exec keeps the session alive until stress-ng finishes + # (300 s) and PKB's IssueCommand times out. + logging.info("[swap_encryption] I/O interference: under swap pressure") + # Confined background stressor: pages within a 60%-RAM cgroup so it can't + # OOM the concurrent app_io fio on a small node (see helper). + _launch_confined_bg_stress(pod, timeout, "/tmp/pkb_stress_io.log") + time.sleep(10) # let swap pressure build + results += _run_app_fio("with_swap_pressure") + + # Stop background stress-ng. If the pod was OOM-evicted while fio ran, + # stress-ng is already dead — kill is a no-op and we skip the long wait. + # _retries=0: no recovery here; the first Phase 3a command will recover + # the pod properly if needed (and it already waits for /tmp/pkb_ready). + _pod_exec( + pod, + "pkill -9 stress-ng 2>/dev/null || true", + ignore_failure=True, + _retries=0, + timeout=15, + ) + _reset_memory_high_guard(pod) + return results _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) - 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD - 'c4-standard-8': 0.5008, # 8 vCPU, 32 GB RAM, no LSSD - 'n4-highmem-32': 3.0256, # 32 vCPU, 256 GB RAM - 'n2-highmem-32': 2.5216, # 32 vCPU, 256 GB RAM - 'n2-standard-32': 1.5264, # 32 vCPU, 120 GB RAM - 'z3-highmem-8': 2.7248, # 8 vCPU + 4× LSSD + "c4-standard-8-lssd": 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD + "c4-standard-8": 0.5008, # 8 vCPU, 32 GB RAM, no LSSD + "n4-highmem-32": 3.0256, # 32 vCPU, 256 GB RAM + "n2-highmem-32": 2.5216, # 32 vCPU, 256 GB RAM + "n2-standard-32": 1.5264, # 32 vCPU, 120 GB RAM + "z3-highmem-8": 2.7248, # 8 vCPU + 4× LSSD # AWS - 'i4i.4xlarge': 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store - 'i4i.2xlarge': 0.7480, - 'm6id.4xlarge': 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store - 'm6i.4xlarge': 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store - 'r6i.4xlarge': 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store + "i4i.4xlarge": 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store + "i4i.2xlarge": 0.7480, + "m6id.4xlarge": 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store + "m6i.4xlarge": 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store + "r6i.4xlarge": 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store } def _collect_cost_sample( pod: str, elapsed_sec: float, base_meta: dict ) -> list[sample.Sample]: - """Emit a cost_estimate_usd sample for the benchmark run (gap 7). - - Instance type is read from cloud metadata inside the pod. Price is looked - up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and - a warning is logged. - - Args: - pod: Benchmark pod name. - elapsed_sec: Wall-clock seconds the benchmark phases took. - base_meta: Shared metadata dict. - - Returns: - A list of zero or one sample.Sample. - """ - # Detect instance type from cloud metadata - instance_type = '' - - # GCP: machine type is the last segment of the metadata URL value - gcp_type_out, _ = _pod_exec( - pod, - 'curl -s -m 3 --fail ' - 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' - '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_type_out.strip(): - instance_type = gcp_type_out.strip().split('/')[-1] - - if not instance_type: - # AWS: instance-type is a plain string - aws_type_out, _ = _pod_exec( - pod, - 'curl -s -m 3 --fail ' - 'http://169.254.169.254/latest/meta-data/instance-type ' - '2>/dev/null || echo ""', - ignore_failure=True, - ) - instance_type = aws_type_out.strip() + """Emit a cost_estimate_usd sample for the benchmark run (gap 7). - # Allow explicit override (useful when running on custom/renamed machine - # types or when the pod was unavailable during cost collection). - if _INSTANCE_SIZE_LABEL.value: - instance_type = _INSTANCE_SIZE_LABEL.value + Instance type is read from cloud metadata inside the pod. Price is looked + up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and + a warning is logged. - # Last resort: fall back to the benchmark machine type flag. This ensures - # cost tracking works even when the pod was evicted before cost collection - # ran (in which case the metadata curl above returned empty). - if not instance_type and _BENCHMARK_MACHINE_TYPE.value: - instance_type = _BENCHMARK_MACHINE_TYPE.value - logging.info( - '[swap_encryption] Instance type from metadata unavailable; ' - 'using --swap_encryption_benchmark_machine_type=%s for cost tracking', - instance_type, - ) + Args: + pod: Benchmark pod name. + elapsed_sec: Wall-clock seconds the benchmark phases took. + base_meta: Shared metadata dict. - price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) - if price is None: - logging.warning( - '[swap_encryption] Unknown instance type "%s" – skipping cost sample. ' - 'Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost tracking.', - instance_type, + Returns: + A list of zero or one sample.Sample. + """ + # Detect instance type from cloud metadata + instance_type = "" + + # GCP: machine type is the last segment of the metadata URL value + gcp_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail" + " http://metadata.google.internal/computeMetadata/v1/instance/machine-type" + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, ) - return [] + if gcp_type_out.strip(): + instance_type = gcp_type_out.strip().split("/")[-1] + + if not instance_type: + # AWS: instance-type is a plain string + aws_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail " + "http://169.254.169.254/latest/meta-data/instance-type " + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_type = aws_type_out.strip() + + # Allow explicit override (useful when running on custom/renamed machine + # types or when the pod was unavailable during cost collection). + if _INSTANCE_SIZE_LABEL.value: + instance_type = _INSTANCE_SIZE_LABEL.value + + # Last resort: fall back to the benchmark machine type flag. This ensures + # cost tracking works even when the pod was evicted before cost collection + # ran (in which case the metadata curl above returned empty). + if not instance_type and _BENCHMARK_MACHINE_TYPE.value: + instance_type = _BENCHMARK_MACHINE_TYPE.value + logging.info( + "[swap_encryption] Instance type from metadata unavailable; using" + " --swap_encryption_benchmark_machine_type=%s for cost tracking", + instance_type, + ) - hours = elapsed_sec / 3600.0 - cost = hours * price - meta = dict( - base_meta, - instance_type=instance_type, - price_usd_per_hr=price, - benchmark_elapsed_sec=round(elapsed_sec, 1), - ) - return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)] + price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) + if price is None: + logging.warning( + '[swap_encryption] Unknown instance type "%s" – skipping cost' + " sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost" + " tracking.", + instance_type, + ) + return [] + + hours = elapsed_sec / 3600.0 + cost = hours * price + meta = dict( + base_meta, + instance_type=instance_type, + price_usd_per_hr=price, + benchmark_elapsed_sec=round(elapsed_sec, 1), + ) + return [sample.Sample("cost_estimate_usd", cost, "USD", meta)] def _detect_swap_device(pod: str) -> str: - """Return the active swap device path on the cluster node.""" - if _SWAP_DEVICE.value: - return _SWAP_DEVICE.value - - # /proc/swaps is the source of truth: it lists the swap device that is - # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, - # because a stale dm-crypt mapping from a previous run on a reused node can - # still exist as a /dev node while being non-functional (fio/swapoff then - # fail with "No such device or address"). So read the active device from - # /proc/swaps first; only fall back to the mapper path if /proc/swaps is - # somehow empty but the mapper is genuinely present. - dm_out, _ = _pod_exec( - pod, - textwrap.dedent(""" + """Return the active swap device path on the cluster node.""" + if _SWAP_DEVICE.value: + return _SWAP_DEVICE.value + + # /proc/swaps is the source of truth: it lists the swap device that is + # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, + # because a stale dm-crypt mapping from a previous run on a reused node can + # still exist as a /dev node while being non-functional (fio/swapoff then + # fail with "No such device or address"). So read the active device from + # /proc/swaps first; only fall back to the mapper path if /proc/swaps is + # somehow empty but the mapper is genuinely present. + dm_out, _ = _pod_exec( + pod, + textwrap.dedent(""" ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) if [ -n "$ACTIVE" ] then @@ -3437,107 +4119,109 @@ def _detect_swap_device(pod: str) -> str: echo /dev/mapper/swap_encrypted fi """), - ignore_failure=True, - ) - dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' - if dev: - return dev - raise ValueError( - 'No active swap device found in the benchmark pod. ' - 'Use --swap_encryption_device to specify one.' - ) - - -def _build_metadata(pod: str, swap_dev: str) -> dict: - """Collect node environment, encryption type, and config into a dict.""" - - kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True) - mem_out, _ = _pod_exec( - pod, "awk '/MemTotal/{print $2}' /proc/meminfo", - ignore_failure=True, - ) - swap_out, _ = _pod_exec( - pod, "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", - ignore_failure=True, - ) - - try: - mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) - except ValueError: - mem_gb = 0 - try: - swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) - except ValueError: - swap_gb = 0 - - # Encryption type — key off dm-crypt presence + the swap target, NOT the - # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- - # encrypted; only the AWS targets (instance_store / io2) are. - enc = 'unknown' - if '/dev/mapper/' in swap_dev: - table_out, _ = _pod_exec( - pod, - f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', ignore_failure=True, ) - enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' - elif _SWAP_TYPE.value in ('instance_store', 'io2'): - enc = 'nitro_hardware_offload' # AWS: encrypted by the Nitro card - elif not _ENABLE_DMCRYPT.value: - enc = 'none' # GKE plain swap (encryption OFF) + dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else "" + if dev: + return dev + raise ValueError( + "No active swap device found in the benchmark pod. " + "Use --swap_encryption_device to specify one." + ) - cloud = _detect_cloud(pod) - # Gap 6: instance size label for multi-size comparison runs. - # If the flag is set use it directly; otherwise try to read it from - # cloud metadata so that the field is always populated. - instance_label = _INSTANCE_SIZE_LABEL.value - if not instance_label: - gcp_type_out, _ = _pod_exec( +def _build_metadata(pod: str, swap_dev: str) -> dict: + """Collect node environment, encryption type, and config into a dict.""" + + kernel_out, _ = _pod_exec(pod, "uname -r", ignore_failure=True) + mem_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail ' - 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' - '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True, ) - if gcp_type_out.strip(): - instance_label = gcp_type_out.strip().split('/')[-1] - if not instance_label: - aws_type_out, _ = _pod_exec( + swap_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail ' - 'http://169.254.169.254/latest/meta-data/instance-type ' - '2>/dev/null || echo ""', + "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True, ) - instance_label = aws_type_out.strip() - - return { - 'benchmark': BENCHMARK_NAME, - 'execution_mode': 'kubernetes_privileged_pod', - 'cloud': cloud, - 'instance_size': instance_label, - 'kernel_version': kernel_out.strip(), - 'host_memory_gb': mem_gb, - 'swap_device': swap_dev, - 'swap_size_gb': swap_gb, - 'swap_encryption': enc, - # Test-matrix columns: storage target, encryption on/off, image, IOPS - 'storage_target': _SWAP_TYPE.value, - 'boot_disk_type': _BOOT_DISK_TYPE.value, - 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, - 'node_image_type': _NODE_IMAGE_TYPE.value, - 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, - 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, - # Other config - 'zswap_enabled': _ENABLE_ZSWAP.value, - 'min_free_kbytes': _MIN_FREE_KBYTES.value, - 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, - # Requested config value only. The *effective* stress-ng footprint may - # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the - # actual value it ran with as 'stress_vm_bytes' so the two never conflict. - 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, - 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, - 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, - 'nodepool': _NODEPOOL.value, - } + + try: + mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) + except ValueError: + mem_gb = 0 + try: + swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) + except ValueError: + swap_gb = 0 + + # Encryption type — key off dm-crypt presence + the swap target, NOT the + # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- + # encrypted; only the AWS targets (instance_store / io2) are. + enc = "unknown" + if "/dev/mapper/" in swap_dev: + table_out, _ = _pod_exec( + pod, + f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', + ignore_failure=True, + ) + enc = "dm-crypt-plain" if "crypt" in table_out.lower() else "dm-other" + elif _SWAP_TYPE.value in ("instance_store", "io2"): + enc = "nitro_hardware_offload" # AWS: encrypted by the Nitro card + elif not _ENABLE_DMCRYPT.value: + enc = "none" # GKE plain swap (encryption OFF) + + cloud = _detect_cloud(pod) + + # Gap 6: instance size label for multi-size comparison runs. + # If the flag is set use it directly; otherwise try to read it from + # cloud metadata so that the field is always populated. + instance_label = _INSTANCE_SIZE_LABEL.value + if not instance_label: + gcp_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail" + " http://metadata.google.internal/computeMetadata/v1/instance/machine-type" + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_type_out.strip(): + instance_label = gcp_type_out.strip().split("/")[-1] + if not instance_label: + aws_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail " + "http://169.254.169.254/latest/meta-data/instance-type " + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_label = aws_type_out.strip() + + return { + "benchmark": BENCHMARK_NAME, + "execution_mode": "kubernetes_privileged_pod", + "cloud": cloud, + "instance_size": instance_label, + "kernel_version": kernel_out.strip(), + "host_memory_gb": mem_gb, + "swap_device": swap_dev, + "swap_size_gb": swap_gb, + "swap_encryption": enc, + # Test-matrix columns: storage target, encryption on/off, image, IOPS + "storage_target": _SWAP_TYPE.value, + "boot_disk_type": _BOOT_DISK_TYPE.value, + "dmcrypt_enabled": _ENABLE_DMCRYPT.value, + "node_image_type": _NODE_IMAGE_TYPE.value, + "boot_disk_iops_target": _BOOT_DISK_IOPS.value, + "benchmark_machine_type": _BENCHMARK_MACHINE_TYPE.value, + # Other config + "zswap_enabled": _ENABLE_ZSWAP.value, + "min_free_kbytes": _MIN_FREE_KBYTES.value, + "fio_runtime_sec": _FIO_RUNTIME_SEC.value, + # Requested config value only. The *effective* stress-ng footprint may + # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the + # actual value it ran with as 'stress_vm_bytes' so the two never conflict. + "stress_vm_bytes_requested": _STRESS_VM_BYTES.value, + "stress_vm_bytes_list": _STRESS_VM_BYTES_LIST.value, + "stress_timeout_sec": _STRESS_TIMEOUT_SEC.value, + "nodepool": _NODEPOOL.value, + } From 3162242159be38ff19c36531a8fb8849c7e1388c Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Thu, 25 Jun 2026 20:34:04 +0530 Subject: [PATCH 6/8] fix(swap_encryption/pr4): slim DaemonSet, duplicate FLAGS, COS ref, EKS stub - Remove duplicate FLAGS = flags.FLAGS - Add _configure_eks_kubelet_swap() stub (deferred to PR #6780) - DaemonSet: remove workload tools; separate PKB pods (r3457826290) - Remove cgroup v2 unlock hack (r3457928855) - Pre-fetch kernel source for Phase 3b - Fix COS_CONTAINERD -> UBUNTU_CONTAINERD (r3472549985) --- .../linux_benchmarks/swap_encryption_benchmark.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index ffcca8123a..c7bd92fdcd 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -93,7 +93,7 @@ GKE vs. EKS swap encryption and LSSD performance comparison. Two-step nodepool setup: PKB provisions a minimal cluster with a cheap default nodepool (Step 1), then Prepare() adds the real benchmark - nodepool (n4-highmem-32 / c4-*-lssd, COS_CONTAINERD, 80k IOPS) with a + nodepool (n4-highmem-32 / c4-*-lssd, UBUNTU_CONTAINERD, 80k IOPS) with a node-level startup script that configures dm-crypt swap before any pod is scheduled, then removes the default nodepool (Step 2). All benchmark phases run inside a privileged DaemonSet pinned to the benchmark nodepool. @@ -496,7 +496,7 @@ def Prepare(spec) -> None: Step 2 (this function): a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with - COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures + UBUNTU_CONTAINERD, 80 000 IOPS, and a node startup script that configures dm-crypt swap at the OS level — before any pod is scheduled. b. Delete the dummy default nodepool to stop its cost immediately. c. Deploy the privileged DaemonSet (pinned via nodeSelector to the @@ -1155,7 +1155,7 @@ def _create_benchmark_node_pool(cluster) -> None: Uses: --swap_encryption_benchmark_machine_type (default n4-highmem-32) - --swap_encryption_node_image_type (default COS_CONTAINERD) + --swap_encryption_node_image_type (default UBUNTU_CONTAINERD) --swap_encryption_boot_disk_iops (default 80000) --swap_encryption_enable_dmcrypt (default True) From 41d3cb1f7b95af3ee89dfadbe455bae09d89088b Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 16:33:50 +0530 Subject: [PATCH 7/8] refactor(swap_encryption/pr4): thin benchmark using BaseResource pattern - Prepare() uses SwapNodePool + SwapDaemonSet from spec.resources - Cleanup() is empty - PKB framework auto-deletes spec.resources - All _pod_exec(pod, ...) calls replaced with daemonset.PodExec(...) - DaemonSet: remove workload tools; separate PKB pods (r3457826290) - Remove cgroup v2 unlock hack (r3457928855) - Pre-fetch kernel source for Phase 3b - Fix COS_CONTAINERD -> UBUNTU_CONTAINERD (r3472549985) --- .../swap_encryption_benchmark.py | 3624 +++++------------ 1 file changed, 936 insertions(+), 2688 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index c7bd92fdcd..cd867b8234 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -35,6 +35,28 @@ EKS nodes ── NVMe Instance Store, Nitro hardware-offloaded encryption swap device: /dev/nvme1n1 (or auto-detected) +== Resource pattern == + +Infrastructure lifecycle lives in two BaseResource subclasses: + + SwapNodePool (perfkitbenchmarker/resources/container_service/swap_nodepool.py) + _Create(): gcloud container node-pools create with linuxConfig.swapConfig + + sysctl via --system-config-from-file; waits for node Ready; + optionally creates and attaches a dedicated swap disk. + _Delete(): detach+delete disk; delete the nodepool. + DeleteDefaultPool(): remove the dummy e2-medium default pool after the + DaemonSet pod is Running (separate step to avoid API-server + contention during nodepool ops). + + SwapDaemonSet (perfkitbenchmarker/resources/container_service/swap_daemonset.py) + _Create(): apply Jinja2 manifest; wait for Running + /tmp/pkb_ready. + _Delete(): in-pod swapoff / dmsetup / losetup teardown; kubectl delete. + PodExec(): kubectl exec wrapper with transient-reset retry, OOM-kill + detection (rc=137), and automatic pod recovery. + +Both resources are added to spec.resources in Prepare() and are auto-deleted +by the PKB framework in Cleanup(). + == Benchmark Phases == Phase 1 – fio Microbenchmarks @@ -50,17 +72,6 @@ Phase 2b – I/O Interference Baseline fio on a scratch volume → re-run with concurrent swap pressure. IOPS/latency delta = storage contention cost. - - Phase 3a – Redis Latency - Dataset loaded beyond container memory limit → GET/SET p99 latency - measured while kernel swaps pages. - - Phase 3b – Kernel Build - Linux compiled inside a memory-capped cgroup; slowdown ratio vs - unconstrained baseline. - - Phase 3c – OpenSearch - Bulk-index + search query under swap pressure (esrally or curl). """ import json @@ -71,20 +82,24 @@ from typing import Any from absl import flags +from perfkitbenchmarker import benchmark_spec as bm_spec_lib from perfkitbenchmarker import configs from perfkitbenchmarker import errors from perfkitbenchmarker import sample -from perfkitbenchmarker import vm_util from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod +from perfkitbenchmarker.resources.container_service import swap_nodepool as _np_mod FLAGS = flags.FLAGS +_BenchmarkSpec = bm_spec_lib.BenchmarkSpec + # --------------------------------------------------------------------------- # Benchmark identity # --------------------------------------------------------------------------- -BENCHMARK_NAME = "swap_encryption" +BENCHMARK_NAME = 'swap_encryption' BENCHMARK_CONFIG = """ @@ -114,315 +129,313 @@ _SWAP_DEVICE = flags.DEFINE_string( - "swap_encryption_device", - "", - "Explicit swap block-device path on the cluster node, e.g. " - "/dev/nvme1n1 or /dev/dm-0. When empty the benchmark auto-detects " - "via /proc/swaps after setup.", + 'swap_encryption_device', + '', + 'Explicit swap block-device path on the cluster node, e.g. ' + '/dev/nvme1n1 or /dev/dm-0. When empty the benchmark auto-detects ' + 'via /proc/swaps after setup.', ) _SWAP_SIZE_GB = flags.DEFINE_integer( - "swap_encryption_swap_size_gb", + 'swap_encryption_swap_size_gb', 32, - "Size in GB of the swap space to configure on the node. " - "Ignored when a ready swap device already exists.", + 'Size in GB of the swap space to configure on the node. ' + 'Ignored when a ready swap device already exists.', ) _SWAP_TYPE = flags.DEFINE_enum( - "swap_encryption_swap_type", - "auto", - ["auto", "hyperdisk", "lssd", "boot_disk", "instance_store", "io2"], - "Swap backing storage target, one per methodology test-matrix row:\n" - " GKE: boot_disk (swap file on the OS boot disk — pd-balanced or " - "hyperdisk-balanced, chosen via --swap_encryption_boot_disk_type),\n" - " hyperdisk (dedicated hyperdisk-balanced data disk),\n" - " lssd (dedicated Local SSD RAID-0).\n" - " AWS: instance_store (NVMe Instance Store, Nitro-encrypted),\n" - " io2 (EBS io2 data/root volume).\n" - "dm-crypt is applied on the GKE targets when " - "--swap_encryption_enable_dmcrypt is set; AWS targets are encrypted by " - "Nitro at the hardware level. auto = detect from cloud + instance type.", + 'swap_encryption_swap_type', + 'auto', + ['auto', 'hyperdisk', 'lssd', 'boot_disk', 'instance_store', 'io2'], + 'Swap backing storage target, one per methodology test-matrix row:\n' + ' GKE: boot_disk (swap file on the OS boot disk — pd-balanced or ' + 'hyperdisk-balanced, chosen via --swap_encryption_boot_disk_type),\n' + ' hyperdisk (dedicated hyperdisk-balanced data disk),\n' + ' lssd (dedicated Local SSD RAID-0).\n' + ' AWS: instance_store (NVMe Instance Store, Nitro-encrypted),\n' + ' io2 (EBS io2 data/root volume).\n' + 'dm-crypt is applied on the GKE targets when ' + '--swap_encryption_enable_dmcrypt is set; AWS targets are encrypted by ' + 'Nitro at the hardware level. auto = detect from cloud + instance type.', ) _FIO_RUNTIME_SEC = flags.DEFINE_integer( - "swap_encryption_fio_runtime_sec", + 'swap_encryption_fio_runtime_sec', 60, - "Wall-clock runtime in seconds for each individual fio job.", + 'Wall-clock runtime in seconds for each individual fio job.', ) _STRESS_TIMEOUT_SEC = flags.DEFINE_integer( - "swap_encryption_stress_timeout_sec", + 'swap_encryption_stress_timeout_sec', 300, - "Duration in seconds of each stress-ng memory-pressure phase.", + 'Duration in seconds of each stress-ng memory-pressure phase.', ) _STRESS_VM_BYTES = flags.DEFINE_string( - "swap_encryption_stress_vm_bytes", - "28G", - "Combined stress-ng working-set size (total in-flight footprint, not " - "per-worker). It is divided equally across --swap_encryption_stress_vm_" - "workers before being passed to stress-ng, so the total memory touched " - "equals this value. Should exceed node RAM to force kernel swapping.", + 'swap_encryption_stress_vm_bytes', + '28G', + 'Combined stress-ng working-set size (total in-flight footprint, not ' + 'per-worker). It is divided equally across --swap_encryption_stress_vm_' + 'workers before being passed to stress-ng, so the total memory touched ' + 'equals this value. Should exceed node RAM to force kernel swapping.', ) _STRESS_VM_BYTES_LIST = flags.DEFINE_string( - "swap_encryption_stress_vm_bytes_list", - "", - "Comma-separated list of stress-ng --vm-bytes values to iterate over " + 'swap_encryption_stress_vm_bytes_list', + '', + 'Comma-separated list of stress-ng --vm-bytes values to iterate over ' 'in Phase 2a CPU-overhead sweeps, e.g. "14G,21G,28G". When non-empty ' - "this overrides --swap_encryption_stress_vm_bytes and Phase 2a is run " - "once per entry so that the swap-pressure intensity curve is captured.", + 'this overrides --swap_encryption_stress_vm_bytes and Phase 2a is run ' + 'once per entry so that the swap-pressure intensity curve is captured.', ) _STRESS_VM_WORKERS = flags.DEFINE_integer( - "swap_encryption_stress_vm_workers", + 'swap_encryption_stress_vm_workers', 4, - "Number of parallel stress-ng --vm workers for Phase 2a. The total " - "working set (the autoscaled vm_bytes) is divided equally across workers, " - "so the combined footprint stays under RAM+swap (no OOM) while exceeding " - "RAM (forcing swap). Multiple workers are needed for fill speed — a " - "single write64 worker cannot dirty enough memory within the timeout to " - "reach RAM (run swap1: ~184 GB resident, no swap). To stop the N " + 'Number of parallel stress-ng --vm workers for Phase 2a. The total ' + 'working set (the autoscaled vm_bytes) is divided equally across workers, ' + 'so the combined footprint stays under RAM+swap (no OOM) while exceeding ' + 'RAM (forcing swap). Multiple workers are needed for fill speed — a ' + 'single write64 worker cannot dirty enough memory within the timeout to ' + 'reach RAM (run swap1: ~184 GB resident, no swap). To stop the N ' "workers' resident sets from collapsing to one worker's share, the " - "stressor uses random access (rand-set) and disables KSM page-merging " - "(without those, identical write64 pages across workers were merged, " - "leaving only ~vm_bytes/N resident and swap_out ~0).", + 'stressor uses random access (rand-set) and disables KSM page-merging ' + '(without those, identical write64 pages across workers were merged, ' + 'leaving only ~vm_bytes/N resident and swap_out ~0).', ) _ENABLE_ZSWAP = flags.DEFINE_boolean( - "swap_encryption_enable_zswap", + 'swap_encryption_enable_zswap', False, - "Enable zswap (lz4 compressor, 20%% max pool) before running tests.", + 'Enable zswap (lz4 compressor, 20%% max pool) before running tests.', ) _MIN_FREE_KBYTES = flags.DEFINE_integer( - "swap_encryption_min_free_kbytes", + 'swap_encryption_min_free_kbytes', 65536, - "Value written to /proc/sys/vm/min_free_kbytes to trigger earlier " - "swapping. Set 0 to leave the kernel default unchanged.", + 'Value written to /proc/sys/vm/min_free_kbytes to trigger earlier ' + 'swapping. Set 0 to leave the kernel default unchanged.', ) _DAEMONSET_IMAGE = flags.DEFINE_string( - "swap_encryption_daemonset_image", - "ubuntu:22.04", - "Container image used for the privileged benchmark DaemonSet pod.", + 'swap_encryption_daemonset_image', + 'ubuntu:22.04', + 'Container image used for the privileged benchmark DaemonSet pod.', ) _NODEPOOL = flags.DEFINE_string( - "swap_encryption_nodepool", - "benchmark", - "Name of the node pool to deploy the benchmark DaemonSet on.", + 'swap_encryption_nodepool', + 'benchmark', + 'Name of the node pool to deploy the benchmark DaemonSet on.', ) _INSTANCE_SIZE_LABEL = flags.DEFINE_string( - "swap_encryption_instance_size_label", - "", - "Human-readable label for the current instance size being tested, e.g. " + 'swap_encryption_instance_size_label', + '', + 'Human-readable label for the current instance size being tested, e.g. ' '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that ' - "results from multiple PKB runs across different instance sizes can be " - "collated and compared. Defaults to the value reported by the cloud " - "metadata endpoint inside the pod.", + 'results from multiple PKB runs across different instance sizes can be ' + 'collated and compared. Defaults to the value reported by the cloud ' + 'metadata endpoint inside the pod.', ) _COLLECT_COST = flags.DEFINE_boolean( - "swap_encryption_collect_cost", + 'swap_encryption_collect_cost', False, - "When True, emit a cost_estimate_usd sample using on-demand pricing " - "for the instance type detected at runtime.", + 'When True, emit a cost_estimate_usd sample using on-demand pricing ' + 'for the instance type detected at runtime.', ) _IO2_ENCRYPTED = flags.DEFINE_boolean( - "swap_encryption_io2_encrypted", + 'swap_encryption_io2_encrypted', True, - "When True (default), the dedicated io2 swap volume is created with EBS " + 'When True (default), the dedicated io2 swap volume is created with EBS ' 'encryption (Nitro/KMS) -> matrix row "io2 + hardware encryption". ' - "Set False for the unencrypted io2 baseline row. Only applies when " - "--swap_encryption_swap_type=io2 on AWS/EKS.", + 'Set False for the unencrypted io2 baseline row. Only applies when ' + '--swap_encryption_swap_type=io2 on AWS/EKS.', ) _IO2_KMS_KEY_ID = flags.DEFINE_string( - "swap_encryption_io2_kms_key_id", - "", - "Optional KMS key id/ARN for the encrypted io2 volume. Empty = the " - "account default aws/ebs key. Ignored unless io2_encrypted is True.", + 'swap_encryption_io2_kms_key_id', + '', + 'Optional KMS key id/ARN for the encrypted io2 volume. Empty = the ' + 'account default aws/ebs key. Ignored unless io2_encrypted is True.', ) _FAIL_ON_DEGRADED = flags.DEFINE_boolean( - "swap_encryption_fail_on_degraded", + 'swap_encryption_fail_on_degraded', True, - "When True (default), raise an error at the end of Run() if the run was " - "catastrophically degraded — e.g. the benchmark pod was OOM-evicted and " - "replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng " - "swap-pressure phase was OOM-killed before completing. This prevents PKB " - "from reporting SUCCEEDED for a run whose post-eviction phases produced " - "empty or meaningless data. Set False to keep the legacy behaviour of " - "always returning whatever partial samples were collected.", + 'When True (default), raise an error at the end of Run() if the run was ' + 'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and ' + 'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng ' + 'swap-pressure phase was OOM-killed before completing. This prevents PKB ' + 'from reporting SUCCEEDED for a run whose post-eviction phases produced ' + 'empty or meaningless data. Set False to keep the legacy behaviour of ' + 'always returning whatever partial samples were collected.', ) _PHASES = flags.DEFINE_list( - "swap_encryption_phases", - ["all"], - "Which Run() phases to execute, for fast iteration against an " - "already-provisioned cluster (e.g. --run_stage=run --run_uri=...). " - "Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng " - "CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), " + 'swap_encryption_phases', + ['all'], + 'Which Run() phases to execute, for fast iteration against an ' + 'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). ' + 'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng ' + 'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), ' '3b (kernel build), 3c (opensearch). Default "all" runs everything. ' - "Example: --swap_encryption_phases=2a runs only the swap-pressure phase. " - "Phases not listed are skipped and do not affect the degraded-run gate " + 'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. ' + 'Phases not listed are skipped and do not affect the degraded-run gate ' '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").', ) _MIN_SWAP_OUT_PAGES = flags.DEFINE_integer( - "swap_encryption_min_swap_out_pages", + 'swap_encryption_min_swap_out_pages', 1000, - "Minimum peak swap-out rate (pages/s) that Phase 2a must reach for the run" - " to count as a real swap-encryption measurement. Below this the working" - " set never meaningfully paged (e.g. run swap1 peaked at 176 pages/s of" + 'Minimum peak swap-out rate (pages/s) that Phase 2a must reach for the run' + ' to count as a real swap-encryption measurement. Below this the working' + ' set never meaningfully paged (e.g. run swap1 peaked at 176 pages/s of' ' noise yet "passed" the old zero-only gate), so the dm-crypt overhead is' - " hollow and the run is flagged degraded. A genuinely swapping run peaks" - " in the tens-to-hundreds of thousands of pages/s. Set 0 to accept any" - " non-zero swap-out (legacy behaviour).", + ' hollow and the run is flagged degraded. A genuinely swapping run peaks' + ' in the tens-to-hundreds of thousands of pages/s. Set 0 to accept any' + ' non-zero swap-out (legacy behaviour).', ) _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( - "swap_encryption_benchmark_machine_type", - "n4-highmem-32", - "Machine type for the benchmark nodepool created in Prepare(). " - "Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd " - "(LSSD RAID-0). The matching swap setup is selected automatically.", + 'swap_encryption_benchmark_machine_type', + 'n4-highmem-32', + 'Machine type for the benchmark nodepool created in Prepare(). ' + 'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd ' + '(LSSD RAID-0). The matching swap setup is selected automatically.', ) _BENCHMARK_LSSD = flags.DEFINE_boolean( - "swap_encryption_lssd", + 'swap_encryption_lssd', False, - "Force LSSD RAID-0 swap path even when the machine type name does not " + 'Force LSSD RAID-0 swap path even when the machine type name does not ' 'contain "lssd". Auto-detected from machine type when False.', ) _LSSD_COUNT = flags.DEFINE_integer( - "swap_encryption_lssd_count", + 'swap_encryption_lssd_count', 1, - "Number of local NVMe SSDs to attach as raw block devices " - "(--local-nvme-ssd-block count=N). Must match the fixed local SSD " - "count for the chosen machine type: c4-standard-8-lssd=1, " - "c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). " - "Default 1 covers most single-lssd machine types.", + 'Number of local NVMe SSDs to attach as raw block devices ' + '(--local-nvme-ssd-block count=N). Must match the fixed local SSD ' + 'count for the chosen machine type: c4-standard-8-lssd=1, ' + 'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). ' + 'Default 1 covers most single-lssd machine types.', ) _ENABLE_DMCRYPT = flags.DEFINE_boolean( - "swap_encryption_enable_dmcrypt", + 'swap_encryption_enable_dmcrypt', True, - "When True (default), configure dm-crypt on the swap device — the " + 'When True (default), configure dm-crypt on the swap device — the ' '"encryption enabled" column of the test matrix. Set False to use ' - "plain swap (encryption disabled column).", + 'plain swap (encryption disabled column).', ) _NODE_IMAGE_TYPE = flags.DEFINE_string( - "swap_encryption_node_image_type", - "UBUNTU_CONTAINERD", - "GKE node image type for the benchmark nodepool. " - "UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks " - "down device-mapper at the kernel LSM level and cryptsetup hangs " - "indefinitely from any pod context (even privileged, even via nsenter " - "into the host mount namespace). Ubuntu GKE nodes allow cryptsetup " - "from privileged pods without restriction. " - "Use COS_CONTAINERD only when dm-crypt is disabled " - "(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. " - "AL2 on EKS.", + 'swap_encryption_node_image_type', + 'UBUNTU_CONTAINERD', + 'GKE node image type for the benchmark nodepool. ' + 'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks ' + 'down device-mapper at the kernel LSM level and cryptsetup hangs ' + 'indefinitely from any pod context (even privileged, even via nsenter ' + 'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup ' + 'from privileged pods without restriction. ' + 'Use COS_CONTAINERD only when dm-crypt is disabled ' + '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. ' + 'AL2 on EKS.', ) _BOOT_DISK_TYPE = flags.DEFINE_string( - "swap_encryption_boot_disk_type", - "hyperdisk-balanced", - "Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced " - "for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 " - "dev/test machines, which do not support hyperdisk-balanced.", + 'swap_encryption_boot_disk_type', + 'hyperdisk-balanced', + 'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced ' + 'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 ' + 'dev/test machines, which do not support hyperdisk-balanced.', ) _BOOT_DISK_IOPS = flags.DEFINE_integer( - "swap_encryption_boot_disk_iops", + 'swap_encryption_boot_disk_iops', 80000, - "Provisioned IOPS for the boot disk (hyperdisk-balanced only). " - "80 000 is the COS max-IOPS target. Ignored for pd-ssd.", + 'Provisioned IOPS for the boot disk (hyperdisk-balanced only). ' + '80 000 is the COS max-IOPS target. Ignored for pd-ssd.', ) _BOOT_DISK_THROUGHPUT = flags.DEFINE_integer( - "swap_encryption_boot_disk_throughput", + 'swap_encryption_boot_disk_throughput', 1200, - "Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced " - "only). Must be set together with iops. 1200 MB/s pairs with 80 000 " - "IOPS for production; use 140 (minimum) for dev/test. Ignored for " - "pd-ssd.", + 'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced ' + 'only). Must be set together with iops. 1200 MB/s pairs with 80 000 ' + 'IOPS for production; use 140 (minimum) for dev/test. Ignored for ' + 'pd-ssd.', ) _BOOT_DISK_SIZE_GB = flags.DEFINE_integer( - "swap_encryption_boot_disk_size_gb", + 'swap_encryption_boot_disk_size_gb', 500, - "Boot disk size in GiB for the benchmark nodepool. 500 GiB is " - "required for the n4-highmem-32 + hyperdisk-balanced Config 2 run " - "(see Engineer Assignments table in execution-plan.md). " - "For LSSD configs the boot disk is smaller; 100 GiB is fine.", + 'Boot disk size in GiB for the benchmark nodepool. 500 GiB is ' + 'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run ' + '(see Engineer Assignments table in execution-plan.md). ' + 'For LSSD configs the boot disk is smaller; 100 GiB is fine.', ) _ADD_SWAP_DISK = flags.DEFINE_boolean( - "swap_encryption_add_swap_disk", + 'swap_encryption_add_swap_disk', False, - "Attach a dedicated second disk to the benchmark nodepool for use as " - "the swap device. Required for dm-crypt measurement on single-boot-disk " - "machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper " - "from pod namespaces. The second disk is provisioned via " - "--additional-node-disk using the same type/IOPS/throughput as the boot " - "disk flags.", + 'Attach a dedicated second disk to the benchmark nodepool for use as ' + 'the swap device. Required for dm-crypt measurement on single-boot-disk ' + 'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper ' + 'from pod namespaces. The second disk is provisioned via ' + '--additional-node-disk using the same type/IOPS/throughput as the boot ' + 'disk flags.', ) _SWAP_DISK_SIZE_GB = flags.DEFINE_integer( - "swap_encryption_swap_disk_size_gb", + 'swap_encryption_swap_disk_size_gb', 500, - "Size in GiB of the dedicated swap disk when " - "--swap_encryption_add_swap_disk is True. Must satisfy the " - "hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.", + 'Size in GiB of the dedicated swap disk when ' + '--swap_encryption_add_swap_disk is True. Must satisfy the ' + 'hyperdisk-balanced IOPS constraint: provisioned_iops <= size_gb x 80.', ) -_DS_NAME = "pkb-swap-benchmark" - - -_DS_NAMESPACE = "default" - - -_DS_LABEL = "pkb-swap-benchmark" - +# --------------------------------------------------------------------------- +# Module-level constants +# --------------------------------------------------------------------------- -_active_pod: list[str] = [] # single-element list so closures can mutate it +_DS_NAME = 'pkb-swap-benchmark' +_DS_NAMESPACE = 'default' +_DS_LABEL = 'pkb-swap-benchmark' +_BENCHMARK_NODEPOOL = 'benchmark' _stress_vm_method: list[str] = ( @@ -430,370 +443,262 @@ ) # single-element list; '' means no --vm-method flag -_degraded_reasons: list[str] = [] - - -_pod_lost: list[str] = [] - - -_oom_events: list[str] = [] - - -_BENCHMARK_NODEPOOL = "benchmark" - - -_DEFAULT_NODEPOOL = "default-pool" - - _FIO_JOBS = ( - ("rand_write_iops", "randwrite", "4k", 256, "Random write IOPS"), - ("rand_read_iops", "randread", "4k", 256, "Random read IOPS"), - ("rand_rw_mixed", "randrw", "4k", 256, "Mixed random R/W (50/50)"), - ("seq_write_bw", "write", "1m", 64, "Sequential write bandwidth"), - ("seq_read_bw", "read", "1m", 64, "Sequential read bandwidth"), - ("lat_write", "randwrite", "4k", 1, "Random write latency"), - ("lat_read", "randread", "4k", 1, "Random read latency"), + ('rand_write_iops', 'randwrite', '4k', 256, 'Random write IOPS'), + ('rand_read_iops', 'randread', '4k', 256, 'Random read IOPS'), + ('rand_rw_mixed', 'randrw', '4k', 256, 'Mixed random R/W (50/50)'), + ('seq_write_bw', 'write', '1m', 64, 'Sequential write bandwidth'), + ('seq_read_bw', 'read', '1m', 64, 'Sequential read bandwidth'), + ('lat_write', 'randwrite', '4k', 1, 'Random write latency'), + ('lat_read', 'randread', '4k', 1, 'Random read latency'), ) -_VMSTAT_LOG = "/tmp/pkb_vmstat.log" - - -_PIDSTAT_LOG = "/tmp/pkb_pidstat.log" - - -_CRYPTO_PROCS = ("kswapd", "kworker", "kcryptd", "dmcrypt_write") - - -def _daemonset_yaml(image: str) -> str: - """Render the privileged benchmark DaemonSet manifest. - - The manifest is a PKB data file rendered with Jinja2 - (data/cluster/swap_encryption_daemonset.yaml.j2) rather than an inline - string, per PKB conventions. The DaemonSet is pinned to the benchmark - nodepool via nodeSelector so it never lands on the dummy default pool. - """ - return vm_util.ReadAndRenderJinja2Template( - "cluster/swap_encryption_daemonset.yaml.j2", - ds_name=_DS_NAME, - ds_namespace=_DS_NAMESPACE, - ds_label=_DS_LABEL, - benchmark_nodepool=_BENCHMARK_NODEPOOL, - image=image, - kernel_version=_KERNEL_VERSION.value, - ) +_CRYPTO_PROCS = ('kswapd', 'kworker', 'kcryptd', 'dmcrypt_write') def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: + """Load and return benchmark config spec.""" return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) -def Prepare(spec) -> None: +def Prepare(spec: _BenchmarkSpec) -> None: """Two-step nodepool setup then DaemonSet deployment. Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap e2-medium default nodepool. Step 2 (this function): - a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with - UBUNTU_CONTAINERD, 80 000 IOPS, and a node startup script that configures - dm-crypt swap at the OS level — before any pod is scheduled. - b. Delete the dummy default nodepool to stop its cost immediately. - c. Deploy the privileged DaemonSet (pinned via nodeSelector to the - benchmark nodepool) and wait for tools to install. + a. GCP: Create SwapNodePool (benchmark nodepool + optional swap disk). + EKS: label existing nodes with pkb_nodepool=benchmark. + b. Create SwapDaemonSet: deploy manifest + wait for Running + sentinel. + c. GCP: DeleteDefaultPool() — safe now that DaemonSet pod is Running. + d. GCP: re-resolve pod name in case default-pool deletion evicts the pod. + + Both resources are appended to spec.resources for auto-cleanup. """ cluster = spec.container_cluster - - # ── Step 2a: add real benchmark nodepool ──────────────────────────────── - if getattr(cluster, "project", None): - # GCP path: true two-step nodepool setup - logging.info("[swap_encryption] Step 2a: creating benchmark nodepool") - _create_benchmark_node_pool(cluster) - - # ── Step 2b: wait for the benchmark node to join and be Ready ───────── - logging.info("[swap_encryption] Step 2b: waiting for benchmark node") - _wait_for_benchmark_node() - - # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── - # --additional-node-disk is not available in all gcloud versions, so we - # create + attach the disk after the node is up using gcloud compute. - if _ADD_SWAP_DISK.value: - logging.info( - "[swap_encryption] Step 2b2: attaching dedicated swap disk" - ) - _attach_swap_disk(cluster) + is_gcp = getattr(cluster, 'project', None) is not None + + if is_gcp: + # ── Step 2a (GCP): create benchmark nodepool + wait for node ────────── + logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') + nodepool = _np_mod.SwapNodePool( + cluster=cluster, + machine_type=_BENCHMARK_MACHINE_TYPE.value, + node_image_type=_NODE_IMAGE_TYPE.value, + disk_type=_BOOT_DISK_TYPE.value, + disk_size_gb=_BOOT_DISK_SIZE_GB.value, + disk_iops=_BOOT_DISK_IOPS.value, + disk_throughput=_BOOT_DISK_THROUGHPUT.value, + lssd=_BENCHMARK_LSSD.value, + lssd_count=_LSSD_COUNT.value, + add_swap_disk=_ADD_SWAP_DISK.value, + swap_disk_size_gb=_SWAP_DISK_SIZE_GB.value, + ) + nodepool.Create() + spec.resources.append(nodepool) else: - # AWS / EKS: nodepool management is external. PKB's cluster creation - # labels nodes pkb_nodepool=default, so re-label all existing nodes here - # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark). + # ── Step 2a (EKS): label existing nodes to match DaemonSet selector ── logging.info( - "[swap_encryption] EKS cluster — labelling existing nodes with " - "pkb_nodepool=%s so the DaemonSet nodeSelector matches.", + '[swap_encryption] EKS cluster — labelling existing nodes with' + ' pkb_nodepool=%s so the DaemonSet nodeSelector matches.', _BENCHMARK_NODEPOOL, ) kubectl.RunKubectlCommand([ - "label", - "nodes", - "--all", - "--overwrite", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + 'label', + 'nodes', + '--all', + '--overwrite', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', ]) - # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs - # on io2 hardware-encrypted storage (no-op unless swap_type=io2). _ensure_io2_volume() - # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── - # Deploy and wait for the pod BEFORE deleting the default nodepool. - # Deleting the default pool while the benchmark node is still joining causes - # a temporary API server i/o timeout (control plane busy with two nodepool - # ops simultaneously). Once the pod is Running the cluster is fully stable. - logging.info("[swap_encryption] Step 2c: deploying privileged DaemonSet") - _deploy_daemonset() - - pod = _wait_for_benchmark_pod() - logging.info("[swap_encryption] Benchmark pod ready: %s", pod) + # ── Step 2b: deploy DaemonSet and wait for pod ──────────────────────────── + # Deploy BEFORE deleting the default pool: deleting the default pool while + # the benchmark node is still joining causes a brief API-server I/O timeout. + # The pod being Running means the cluster is fully stable. + logging.info('[swap_encryption] Step 2b: deploying privileged DaemonSet') + daemonset = _ds_mod.SwapDaemonSet( + name=_DS_NAME, + namespace=_DS_NAMESPACE, + label=_DS_LABEL, + nodepool=_BENCHMARK_NODEPOOL, + image=_DAEMONSET_IMAGE.value, + ) + daemonset.Create() + spec.resources.append(daemonset) + logging.info( + '[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name + ) - # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── - if getattr(cluster, "project", None): + # ── Step 2c+d (GCP): delete dummy default nodepool, re-resolve pod name ── + if is_gcp: logging.info( - "[swap_encryption] Step 2d: deleting dummy default nodepool" + '[swap_encryption] Step 2c: deleting dummy default nodepool' ) - _delete_default_node_pool(cluster) - # The DaemonSet pod may be evicted and rescheduled with a new name during - # the nodepool deletion (cluster control plane briefly interrupts pod - # lifecycle). Re-resolve the pod name to avoid stale-reference errors on - # all subsequent _pod_exec calls. + nodepool.DeleteDefaultPool() + # The pod may be evicted and rescheduled with a new name during the + # default nodepool deletion. Re-resolve to avoid stale references. logging.info( - "[swap_encryption] Step 2d: re-resolving benchmark pod " - "after nodepool deletion" + '[swap_encryption] Step 2d: re-resolving benchmark pod after' + ' nodepool deletion' ) - pod = _wait_for_benchmark_pod() - logging.info("[swap_encryption] Benchmark pod (post-deletion): %s", pod) - - # Tune kernel swap aggressiveness. - # vm.swappiness=100 (maximum): GKE nodes default to 0 (avoid swap, prefer - # OOM-kill). At 60 the kernel still under-swapped on n4-highmem-32 — under - # cgroup-level memory pressure with ~160 GB node RAM free it would leave - # anonymous pages resident and record swap_out ~0 (run bb4a782d), making the - # result non-deterministic. 100 maximally biases the kernel toward paging - # anonymous pages out to the (encrypted) swap device, which is exactly the - # path this benchmark is meant to exercise. - _pod_exec(pod, "sysctl -w vm.swappiness=100", ignore_failure=True) - if _MIN_FREE_KBYTES.value > 0: - _pod_exec(pod, f"sysctl -w vm.min_free_kbytes={_MIN_FREE_KBYTES.value}") - - # Unlock container cgroup swap. - # GKE cgroup v2 sets memory.swap.max=0 per-container even when the node has - # a swap device. This blocks swap for the container regardless of - # vm.swappiness. Stress-ng gets OOM-killed in ~15s because the kernel can - # page out for this cgroup. Set 'max' so the container can use all swap. - _pod_exec( - pod, - textwrap.dedent(""" - PKB_CG=$(awk -F: '/^0::/{print $3; exit}' /proc/self/cgroup 2>/dev/null) - if [ -n "$PKB_CG" ] && [ -f "/sys/fs/cgroup${PKB_CG}/memory.swap.max" ]; then - echo max > "/sys/fs/cgroup${PKB_CG}/memory.swap.max" 2>/dev/null || true - fi - PKB_CG1=$(awk -F: '/:memory:/{print $3; exit}' /proc/self/cgroup 2>/dev/null) - if [ -n "$PKB_CG1" ] && \ - [ -f "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" ]; then - echo -1 > "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" \ - 2>/dev/null || true - fi - """), - ignore_failure=True, - ) - - # Enable zswap if requested - if _ENABLE_ZSWAP.value: - _enable_zswap(pod) - - # Configure cloud-specific swap - cloud = _detect_cloud(pod) - logging.info("[swap_encryption] Detected cloud: %s", cloud) - - if cloud == "gcp": - _setup_gke_swap(pod) - elif cloud == "aws": - _setup_eks_swap(pod) - else: - logging.warning( - "[swap_encryption] Unknown cloud – falling back to plain swapfile" + daemonset.WaitForPod() + logging.info( + '[swap_encryption] Benchmark pod (post-deletion): %s', + daemonset.pod_name, ) - _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) - - -def _phase_selected(token: str) -> bool: - """Return True if phase `token` should run given --swap_encryption_phases. - - 'all' (the default) selects every phase. Otherwise only the comma-separated - tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. - """ - selected = [p.strip().lower() for p in _PHASES.value if p.strip()] - return (not selected) or ("all" in selected) or (token.lower() in selected) -def Run(spec) -> list[sample.Sample]: +def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: """Execute all benchmark phases with gate logic. - Execution is structured in three gated tiers matching the execution plan: + Execution is structured in gated tiers matching the execution plan: Tier 1 (Gate 1) — fio microbenchmarks Raw I/O ceiling of the swap device. Gate 1 fails if fio produces zero samples (device not found, O_DIRECT error, etc.). - Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference + Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference (Phase 2a/2b) Requires an active swap device (Gate 1 must pass). Gate 2 fails if stress-ng does not complete within timeout. - Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) - Independent of Tier 2 results; always attempted if Gate 1 passed. - Individual workload failures are logged but do not abort the others. - - If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring + If Gate 1 fails, Tier 2 is skipped — there is no point measuring application-level swap performance when the raw device is inaccessible. """ - pod = _wait_for_benchmark_pod() - # Initialise the module-level active-pod tracker so _pod_exec and - # _recover_pod can transparently redirect to a replacement pod if the - # original is evicted during the run. - _active_pod.clear() - _active_pod.append(pod) - _degraded_reasons.clear() - _pod_lost.clear() - _oom_events.clear() + daemonset = _get_daemonset(spec) + + pod = daemonset.WaitForPod() + if pod is None: + raise errors.Benchmarks.RunError( + '[swap_encryption] Benchmark pod never became ready.' + ) + # Reset per-run accumulators before starting phases. + daemonset.oom_events.clear() + daemonset.pod_lost.clear() original_pod = pod - swap_dev = _detect_swap_device(pod) - base_meta = _build_metadata(pod, swap_dev) + degraded_reasons: list[str] = [] + + swap_dev = _detect_swap_device(daemonset) + base_meta = _build_metadata(daemonset, swap_dev) results: list[sample.Sample] = [] t_run_start = time.time() - logging.info("[swap_encryption] swap device: %s", swap_dev) + logging.info('[swap_encryption] swap device: %s', swap_dev) # ── Tier 1 / Gate 1: fio microbenchmarks ───────────────────────────────── tier1_results = [] - if _phase_selected("fio"): + if _phase_selected('fio'): logging.info( - "[swap_encryption] ── Tier 1 / Gate 1: fio microbenchmarks ──" + '[swap_encryption] ── Tier 1 / Gate 1: fio microbenchmarks ──' ) try: - tier1_results = _phase1_fio(pod, swap_dev, base_meta) + tier1_results = _phase1_fio(daemonset, swap_dev, base_meta) results += tier1_results except Exception as e: # pylint: disable=broad-except logging.error( - "[swap_encryption] Gate 1 FAILED — fio phase error: %s", e + '[swap_encryption] Gate 1 FAILED — fio phase error: %s', e ) logging.error( - "[swap_encryption] Skipping Tiers 2 and 3 (no swap device)" + '[swap_encryption] Skipping Tier 2 (no swap device)' ) return results if not tier1_results: logging.warning( - "[swap_encryption] Gate 1 produced no samples " - "(loop-device skip or parse error) — " - "continuing to Tier 2 with caution" + '[swap_encryption] Gate 1 produced no samples ' + '(loop-device skip or parse error) — ' + 'continuing to Tier 2 with caution' ) else: logging.info( - "[swap_encryption] Skipping Tier 1 (fio) — not selected by " - "--swap_encryption_phases=%s", - ",".join(_PHASES.value), + '[swap_encryption] Skipping Tier 1 (fio) — not selected by ' + '--swap_encryption_phases=%s', + ','.join(_PHASES.value), ) # ── Tier 2 / Gate 2: stress-ng CPU overhead + I/O interference ─────────── - if _phase_selected("2a") or _phase_selected("2b"): + if _phase_selected('2a') or _phase_selected('2b'): logging.info( - "[swap_encryption] ── Tier 2 / Gate 2: stress-ng phases ──" + '[swap_encryption] ── Tier 2 / Gate 2: stress-ng phases ──' ) try: - if _phase_selected("2a"): - logging.info("[swap_encryption] Phase 2a: CPU overhead") - results += _phase2a_cpu_overhead(pod, base_meta) - if _phase_selected("2b"): - logging.info("[swap_encryption] Phase 2b: I/O interference") - results += _phase2b_io_interference(pod, base_meta) + if _phase_selected('2a'): + logging.info('[swap_encryption] Phase 2a: CPU overhead') + results += _phase2a_cpu_overhead(daemonset, base_meta, degraded_reasons) + if _phase_selected('2b'): + logging.info('[swap_encryption] Phase 2b: I/O interference') + results += _phase2b_io_interference(daemonset, base_meta) except Exception as e: # pylint: disable=broad-except logging.error( - "[swap_encryption] Gate 2 FAILED — stress phase error: %s", e - ) - logging.warning( - "[swap_encryption] Proceeding to Tier 3 (workloads are " - "independent of stress-ng results)" + '[swap_encryption] Gate 2 FAILED — stress phase error: %s', e ) # ── Cost estimate ───────────────────────────────────────────────────────── if _COLLECT_COST.value: elapsed = time.time() - t_run_start - results += _collect_cost_sample(pod, elapsed, base_meta) + results += _collect_cost_sample(daemonset, elapsed, base_meta) # ── Final degradation gate ──────────────────────────────────────────────── - # The phase try/except blocks above keep the run alive so partial data is - # still collected, but that means a catastrophic failure (pod OOM-evicted - # mid-run, no fio data, stress-ng killed before it could drive swap I/O) - # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. - # Detect those conditions here and surface them explicitly. - if _active_pod and _active_pod[0] != original_pod: - _degraded_reasons.append( - f"benchmark pod was replaced during the run ({original_pod} →" - f" {_active_pod[0]}) — it was OOM-evicted under swap pressure;" - " phases executed after the eviction ran against a" - " freshly-initialised pod (empty /tmp, swap re-setup) and may be" - " invalid" - ) - if _pod_lost: - _degraded_reasons.append( - "benchmark pod(s) went NotFound during the run" - f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure' - " eviction or container exit) and any phase running at or after" - " that point (e.g. kernel-build baseline, OpenSearch) produced" - " invalid data" - ) - if _oom_events: - _degraded_reasons.append( - f"OOM kill(s) (rc=137) occurred during the run on pod(s) " - f'{", ".join(_oom_events)} — a phase exceeded memory and was' - " killed by " - f"the OOM killer (the container may have restarted in place), so" - f" the " - f"affected phase(s) produced no or partial data" - ) - - if _phase_selected("fio") and not tier1_results: - if swap_dev.startswith("/dev/loop"): + if daemonset.pod_name and daemonset.pod_name != original_pod: + degraded_reasons.append( + f'benchmark pod was replaced during the run ({original_pod} →' + f' {daemonset.pod_name}) — it was OOM-evicted under swap' + ' pressure; phases executed after the eviction ran against a' + ' freshly-initialised pod (empty /tmp, swap re-setup) and may' + ' be invalid' + ) + if daemonset.pod_lost: + degraded_reasons.append( + 'benchmark pod(s) went NotFound during the run' + f' ({", ".join(daemonset.pod_lost)}) — the pod died (node' + ' memory-pressure eviction or container exit) and any phase' + ' running at or after that point produced invalid data' + ) + if daemonset.oom_events: + degraded_reasons.append( + 'OOM kill(s) (rc=137) occurred during the run on pod(s) ' + f'{", ".join(daemonset.oom_events)} — a phase exceeded memory' + ' and was killed by the OOM killer; the affected phase(s)' + ' produced no or partial data' + ) + + if _phase_selected('fio') and not tier1_results: + if swap_dev.startswith('/dev/loop'): # Expected: COS blocks device-mapper from pod namespaces on single-disk # nodes (n2/n4 without --swap_encryption_add_swap_disk or lssd). - # Tier 2/3 results are still valid; do NOT mark the run as degraded. + # Tier 2 results are still valid; do NOT mark the run as degraded. logging.warning( - "[swap_encryption] Gate 1 (fio) skipped — loop device %s has no" - " dm-crypt support from inside a pod. Tier 2/3 results are" - " valid. Use c4-*-lssd or --swap_encryption_add_swap_disk for" - " fio data.", + '[swap_encryption] Gate 1 (fio) skipped — loop device %s has no' + ' dm-crypt support from inside a pod. Tier 2 results are' + ' valid. Use c4-*-lssd or --swap_encryption_add_swap_disk for' + ' fio data.', swap_dev, ) else: - _degraded_reasons.append( - "Gate 1 (fio microbenchmarks) produced no samples — the raw" - " swap device was never characterised" + degraded_reasons.append( + 'Gate 1 (fio microbenchmarks) produced no samples — the raw' + ' swap device was never characterised' ) - degraded = bool(_degraded_reasons) + degraded = bool(degraded_reasons) results.append( sample.Sample( - "swap_encryption_run_status", + 'swap_encryption_run_status', 0.0 if degraded else 1.0, - "status", + 'status', dict( base_meta, degraded=degraded, - degraded_reasons="; ".join(_degraded_reasons) or "none", + degraded_reasons='; '.join(degraded_reasons) or 'none', num_samples=len(results) + 1, ), ) ) if degraded: - msg = "[swap_encryption] RUN DEGRADED — " + "; ".join(_degraded_reasons) + msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons) logging.error(msg) if _FAIL_ON_DEGRADED.value: # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The @@ -802,1238 +707,237 @@ def Run(spec) -> list[sample.Sample]: raise errors.Benchmarks.RunError(msg) else: logging.info( - "[swap_encryption] Run completed cleanly (%d samples)", len(results) + '[swap_encryption] Run completed cleanly (%d samples)', len(results) ) return results -def Cleanup(spec) -> None: - """Remove the DaemonSet and tear down any swap configuration.""" - pod = _wait_for_benchmark_pod(timeout=30) - if pod: - _pod_exec(pod, "swapoff -a 2>/dev/null || true", ignore_failure=True) - _pod_exec( - pod, - textwrap.dedent(""" - swapoff /dev/mapper/swap_encrypted 2>/dev/null || true - dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true - """), - ignore_failure=True, - ) - # Clean up loop device backing files (single-disk fallback path). - _pod_exec( - pod, - textwrap.dedent(""" - for backing in /var/pkb_swap_backing /run/pkb_swap_backing \ - /mnt/stateful_partition/pkb_swap_backing - do - losetup -j "$backing" 2>/dev/null | awk -F: '{print $1}' | \ - while read dev - do - losetup -d "$dev" 2>/dev/null || true - done - rm -f "$backing" - done - """), - ignore_failure=True, - ) - _pod_exec( - pod, - "pkill -9 'stress-ng|fio' 2>/dev/null || true", - ignore_failure=True, - ) - - _delete_daemonset() - - # Detach and delete the dedicated swap disk if one was provisioned. - cluster = spec.container_cluster - if _ADD_SWAP_DISK.value and getattr(cluster, "project", None): - _detach_and_delete_swap_disk(cluster) - - -def _configure_eks_kubelet_swap(spec) -> None: - """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap. +def Cleanup(spec: _BenchmarkSpec) -> None: + """Resources in spec.resources are auto-deleted by the PKB framework. - NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm - integration) to merge. When that lands, EKS node pools should include - a preBootstrapCommands block writing nodeadm config with - memorySwapBehavior: LimitedSwap before kubelet starts. - - See also: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 + SwapDaemonSet._Delete() runs in-pod teardown (swapoff, dmsetup remove, + losetup cleanup, pkill fio/stress-ng) then deletes the DaemonSet. + SwapNodePool._Delete() detaches+deletes the swap disk (if any) then + deletes the benchmark nodepool. """ - logging.warning( - "[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is " - "deferred (blocked on PR #6780 — SwapConfigSpec). " - "EKS nodes will use default kubelet swap settings until that PR merges." - ) - -def _deploy_daemonset() -> None: - """Apply the benchmark DaemonSet manifest to the cluster.""" - manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value) - with vm_util.NamedTemporaryFile(mode="w", suffix=".yaml") as f: - f.write(manifest) - f.close() - kubectl.RunKubectlCommand(["apply", "-f", f.name]) - logging.info("[swap_encryption] DaemonSet applied") +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- -def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: - """Wait until the DaemonSet pod is Running AND tools are installed. - - The benchmark container installs apt packages on first start and writes - /tmp/pkb_ready when done (~2-4 min on a cold node). We must wait for - that sentinel before exec-ing any commands, otherwise tools like - cryptsetup / fio may not yet be on PATH. - - Uses tab-separated name/phase output so kubectl always exits 0 regardless - of whether any pods are present, avoiding jsonpath index errors. - """ - deadline = time.time() + timeout - last_phase = "" - ready_pod = None # pod name once phase == Running - - while time.time() < deadline: - # ── Step 1: wait for Running phase ────────────────────────────────────── - if ready_pod is None: - out, _, rc = kubectl.RunKubectlCommand( - [ - "get", - "pods", - "-l", - f"app={_DS_LABEL}", - "-n", - _DS_NAMESPACE, - "-o", - ( - r"jsonpath={range" - r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' - ), - ], - raise_on_failure=False, - ) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split("\t") - if len(parts) == 2: - pod_name, phase = parts[0].strip(), parts[1].strip() - if phase == "Running": - logging.info( - "[swap_encryption] Pod %s is Running – " - "waiting for tool install to finish...", - pod_name, - ) - ready_pod = pod_name - break - if phase != last_phase: - logging.info( - "[swap_encryption] Pod %s phase: %s", - pod_name, - phase, - ) - last_phase = phase - if phase in ("Pending",): - _log_pod_events(pod_name) - else: - logging.info( - "[swap_encryption] Waiting for DaemonSet pod to appear..." - ) - - # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── - if ready_pod is not None: - sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( - [ - "exec", - ready_pod, - "-n", - _DS_NAMESPACE, - "--", - "test", - "-f", - "/tmp/pkb_ready", - ], - raise_on_failure=False, - ) - if sentinel_rc == 0: - logging.info( - "[swap_encryption] Pod %s ready (tools installed)", - ready_pod, - ) - return ready_pod - # "container not found" means the container crashed (CrashLoopBackOff or - # exited) — treat it as a hard reset: re-check pod phase on next iteration. - if ( - "container not found" in sentinel_err - or "unable to upgrade connection" in sentinel_err - ): - logging.warning( - "[swap_encryption] Pod %s: container not running (%s) " - "— will re-check pod state", - ready_pod, - sentinel_err.strip(), - ) - ready_pod = None - last_phase = "" - else: - logging.info( - "[swap_encryption] Pod %s: still installing tools...", - ready_pod, - ) - - time.sleep(15) - logging.warning( - "[swap_encryption] Benchmark pod not ready after %ds", timeout +def _get_daemonset(spec: _BenchmarkSpec) -> _ds_mod.SwapDaemonSet: + """Retrieve the SwapDaemonSet resource from spec.resources.""" + daemonset = next( + (r for r in spec.resources if isinstance(r, _ds_mod.SwapDaemonSet)), + None, ) - return None - - -def _log_pod_events(pod_name: str) -> None: - """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" - events_out, _, _ = kubectl.RunKubectlCommand( - [ - "describe", - "pod", - pod_name, - "-n", - _DS_NAMESPACE, - ], - raise_on_failure=False, - ) - # Only log the Events section to keep output manageable - in_events = False - lines = [] - for line in events_out.splitlines(): - if line.startswith("Events:"): - in_events = True - if in_events: - lines.append(line) - if lines: - logging.info("[swap_encryption] Pod events:\n%s", "\n".join(lines[:30])) - else: - logging.info( - "[swap_encryption] kubectl describe output:\n%s", - events_out[-2000:] if len(events_out) > 2000 else events_out, + if daemonset is None: + raise errors.Benchmarks.RunError( + '[swap_encryption] SwapDaemonSet not found in spec.resources —' + ' was Prepare() called?' ) + return daemonset -def _delete_daemonset() -> None: - """Delete the benchmark DaemonSet.""" - kubectl.RunKubectlCommand( - [ - "delete", - "daemonset", - _DS_NAME, - "-n", - _DS_NAMESPACE, - "--ignore-not-found", - ], - raise_on_failure=False, - ) - logging.info("[swap_encryption] DaemonSet deleted") - - -def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str: - """Return a bash startup script for the benchmark nodepool. - - NOTE: This function is not currently used. GKE reserves the - `startup-script` node metadata key, so dm-crypt setup is performed - from within the privileged DaemonSet pod instead (see - _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference. - - Args: - enable_dmcrypt: When True, wrap the swap device in dm-crypt plain - mode (aes-xts-plain64, ephemeral random key) matching GKE's - go/node:swap-encryption implementation. - lssd: When True, build a RAID-0 array across all local SSDs before - setting up swap (matches go/gke-swap-lssd). - - Returns: - A bash script string suitable for running as root at node boot. - """ - dmcrypt_str = "true" if enable_dmcrypt else "false" - lssd_str = "true" if lssd else "false" - - return textwrap.dedent(f"""\ - #!/bin/bash - # PKB swap_encryption_benchmark — nodepool startup script. - # Configures swap once at node boot so all benchmark phases see a - # pre-warmed swap device. Runs as root on the COS host. - set -euo pipefail - ENABLE_DMCRYPT={dmcrypt_str} - LSSD={lssd_str} - - _wait_dev() {{ - local d=$1 i - for i in $(seq 1 30); do [ -b "$d" ] && return 0; sleep 2; done - echo "[pkb-startup] device $d not ready" >&2; return 1 - }} - - _boot_dev() {{ - lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1 || echo nvme0n1 - }} - - if $LSSD; then - BOOT=$(_boot_dev) - # Collect all non-rotational non-boot block devices (local SSDs) - DEVS=$(lsblk -d -o NAME,ROTA | awk '$2=="0"{{print "/dev/"$1}}' | grep -v "/dev/$BOOT" || true) - N=$(echo "$DEVS" | grep -c /dev/ || true) - if [ "$N" -gt 1 ]; then - modprobe raid0 || true - # shellcheck disable=SC2086 - mdadm --create /dev/md0 --level=0 --raid-devices="$N" $DEVS --force - TARGET=/dev/md0 - elif [ "$N" -eq 1 ]; then - TARGET=$(echo "$DEVS" | head -1) - else - echo "[pkb-startup] no LSSD devices found; skipping swap setup" >&2 - exit 0 - fi - else - BOOT=$(_boot_dev) - RAW=$(lsblk -d -o NAME,TYPE | awk '$2=="disk"{{print $1}}' | grep -v "^$BOOT$" | head -1 || true) - if [ -z "$RAW" ]; then - echo "[pkb-startup] no secondary disk found for hyperdisk swap" >&2 - exit 0 - fi - TARGET=/dev/$RAW - fi - - _wait_dev "$TARGET" - - if $ENABLE_DMCRYPT; then - modprobe dm-crypt || true - dd if=/dev/urandom bs=32 count=1 2>/dev/null | \\ - cryptsetup open --type plain \\ - --cipher aes-xts-plain64 --key-size 256 \\ - --key-file=- "$TARGET" pkb_swap - SWAP_DEV=/dev/mapper/pkb_swap - else - SWAP_DEV=$TARGET - fi - - mkswap "$SWAP_DEV" - swapon "$SWAP_DEV" - echo "[pkb-startup] swap active on $SWAP_DEV (dmcrypt=$ENABLE_DMCRYPT lssd=$LSSD)" - """) - - -_HYPERDISK_MAX_IOPS_PER_MBPS = ( - 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s -) - - -def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: - """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. +def _phase_selected(token: str) -> bool: + """Return True if phase `token` should run given --swap_encryption_phases. - Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed - 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails - with "Requested provisioned throughput is too low for the provisioned iops". - Clamp throughput UP to the minimum the requested IOPS need (plus a small - margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk - creation. + 'all' (the default) selects every phase. Otherwise only the comma-separated + tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. """ - min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) - if throughput < min_tput: - logging.warning( - "[swap_encryption] boot/swap disk throughput %d MiB/s is too low" - " for %d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s);" - " raising to %d", - throughput, - iops, - min_tput, - min_tput, - ) - return min_tput - return throughput + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ('all' in selected) or (token.lower() in selected) -def _create_benchmark_node_pool(cluster) -> None: - """Add the benchmark nodepool to the existing cluster (Step 2 of setup). +def _configure_eks_kubelet_swap(spec) -> None: + """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap. - Uses: - --swap_encryption_benchmark_machine_type (default n4-highmem-32) - --swap_encryption_node_image_type (default UBUNTU_CONTAINERD) - --swap_encryption_boot_disk_iops (default 80000) - --swap_encryption_enable_dmcrypt (default True) + NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm + integration) to merge. When that lands, EKS node pools should include + a preBootstrapCommands block writing nodeadm config with + memorySwapBehavior: LimitedSwap before kubelet starts. - The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet - nodeSelector targets it exclusively. dm-crypt swap setup is performed - from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / - _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata - because GKE reserves that metadata key and rejects it at the API level. + See also: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 """ - machine_type = _BENCHMARK_MACHINE_TYPE.value - # Auto-detect LSSD from machine type name; flag overrides only when True. - is_lssd = _BENCHMARK_LSSD.value or "lssd" in machine_type.lower() - - # Determine zone/region from the cluster object. - zone_flags: list[str] = [] - if getattr(cluster, "zones", None): - zone_flags = ["--zone", cluster.zones[0]] - elif getattr(cluster, "region", None): - zone_flags = ["--region", cluster.region] - - # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). - # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on - # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk - # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable - # headroom and matches the Config 2 spec in the Engineer Assignments table). - disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value - - disk_type = _BOOT_DISK_TYPE.value - cmd = [ - "gcloud", - "container", - "node-pools", - "create", - _BENCHMARK_NODEPOOL, - "--cluster", - cluster.name, - "--project", - cluster.project, - "--machine-type", - machine_type, - "--image-type", - _NODE_IMAGE_TYPE.value, - "--disk-type", - disk_type, - "--disk-size", - str(disk_size_gb), - "--num-nodes", - "1", - "--node-labels", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", - "--no-enable-autoupgrade", - "--no-enable-autorepair", - ] + zone_flags - - # IOPS and throughput provisioning only applies to hyperdisk-* types AND - # only when the boot disk is also the swap device (non-LSSD configs). - # For LSSD machines the boot disk is OS-only; swap is on local NVMe. - # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the - # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). - if disk_type.startswith("hyperdisk") and not is_lssd: - cmd += [ - "--boot-disk-provisioned-iops", - str(_BOOT_DISK_IOPS.value), - "--boot-disk-provisioned-throughput", - str( - _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value - ) - ), - ] - - # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm - # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). - if is_lssd: - cmd += ["--local-nvme-ssd-block", f"count={_LSSD_COUNT.value}"] - - logging.info( - "[swap_encryption] Creating benchmark nodepool: %s / %s / " - "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / " - "add_swap_disk=%s", - _BENCHMARK_NODEPOOL, - machine_type, - _NODE_IMAGE_TYPE.value, - disk_size_gb, - _BOOT_DISK_IOPS.value, - _ENABLE_DMCRYPT.value, - is_lssd, - _ADD_SWAP_DISK.value, + logging.warning( + '[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is ' + 'deferred (blocked on PR #6780 — SwapConfigSpec). ' + 'EKS nodes will use default kubelet swap settings until that PR merges.' ) - # LSSD nodepools take longer to provision than PD-only nodepools because - # GKE must also initialise the local NVMe devices before marking nodes Ready. - # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. - stdout, stderr, rc = vm_util.IssueCommand( - cmd, timeout=1200, raise_on_failure=False - ) - if rc != 0: - # Idempotent prepare: if the nodepool already exists (e.g. re-running - # --run_stage=prepare,run to redeploy the DaemonSet onto an existing - # cluster), reuse it instead of failing. gcloud returns a 409 / - # "Already exists" message in this case. - low = (stderr or "").lower() - if ( - "already exists" in low - or "alreadyexists" in low - or "code=409" in low - ): - logging.info( - "[swap_encryption] Benchmark nodepool already exists — " - "reusing it (idempotent prepare); proceeding to DaemonSet" - ) - return - raise errors.Benchmarks.RunError( - "[swap_encryption] Failed to create benchmark nodepool " - f"(rc={rc}): {stderr}" - ) - logging.info("[swap_encryption] Benchmark nodepool ready") - - -def _wait_for_benchmark_node(timeout: int = 900) -> None: - """Block until a node labelled pkb_nodepool=benchmark is Ready. - - gcloud container node-pools create returns as soon as the API accepts the - request — the actual node VM may take another 2-4 minutes to boot, join the - cluster, and pass its readiness checks. Deploying the DaemonSet before that - point leaves the pod Pending indefinitely because the nodeSelector finds no - eligible node. +def _ensure_io2_volume() -> None: + """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2). - This function polls kubectl every 15 s until at least one node with - pkb_nodepool=benchmark has Ready=True, then returns. + Only executed when --swap_encryption_swap_type=io2. Full implementation + is deferred to PR2 (swap-capability layer). """ - deadline = time.time() + timeout + if _SWAP_TYPE.value != 'io2': + return logging.info( - "[swap_encryption] Waiting for benchmark node " - "(pkb_nodepool=benchmark) to be Ready..." - ) - while time.time() < deadline: - out, _, rc = kubectl.RunKubectlCommand( - [ - "get", - "nodes", - "-l", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", - "-o", - r"jsonpath={range .items[*]}" - r'{.metadata.name}{"\t"}' - r'{range .status.conditions[?(@.type=="Ready")]}' - r'{.status}{"\n"}{end}{end}', - ], - raise_on_failure=False, - ) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split("\t") - if len(parts) == 2 and parts[1].strip() == "True": - logging.info( - "[swap_encryption] Benchmark node ready: %s", - parts[0].strip(), - ) - return - - logging.info( - "[swap_encryption] Benchmark node not yet Ready — " - "retrying in 15 s..." - ) - time.sleep(15) - - raise errors.Benchmarks.RunError( - "[swap_encryption] Timed out waiting for benchmark node " - f"(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready " - f"after {timeout}s" + '[swap_encryption] io2 swap volume provisioning deferred to PR2' ) -def _attach_swap_disk(cluster) -> None: - """Create a dedicated hyperdisk and attach it to the benchmark node. - - gcloud container node-pools create --additional-node-disk is not available - in all gcloud SDK versions, so we use gcloud compute to create the disk and - attach it after the node is ready. In GKE the Kubernetes node name is the - same as the GCE instance name, so no translation is needed. - - After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe - nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. - - The disk is named pkb-swap- to avoid name collisions across - concurrent runs. Cleanup deletes it in Cleanup() if it exists. - """ - # Resolve zone from cluster - zone = None - if getattr(cluster, "zones", None): - zone = cluster.zones[0] - elif getattr(cluster, "region", None): - zone = cluster.region - if not zone: - raise errors.Benchmarks.RunError( - "[swap_encryption] Cannot attach swap disk: cluster zone unknown" - ) - - project = cluster.project - disk_name = f"pkb-swap-{cluster.name}" - disk_type = _BOOT_DISK_TYPE.value - disk_size_gb = _SWAP_DISK_SIZE_GB.value - - # ── Step 1: get the GCE instance name of the benchmark node ─────────────── - node_out, _, rc = kubectl.RunKubectlCommand( - [ - "get", - "nodes", - "-l", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", - "-o", - "jsonpath={.items[0].metadata.name}", - ], - raise_on_failure=False, - ) - instance_name = node_out.strip() - if rc != 0 or not instance_name: - raise errors.Benchmarks.RunError( - "[swap_encryption] Cannot find benchmark node for swap disk attach" - ) - logging.info("[swap_encryption] Benchmark node instance: %s", instance_name) - - # ── Step 2: create the hyperdisk ────────────────────────────────────────── - logging.info( - "[swap_encryption] Creating swap disk %s (%dGiB %s)", - disk_name, - disk_size_gb, - disk_type, - ) - create_cmd = [ - "gcloud", - "compute", - "disks", - "create", - disk_name, - "--project", - project, - "--zone", - zone, - "--type", - disk_type, - "--size", - f"{disk_size_gb}GB", - "--quiet", - ] - if disk_type.startswith("hyperdisk"): - create_cmd += [ - "--provisioned-iops", - str(_BOOT_DISK_IOPS.value), - "--provisioned-throughput", - str( - _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value - ) - ), - ] - _, stderr, rc = vm_util.IssueCommand( - create_cmd, timeout=120, raise_on_failure=False - ) - if rc != 0: - raise errors.Benchmarks.RunError( - f"[swap_encryption] Failed to create swap disk {disk_name}:" - f" {stderr}" - ) +def _detect_swap_device(daemonset: _ds_mod.SwapDaemonSet) -> str: + """Return the active swap device path on the cluster node.""" + if _SWAP_DEVICE.value: + return _SWAP_DEVICE.value - # ── Step 3: attach the disk to the node VM ──────────────────────────────── - logging.info( - "[swap_encryption] Attaching swap disk %s to %s", - disk_name, - instance_name, - ) - attach_cmd = [ - "gcloud", - "compute", - "instances", - "attach-disk", - instance_name, - "--project", - project, - "--zone", - zone, - "--disk", - disk_name, - "--device-name", - "pkb-swap", - "--quiet", - ] - _, stderr, rc = vm_util.IssueCommand( - attach_cmd, timeout=120, raise_on_failure=False - ) - if rc != 0: - raise errors.Benchmarks.RunError( - f"[swap_encryption] Failed to attach swap disk to {instance_name}: " - f"{stderr}" - ) - logging.info( - "[swap_encryption] Swap disk attached: %s → %s", - disk_name, - instance_name, + # /proc/swaps is the source of truth — it lists the device ACTUALLY active. + # Do NOT just test -e /dev/mapper/swap_encrypted: a stale dm-crypt mapping + # from a previous run on a reused node can still appear as a /dev node while + # being non-functional (fio/swapoff fail with "No such device or address"). + dm_out, _ = daemonset.PodExec( + textwrap.dedent(""" + ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) + if [ -n "$ACTIVE" ] + then + echo "$ACTIVE" + elif test -e /dev/mapper/swap_encrypted + then + echo /dev/mapper/swap_encrypted + fi + """), + ignore_failure=True, ) - - -def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: - """Detach (if attached) and delete a GCE disk, robustly, with retries. - - Finds the attached instance from the disk's own `users` field rather than - kubectl — kubectl is often unavailable during teardown (cluster being - deleted), which previously left the disk attached and undeletable, so it - leaked. Returns True if the disk is gone (deleted or already absent). - """ - for attempt in range(1, 5): - users, _, rc = vm_util.IssueCommand( - [ - "gcloud", - "compute", - "disks", - "describe", - disk_name, - "--project", - project, - "--zone", - zone, - "--format=value(users)", - ], - timeout=60, - raise_on_failure=False, - ) - if rc != 0: - logging.info( - "[swap_encryption] Swap disk %s not present — nothing to " - "delete", - disk_name, - ) - return True # already gone - user = users.strip() - if user: - inst = user.split("/")[-1] - logging.info( - "[swap_encryption] Detaching swap disk %s from %s", - disk_name, - inst, - ) - vm_util.IssueCommand( - [ - "gcloud", - "compute", - "instances", - "detach-disk", - inst, - "--project", - project, - "--zone", - zone, - "--disk", - disk_name, - "--quiet", - ], - timeout=120, - raise_on_failure=False, - ) - _, derr, drc = vm_util.IssueCommand( - [ - "gcloud", - "compute", - "disks", - "delete", - disk_name, - "--project", - project, - "--zone", - zone, - "--quiet", - ], - timeout=180, - raise_on_failure=False, - ) - if drc == 0: - logging.info("[swap_encryption] Swap disk deleted: %s", disk_name) - return True - logging.warning( - "[swap_encryption] Swap disk delete attempt %d/4 failed " - "(%s); retrying in 10s", - attempt, - derr.strip()[:160], - ) - time.sleep(10) - logging.error( - "[swap_encryption] Could NOT delete swap disk %s after retries " - "— delete it manually: gcloud compute disks delete %s " - "--zone %s --quiet", - disk_name, - disk_name, - zone, + dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' + if dev: + return dev + raise ValueError( + 'No active swap device found in the benchmark pod. ' + 'Use --swap_encryption_device to specify one.' ) - return False - -def _detach_and_delete_swap_disk(cluster) -> None: - """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" - zone = None - if getattr(cluster, "zones", None): - zone = cluster.zones[0] - elif getattr(cluster, "region", None): - zone = cluster.region - if not zone or not getattr(cluster, "project", None): - return - _delete_disk_by_name(f"pkb-swap-{cluster.name}", cluster.project, zone) - -def _delete_default_node_pool(cluster) -> None: - """Delete the dummy default nodepool after the benchmark pool is ready. - - The default nodepool (e2-medium) was only needed to satisfy GKE's - requirement that a cluster must have at least one nodepool at creation time. - Removing it stops the clock on its cost immediately. - """ - zone_flags: list[str] = [] - if getattr(cluster, "zones", None): - zone_flags = ["--zone", cluster.zones[0]] - elif getattr(cluster, "region", None): - zone_flags = ["--region", cluster.region] - - cmd = [ - "gcloud", - "container", - "node-pools", - "delete", - _DEFAULT_NODEPOOL, - "--cluster", - cluster.name, - "--project", - cluster.project, - "--quiet", - ] + zone_flags - - logging.info( - "[swap_encryption] Deleting default nodepool: %s", _DEFAULT_NODEPOOL +def _build_metadata( + daemonset: _ds_mod.SwapDaemonSet, swap_dev: str +) -> dict[str, Any]: + """Collect node environment, encryption type, and config into a dict.""" + kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True) + mem_out, _ = daemonset.PodExec( + "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True ) - stdout, stderr, rc = vm_util.IssueCommand( - cmd, timeout=300, raise_on_failure=False + swap_out, _ = daemonset.PodExec( + "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True ) - if rc != 0: - logging.warning( - "[swap_encryption] Could not delete default nodepool (rc=%d): %s", - rc, - stderr, - ) - else: - logging.info("[swap_encryption] Default nodepool deleted") - -def _is_pod_gone(pod: str) -> bool: - """Return True if the named pod no longer exists in the cluster. - - Used to distinguish OOM-killed container processes (pod still alive, rc=137) - from OOM-evicted pods (pod gone, DaemonSet will create a replacement). - """ try: - _, err, rc = kubectl.RunKubectlCommand( - [ - "get", - "pod", - pod, - "-n", - _DS_NAMESPACE, - "-o", - "jsonpath={.metadata.name}", - ], - raise_on_failure=False, - timeout=15, - ) - return rc != 0 and "not found" in (err or "").lower() - except Exception: # pylint: disable=broad-except - return False - - -def _pod_exec( - pod: str, - cmd: str, - ignore_failure: bool = False, - timeout: int = 300, - _retries: int = 2, -) -> tuple[str, str]: - """Run a shell command inside the benchmark pod via kubectl exec. - - Args: - pod: Pod name returned by _wait_for_benchmark_pod. - cmd: Shell command string passed to bash -c. - ignore_failure: When True, non-zero exit codes are logged but not - raised. - timeout: Seconds before PKB kills the kubectl exec process. Default - 300 s matches PKB's IssueCommand default. Pass a larger value for - long-running jobs (fio, stress-ng, kernel build). - _retries: Number of automatic retries on transient GKE websocket - resets ("connection reset by peer"). Set to 0 to disable retries - for idempotent-sensitive commands. - - Returns: - Tuple of (stdout, stderr) strings. - """ - _TRANSIENT_ERRORS = ("connection reset by peer", "websocket: close") - # Errors that indicate the container/pod is gone and needs recovery. - # 'not found' covers "Error from server (NotFound): pods ... not found" - # which occurs when the DaemonSet pod was evicted and recreated under a - # new name (e.g. after OOM-triggered node pressure eviction). - # 'deleted state' covers "cannot exec in a deleted state" — the container - # was OOM-killed and is mid-termination (not yet recreated). - _CONTAINER_GONE_ERRORS = ( - "container not found", - "procReady not received", - "unable to upgrade connection", - "not found", - "deleted state", - ) - # Use the globally-tracked active pod name — it may have been updated by - # a previous _recover_pod call when eviction replaced the pod. - active = _active_pod[0] if _active_pod else pod - - for attempt in range(_retries + 1): - out, err, rc = kubectl.RunKubectlCommand( - ["exec", active, "-n", _DS_NAMESPACE, "--", "bash", "-c", cmd], - raise_on_failure=False, - raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets - timeout=timeout, - ) - is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS) - if is_transient and attempt < _retries: - logging.warning( - "[swap_encryption] kubectl exec connection reset (attempt" - " %d/%d); retrying in 10 s", - attempt + 1, - _retries + 1, - ) - time.sleep(10) - continue - # rc=137 (SIGKILL): the OOM killer terminated the container process. - # Two sub-cases: - # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. - # B) Container OOM restart: pod still exists, container restarts in place. - # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, - # tools must be re-installed before subsequent commands can run.) - # In both cases we call _recover_pod to wait for tools + sentinel, and - # we do NOT retry the OOM-triggering command itself. - if rc == 137: - # Record the OOM so the run-level gate can flag it even if the container - # restarts in place under the same pod name (which leaves both the - # "pod replaced" and "pod NotFound" checks silent). - if active not in _oom_events: - _oom_events.append(active) - # CRITICAL: sleep before checking pod state. Kubernetes takes a few - # seconds to mark a just-evicted pod as Terminating / NotFound. Without - # this delay _recover_pod sees the pod still in "Running" phase, returns - # the old pod name immediately, and every subsequent command fails with - # "Error from server (NotFound): pods … not found". - logging.warning( - "[swap_encryption] rc=137 — sleeping 15s for Kubernetes to" - " update pod state before recovery check" - ) - time.sleep(15) - pod_gone = _is_pod_gone(active) - if pod_gone: - logging.warning( - "[swap_encryption] OOM-eviction detected (rc=137, pod gone)" - " — recovering pod name for subsequent commands (not" - " retrying this cmd)" - ) - else: - logging.warning( - "[swap_encryption] Container OOM-killed (rc=137, pod still" - " exists) — waiting for container restart and tool" - " re-install before continuing" - ) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info( - "[swap_encryption] Pod name updated: %s → %s", - active, - new_pod, - ) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. - - is_container_gone = rc != 0 and any( - e in err.lower() for e in _CONTAINER_GONE_ERRORS - ) - if is_container_gone: - # Record the loss for the run-level degradation gate REGARDLESS of retry - # budget or ignore_failure. A "pods … not found" on a best-effort command - # (kernel build, opensearch, cleanup of a dead pod) still means the pod - # died; without this the gate stays blind because _active_pod is only - # renamed on the retry path below, which _retries=0 callers never reach. - if active and active not in _pod_lost: - _pod_lost.append(active) - logging.error( - "[swap_encryption] Benchmark pod %s is gone (%s) —" - " recording run as degraded", - active, - (err or "").strip()[:160], - ) - if attempt < _retries: - logging.warning( - "[swap_encryption] Container gone/restarting (attempt" - " %d/%d) — waiting for pod to recover...", - attempt + 1, - _retries + 1, - ) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info( - "[swap_encryption] Pod name updated: %s → %s", - active, - new_pod, - ) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - continue - break - - if rc != 0 and not ignore_failure: - raise errors.VmUtil.IssueCommandError( - f"[swap_encryption] _pod_exec failed (rc={rc}): {err}" - ) - return out, err - - -def _recover_pod(pod: str, timeout_sec: int = 600) -> str: - """Wait for a DaemonSet container to recover after OOM kill or eviction. - - Handles two scenarios: - 1. Container OOM restart: same pod name, container restarting in place. - DaemonSet restartPolicy=Always brings it back under the same pod name. - 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates - a new pod with a DIFFERENT name. We detect this by checking whether - the named pod still exists; if not, we search by the DaemonSet label - selector for a Running pod. - - Returns the (possibly new) pod name once it is Running and ready. - """ - deadline = time.time() + timeout_sec - logging.info( - "[swap_encryption] Waiting for pod %s to recover (up to %ds)...", - pod, - timeout_sec, - ) + mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) + except ValueError: + mem_gb = 0 + try: + swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) + except ValueError: + swap_gb = 0 - # Phase 1: wait for a Running pod — either the named one (container - # restart) or a replacement pod found via label selector (eviction). - # - # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a - # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp - # (the pod is "Terminating") while status.phase may still read "Running" for - # several seconds. Checking only status.phase causes a false-positive: we - # return the old pod name immediately and every subsequent command fails with - # "Error from server (NotFound)". Checking deletionTimestamp catches this. - recovered_pod = pod - while time.time() < deadline: - # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not - # stdout. When the pod is gone, status_out is empty and the error text - # lives entirely in status_err. Discarding stderr (using _) means the - # 'not found' check below never fires and we spin until deadline. - status_out, status_err, status_rc = kubectl.RunKubectlCommand( - [ - "get", - "pod", - pod, - "-n", - _DS_NAMESPACE, - "-o", - "jsonpath={.status.phase}|{.metadata.deletionTimestamp}", - ], - raise_on_failure=False, - timeout=30, + # Encryption type — key off dm-crypt presence + swap target. + enc = 'unknown' + if '/dev/mapper/' in swap_dev: + table_out, _ = daemonset.PodExec( + f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', + ignore_failure=True, ) - # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) - fields = status_out.strip().split("|") - phase = fields[0].strip() if fields else "" - is_terminating = len(fields) > 1 and bool(fields[1].strip()) - - # Pod is genuinely Running and NOT being deleted — recovery complete. - if status_rc == 0 and phase == "Running" and not is_terminating: - break + enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' + elif _SWAP_TYPE.value in ('instance_store', 'io2'): + enc = 'nitro_hardware_offload' + elif not _ENABLE_DMCRYPT.value: + enc = 'none' - # Pod no longer exists, OR it exists but is being terminated (Terminating - # state or deletionTimestamp set) — look for a replacement pod by label. - pod_gone_or_terminating = ( - status_rc != 0 and "not found" in (status_out + status_err).lower() - ) or is_terminating - if pod_gone_or_terminating: - label_out, _, label_rc = kubectl.RunKubectlCommand( - [ - "get", - "pods", - "-n", - _DS_NAMESPACE, - "-l", - f"app={_DS_LABEL}", - "-o", - ( - 'jsonpath={range .items[?(@.status.phase=="Running")]}' - '{.metadata.name}{"\\n"}{end}' - ), - ], - raise_on_failure=False, - timeout=30, - ) - new_pods = [ - p.strip() - for p in label_out.strip().splitlines() - if p.strip() and p.strip() != pod - ] # exclude the dying pod - if label_rc == 0 and new_pods: - recovered_pod = new_pods[0] - logging.info( - "[swap_encryption] Original pod %s gone/terminating; " - "found replacement %s", - pod, - recovered_pod, - ) - break + cloud = _detect_cloud(daemonset) - time.sleep(10) - else: - raise errors.VmUtil.IssueCommandError( - f"[swap_encryption] No Running pod found (original: {pod}) " - f"within {timeout_sec}s after OOM kill / eviction" + instance_label = _INSTANCE_SIZE_LABEL.value + if not instance_label: + gcp_type_out, _ = daemonset.PodExec( + 'curl -s -m 3 --fail' + ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, ) - - # Phase 2: wait for init script to finish (sentinel written last). - while time.time() < deadline: - ready_out, _, ready_rc = kubectl.RunKubectlCommand( - [ - "exec", - recovered_pod, - "-n", - _DS_NAMESPACE, - "--", - "bash", - "-c", - "test -f /tmp/pkb_ready && echo READY", - ], - raise_on_failure=False, - timeout=30, + if gcp_type_out.strip(): + instance_label = gcp_type_out.strip().split('/')[-1] + if not instance_label: + aws_type_out, _ = daemonset.PodExec( + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' + '2>/dev/null || echo ""', + ignore_failure=True, ) - if ready_rc == 0 and "READY" in ready_out: - logging.info( - "[swap_encryption] Pod %s recovered and ready", recovered_pod - ) - return recovered_pod - time.sleep(15) - - raise errors.VmUtil.IssueCommandError( - f"[swap_encryption] Pod {recovered_pod} did not become ready " - f"within {timeout_sec}s after OOM kill / eviction" - ) - + instance_label = aws_type_out.strip() -def _detect_cloud(pod: str) -> str: - """Detect GCP vs AWS from DMI product info exposed via /sys hostPath mount. + return { + 'benchmark': BENCHMARK_NAME, + 'execution_mode': 'kubernetes_privileged_pod', + 'cloud': cloud, + 'instance_size': instance_label, + 'kernel_version': kernel_out.strip(), + 'host_memory_gb': mem_gb, + 'swap_device': swap_dev, + 'swap_size_gb': swap_gb, + 'swap_encryption': enc, + 'storage_target': _SWAP_TYPE.value, + 'boot_disk_type': _BOOT_DISK_TYPE.value, + 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, + 'node_image_type': _NODE_IMAGE_TYPE.value, + 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, + 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, + 'zswap_enabled': _ENABLE_ZSWAP.value, + 'min_free_kbytes': _MIN_FREE_KBYTES.value, + 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, + 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, + 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, + 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, + 'nodepool': _NODEPOOL.value, + } - DMI is the most reliable in-container detection method because it reads - directly from the host kernel's SMBIOS table via /sys (already mounted). - It avoids HTTP metadata endpoint quoting issues and network timeouts. - Falls back to metadata HTTP endpoints if DMI is inconclusive. - """ - # Primary: DMI product name / vendor (available via /sys hostPath mount) - dmi_out, _ = _pod_exec( - pod, - "cat /sys/class/dmi/id/sys_vendor /sys/class/dmi/id/product_name " - '/sys/class/dmi/id/bios_vendor 2>/dev/null || echo ""', - ignore_failure=True, - ) - dmi = dmi_out.strip().lower() - if "google" in dmi: - logging.info( - "[swap_encryption] Cloud detected via DMI: gcp (%s)", - dmi_out.strip(), - ) - return "gcp" - if any(k in dmi for k in ("amazon", "ec2", "aws")): - logging.info( - "[swap_encryption] Cloud detected via DMI: aws (%s)", - dmi_out.strip(), - ) - return "aws" - - # Secondary: GCP metadata endpoint. - # Use -H with no space after colon to avoid shell-quoting issues through - # the kubectl exec → bash -c pipeline. - gcp_out, _ = _pod_exec( - pod, - "curl -s -m 3 " - "http://metadata.google.internal/computeMetadata/v1/instance/zone " - '-H Metadata-Flavor:Google 2>/dev/null || echo ""', +def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str: + """Detect whether the benchmark pod is running on GCP or AWS.""" + gcp_out, _ = daemonset.PodExec( + 'curl -s -m 2 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/project/project-id' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) if gcp_out.strip(): - logging.info("[swap_encryption] Cloud detected via metadata: gcp") - return "gcp" - - # Tertiary: AWS IMDS (IMDSv2 token-based; IMDSv1 is often disabled). - aws_out, _ = _pod_exec( - pod, - "T=$(curl -s -m 3 -X PUT " - "http://169.254.169.254/latest/api/token " - '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null); ' - 'curl -s -m 3 -H "X-aws-ec2-metadata-token: $T" ' - "http://169.254.169.254/latest/meta-data/instance-id " - '2>/dev/null || echo ""', - ignore_failure=True, - ) - if aws_out.strip(): - logging.info("[swap_encryption] Cloud detected via IMDS: aws") - return "aws" + return 'GCP' + return 'AWS' - logging.warning( - "[swap_encryption] Could not detect cloud from DMI or metadata" - ) - return "unknown" - -def _setup_gke_swap(pod: str) -> None: - """Configure dm-crypt swap on the GKE node, mirroring go/node:swap-encryption. - - GKE nodes use dm-crypt with an ephemeral random key so that swap contents - are encrypted at rest without requiring persistent key management. - We replicate this exactly using cryptsetup in plain mode (no LUKS header). - """ +def _setup_gke_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Configure dm-crypt swap on the GKE node, mirroring go/node:swap-encryption.""" swap_type = _SWAP_TYPE.value - if swap_type == "auto": - # Check whether Local SSDs are present - lssd_out, _ = _pod_exec( - pod, + if swap_type == 'auto': + lssd_out, _ = daemonset.PodExec( "lsblk -d -o NAME,MODEL | grep -i 'local\\|nvme' | " "grep -v 'nvme0' | awk '{print $1}' | head -1", ignore_failure=True, ) - swap_type = "lssd" if lssd_out.strip() else "hyperdisk" + swap_type = 'lssd' if lssd_out.strip() else 'hyperdisk' - if swap_type == "lssd": - _setup_gke_lssd_swap(pod) - elif swap_type == "boot_disk": - _setup_gke_bootdisk_swap(pod) + if swap_type == 'lssd': + _setup_gke_lssd_swap(daemonset) + elif swap_type == 'boot_disk': + _setup_gke_bootdisk_swap(daemonset) else: - _setup_gke_hyperdisk_swap(pod) - + _setup_gke_hyperdisk_swap(daemonset) -def _setup_gke_hyperdisk_swap(pod: str) -> None: - """Configure dm-crypt swap on hyperdisk-balanced (GKE default). - Disk detection is split into two separate commands so that the boot-device - name is resolved first and then substituted as a literal string — nested - $() expansions inside a kubectl exec bash -c argument are unreliable. +def _setup_gke_hyperdisk_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Configure dm-crypt swap on hyperdisk-balanced (GKE default).""" + logging.info('[swap_encryption] GKE: setting up dm-crypt on hyperdisk') - If no dedicated data disk is attached (single-disk node) dm-crypt is set up - over a loop device backed by a file on the boot hyperdisk, which still - exercises the full encryption path on the same storage tier. - """ - logging.info("[swap_encryption] GKE: setting up dm-crypt on hyperdisk") - - # Step 1: identify the boot device name (e.g. "nvme0n1", "sda") - boot_out, _ = _pod_exec( - pod, + boot_out, _ = daemonset.PodExec( 'lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1', ignore_failure=True, ) - boot_base = boot_out.strip() or "nvme0n1" - logging.info("[swap_encryption] GKE: boot device: %s", boot_base) + boot_base = boot_out.strip() or 'nvme0n1' + logging.info('[swap_encryption] GKE: boot device: %s', boot_base) - # Step 2: find a non-boot disk using the literal name from step 1 - disk_out, _ = _pod_exec( - pod, + disk_out, _ = daemonset.PodExec( "lsblk -d -o NAME,TYPE | awk '$2==\"disk\"{print $1}' " f"| grep -v '^{boot_base}$' | head -1", ignore_failure=True, @@ -2042,24 +946,22 @@ def _setup_gke_hyperdisk_swap(pod: str) -> None: if not disk_name: logging.info( - "[swap_encryption] No dedicated data disk found – " - "falling back to loop device on /mnt/stateful_partition " - "(direct-io=on, dm-crypt=%s)", + '[swap_encryption] No dedicated data disk found – ' + 'falling back to loop device on /mnt/stateful_partition ' + '(direct-io=on, dm-crypt=%s)', _ENABLE_DMCRYPT.value, ) - _setup_gke_loop_device_swap(pod) + _setup_gke_loop_device_swap(daemonset) return - disk = f"/dev/{disk_name}" + disk = f'/dev/{disk_name}' logging.info( - "[swap_encryption] GKE: swap target disk: %s dmcrypt=%s", + '[swap_encryption] GKE: swap target disk: %s dmcrypt=%s', disk, _ENABLE_DMCRYPT.value, ) - # Clean up any stale mapping from a previous failed run. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true @@ -2069,19 +971,7 @@ def _setup_gke_hyperdisk_swap(pod: str) -> None: ) if _ENABLE_DMCRYPT.value: - # We cannot use cryptsetup open from inside a container because - # libdevmapper calls dm_udev_wait() after creating the target, which - # blocks on /run/udev/control. That socket belongs to udevd which is - # not running inside the container — so cryptsetup hangs forever. - # - # Instead we drive dmsetup directly with --noudevrules --noudevsync, - # which skips all udev synchronisation, and call dmsetup mknodes to - # ensure /dev/mapper/swap_encrypted appears without udev. - # - # insmod (not modprobe) loads the kernel module: modprobe also talks to - # systemd-udevd and can deadlock from a container for the same reason. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) @@ -2098,68 +988,27 @@ def _setup_gke_hyperdisk_swap(pod: str) -> None: """), ) logging.info( - "[swap_encryption] GKE: dm-crypt swap active on " - "/dev/mapper/swap_encrypted" + '[swap_encryption] GKE: dm-crypt swap active on ' + '/dev/mapper/swap_encrypted' ) else: - # Encryption-disabled column of the test matrix - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" mkswap {disk} && \\ swapon {disk} """), ) logging.info( - "[swap_encryption] GKE: plain (unencrypted) swap active on %s", disk + '[swap_encryption] GKE: plain (unencrypted) swap active on %s', disk ) -def _setup_gke_loop_device_swap(pod: str) -> None: - """Plain loop-device swap for single-disk GKE nodes (no dedicated swap disk). - - Used when _setup_gke_hyperdisk_swap finds no dedicated second disk (e.g. - n2-highmem-32 / n4-highmem-32 single-boot-disk nodes, regardless of image - type). - - dm-crypt is skipped on this path for two reasons: - 1. On COS (Container-Optimised OS): the device-mapper kernel subsystem is - inaccessible from inside a Kubernetes pod (even privileged). Calls to - cryptsetup/dmsetup block indefinitely and are killed by the PKB timeout. - This is a deliberate COS security restriction, not a permissions issue. - 2. On UBUNTU_CONTAINERD: the loop device is created in the container - namespace; its behaviour under nsenter (needed for dm-crypt on dedicated - disks) is untested, so plain loop swap is used for safety. - For dedicated block devices (hyperdisk, LSSD) nsenter into the host mount - namespace works around the COS restriction (see _setup_gke_hyperdisk_swap). - The loop device path skips dm-crypt on all image types; plain loop swap is - used instead. - - Therefore this path uses a plain loop device as swap without dm-crypt. - Phase 1 (fio) is skipped for plain loop devices — the goal is enc-on vs - enc-off comparison, and fio on a plain loop device measures the backing - filesystem rather than the swap stack. Tiers 2–6 (stress-ng, Redis, - kernel build, OpenSearch) run normally. - - For dm-crypt measurement on GCP use a machine type with local NVMe (LSSD) - or provision a dedicated hyperdisk on a second disk slot (n4-highmem-32+). - - Improvements over the old /var path: - - Backing file on /mnt/stateful_partition (ext4), not the container - overlayfs — avoids overlayfs O_DIRECT limitation. - - losetup --direct-io=on passes I/O through to the host ext4, reducing - double-buffering for Tiers 2–6 workloads. - """ +def _setup_gke_loop_device_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Plain loop-device swap for single-disk GKE nodes (no dedicated swap disk).""" size_gb = _SWAP_SIZE_GB.value - # /mnt/stateful_partition is ext4 on COS (mounted from the stateful - # partition of the node's persistent disk). It is NOT the container - # overlay filesystem and is mounted into the pod via the DaemonSet - # hostPath volume. - backing = "/mnt/stateful_partition/pkb_swap_backing" - - # ── Step 0: detach any stale loop device from a previous failed run ─────── - _pod_exec( - pod, + backing = '/mnt/stateful_partition/pkb_swap_backing' + + daemonset.PodExec( textwrap.dedent(f""" losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | \ while read dev @@ -2172,29 +1021,19 @@ def _setup_gke_loop_device_swap(pod: str) -> None: ignore_failure=True, ) - # ── Step 1: allocate backing file on stateful partition (ext4) ─────────── logging.info( - "[swap_encryption] GKE: creating %dG backing file on" - " stateful_partition", + '[swap_encryption] GKE: creating %dG backing file on' + ' stateful_partition', size_gb, ) - # fallocate preallocates real ext4 blocks (avoids fragmentation during swap - # I/O); truncate is the sparse fallback for filesystems where fallocate - # fails. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" fallocate -l {size_gb}G {backing} 2>/dev/null || \\ truncate -s {size_gb}G {backing} """), ) - # ── Step 2: loop device with direct-io passthrough ─────────────────────── - # --direct-io=on lets the loop driver pass O_DIRECT to the host ext4, - # reducing double-buffering for workload I/O (kernel 5.x+, present on - # GKE COS ≥ 1.29). - loop_out, _ = _pod_exec( - pod, + loop_out, _ = daemonset.PodExec( textwrap.dedent(f""" LOOP=$(losetup -f) && \\ losetup --direct-io=on "$LOOP" {backing} && \\ @@ -2202,53 +1041,38 @@ def _setup_gke_loop_device_swap(pod: str) -> None: """), ) loop_dev = loop_out.strip() - if not loop_dev.startswith("/dev/loop"): + if not loop_dev.startswith('/dev/loop'): raise RuntimeError( - f"[swap_encryption] losetup failed – output: {loop_out!r}" + f'[swap_encryption] losetup failed – output: {loop_out!r}' ) logging.info( - "[swap_encryption] GKE: loop device: %s direct-io=on", loop_dev + '[swap_encryption] GKE: loop device: %s direct-io=on', loop_dev ) - # ── Step 3: plain mkswap + swapon (dm-crypt skipped on loop devices) ──────── - _pod_exec(pod, f"mkswap {loop_dev}") - _pod_exec(pod, f"swapon {loop_dev}") + daemonset.PodExec(f'mkswap {loop_dev}') + daemonset.PodExec(f'swapon {loop_dev}') logging.warning( - "[swap_encryption] GKE: plain loop swap active on %s " - "(dm-crypt unavailable from COS pod — device-mapper is blocked by " - "COS kernel namespace restrictions). " - "Phase 1 (fio) will be skipped. " - "Use a machine with LSSD (c4-*-lssd) or attach a dedicated second " - "hyperdisk for dm-crypt measurement.", + '[swap_encryption] GKE: plain loop swap active on %s ' + '(dm-crypt unavailable from COS pod — device-mapper is blocked by ' + 'COS kernel namespace restrictions). ' + 'Phase 1 (fio) will be skipped. ' + 'Use a machine with LSSD (c4-*-lssd) or attach a dedicated second ' + 'hyperdisk for dm-crypt measurement.', loop_dev, ) -def _setup_gke_bootdisk_swap(pod: str) -> None: - """Swap on the OS BOOT disk — methodology Table 0 rows 1-4. - - Creates a loop-backed swap file on /mnt/stateful_partition (the node's boot - disk, whose type — pd-balanced or hyperdisk-balanced — is chosen at - nodepool-creation time via --swap_encryption_boot_disk_type). dm-crypt is - layered on the loop device when --swap_encryption_enable_dmcrypt is set - (encryption-on rows 2/4); otherwise plain swap is used (encryption-off rows - 1/3). - - Reuses the same loop-creation and dmsetup patterns as the LSSD/hyperdisk - paths — no shared provider module is touched. Requires an Ubuntu node image - (dm-crypt from a pod is blocked on COS). - """ +def _setup_gke_bootdisk_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Swap on the OS BOOT disk — methodology Table 0 rows 1-4.""" size_gb = _SWAP_SIZE_GB.value - backing = "/mnt/stateful_partition/pkb_swap_backing" + backing = '/mnt/stateful_partition/pkb_swap_backing' logging.info( - "[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)", + '[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)', size_gb, _ENABLE_DMCRYPT.value, ) - # Clean up any stale loop/mapping from a previous run. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true @@ -2262,32 +1086,28 @@ def _setup_gke_bootdisk_swap(pod: str) -> None: ignore_failure=True, ) - # Allocate the backing file on the boot-disk ext4 stateful partition. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" fallocate -l {size_gb}G {backing} 2>/dev/null || truncate -s {size_gb}G {backing} """), ) - loop_out, _ = _pod_exec( - pod, + loop_out, _ = daemonset.PodExec( textwrap.dedent(f""" LOOP=$(losetup -f) && losetup --direct-io=on "$LOOP" {backing} && echo "$LOOP" """), ) loop_dev = ( - loop_out.strip().splitlines()[-1].strip() if loop_out.strip() else "" + loop_out.strip().splitlines()[-1].strip() if loop_out.strip() else '' ) - if not loop_dev.startswith("/dev/loop"): + if not loop_dev.startswith('/dev/loop'): raise RuntimeError( - f"[swap_encryption] boot-disk losetup failed: {loop_out!r}" + f'[swap_encryption] boot-disk losetup failed: {loop_out!r}' ) - logging.info("[swap_encryption] GKE: boot-disk loop device: %s", loop_dev) + logging.info('[swap_encryption] GKE: boot-disk loop device: %s', loop_dev) if _ENABLE_DMCRYPT.value: - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) @@ -2304,32 +1124,25 @@ def _setup_gke_bootdisk_swap(pod: str) -> None: """), ) logging.info( - "[swap_encryption] GKE: boot-disk dm-crypt swap active on " - "/dev/mapper/swap_encrypted" + '[swap_encryption] GKE: boot-disk dm-crypt swap active on ' + '/dev/mapper/swap_encrypted' ) else: - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" mkswap {loop_dev} && swapon {loop_dev} """), ) logging.info( - "[swap_encryption] GKE: boot-disk plain swap active on %s", loop_dev + '[swap_encryption] GKE: boot-disk plain swap active on %s', loop_dev ) -def _setup_gke_lssd_swap(pod: str) -> None: +def _setup_gke_lssd_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: """Configure dm-crypt on LSSD RAID-0 array (go/gke-swap-lssd).""" - logging.info("[swap_encryption] GKE: setting up LSSD RAID-0 swap") - - # Reused-node hygiene: a previous run on this node may have left an ACTIVE - # dm-crypt swap (e.g. /dev/nvme0n1 └─swap_encrypted [SWAP]). That makes the - # LSSD look "unclean/busy" to the device selector below, which then wrongly - # falls back to the hyperdisk path and tries the boot disk. Tear down any - # prior PKB swap mapping FIRST so the underlying LSSD is freed and selectable. - _pod_exec( - pod, + logging.info('[swap_encryption] GKE: setting up LSSD RAID-0 swap') + + daemonset.PodExec( textwrap.dedent(""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true swapoff -a 2>/dev/null || true @@ -2338,28 +1151,15 @@ def _setup_gke_lssd_swap(pod: str) -> None: ignore_failure=True, ) - # Log the full block-device topology up front for diagnosis (every prior - # swap failure traced back to picking the wrong device). - topo, _ = _pod_exec( - pod, - "lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null", + topo, _ = daemonset.PodExec( + 'lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null', ignore_failure=True, ) logging.info( - "[swap_encryption] block device topology:\n%s", (topo or "").strip() + '[swap_encryption] block device topology:\n%s', (topo or '').strip() ) - # Identify candidate swap devices = whole disks that are NOT the boot/OS - # disk. We must NOT rely on a device name (boot disk enumerates as nvme0n1 - # on some nodes, nvme1n1 on others) and we cannot use `findmnt /` because the - # container root is an overlay. Instead we EXCLUDE any disk that: - # * has partition children (boot disk has p1/p14/p15/p16), or - # * has any mounted filesystem (itself or a child). - # A raw local SSD intended for swap has neither. This robustly prevents the - # catastrophic bug where the 100 GB boot disk (root mounted) was RAIDed into - # the swap device, yielding a non-functional swap (fio empty + stress OOM). - lssd_out, _ = _pod_exec( - pod, + lssd_out, _ = daemonset.PodExec( textwrap.dedent(""" for d in $(lsblk -dno NAME,ROTA | awk '$2==0{print $1}') do @@ -2377,77 +1177,26 @@ def _setup_gke_lssd_swap(pod: str) -> None: devices = [d.strip() for d in lssd_out.strip().splitlines() if d.strip()] if not devices: logging.warning( - "[swap_encryption] No clean (unpartitioned, unmounted) local SSD" - " found — falling back to hyperdisk swap path" + '[swap_encryption] No clean (unpartitioned, unmounted) local SSD' + ' found — falling back to hyperdisk swap path' ) - _setup_gke_hyperdisk_swap(pod) + _setup_gke_hyperdisk_swap(daemonset) return - device_list = " ".join(devices) + device_list = ' '.join(devices) n = len(devices) logging.info( - "[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): " - "%s dmcrypt=%s", + '[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): ' + '%s dmcrypt=%s', n, device_list, _ENABLE_DMCRYPT.value, ) - # Clean up stale mappings, RAID arrays, and GKE-managed mounts. - # - # GKE UBUNTU nodes run google-ssd-startup.service at boot which formats - # local NVMe SSDs as ext4 and mounts them at /mnt/disks/ssd0 etc. even - # when --local-nvme-ssd-block is set. The mount makes the block device - # busy so mdadm/wipefs fail silently (we had || true). We must unmount - # those paths first. /proc-host/mounts reflects the host mount table - # (hostPID:true + privileged gives us access). - # - # pkb_swap is the dm-crypt device created by the node startup script (for - # single-LSSD nodes it holds /dev/nvme1n1 directly without an md0 layer). - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" echo "[pkb-lssd-cleanup] /proc/mdstat:" >&2 cat /proc/mdstat 2>/dev/null || true - echo "[pkb-lssd-cleanup] dmsetup ls:" >&2 - dmsetup ls 2>/dev/null || true - echo "[pkb-lssd-cleanup] /proc/swaps:" >&2 - cat /proc/swaps 2>/dev/null || true - echo "[pkb-lssd-cleanup] host mounts on {device_list}:" >&2 - grep -E '{('|'.join(devices))}' /proc-host/mounts 2>/dev/null || true - echo "[pkb-lssd-cleanup] sysfs holders:" >&2 - for dev in {device_list} - do - devname=$(basename "$dev") - ls -1 /sys/block/$devname/holders/ 2>/dev/null | while read h - do - echo "[pkb-lssd-cleanup] $dev held by $h" >&2 - done - done - echo "[pkb-lssd-cleanup] --- begin teardown ---" >&2 - for dev in {device_list} - do - test -b "$dev" || continue - devname=$(basename "$dev") - for holder in /sys/block/$devname/holders/* - do - test -e "$holder" || continue - h=$(basename "$holder") - echo "[pkb-lssd-cleanup] removing holder /dev/$h from $dev" >&2 - if echo "$h" | grep -q "^md" - then - mdadm --stop /dev/$h 2>/dev/null || true - else - dmsetup remove --force --noudevrules --noudevsync /dev/$h 2>/dev/null || true - fi - done - mounts=$(awk -v d="$dev" '$1==d{{print $2}}' /proc-host/mounts 2>/dev/null || true) - for mp in $mounts - do - echo "[pkb-lssd-cleanup] unmounting $mp from $dev" >&2 - umount -f "$mp" 2>/dev/null || true - done - done swapoff -a 2>/dev/null || true swapoff /dev/mapper/pkb_swap 2>/dev/null || true swapoff /dev/mapper/swap_encrypted 2>/dev/null || true @@ -2456,25 +1205,12 @@ def _setup_gke_lssd_swap(pod: str) -> None: mdadm --stop --scan 2>/dev/null || true mdadm --zero-superblock {device_list} 2>/dev/null || true wipefs -a {device_list} 2>/dev/null || true - echo "[pkb-lssd-cleanup] lsblk after wipefs:" >&2 - lsblk {device_list} 2>/dev/null || true - partx -u {device_list} 2>/dev/null || true - losetup -D 2>/dev/null || true - rm -f /mnt/stateful_partition/pkb_swap.img 2>/dev/null || true sleep 2 """), ignore_failure=True, ) - # Step 3: verify the devices are truly raw (unpartitioned). On GKE Ubuntu - # nodes the local NVMe device may be partitioned by node startup scripts - # even when --local-nvme-ssd-block is specified. The kernel refuses a - # whole-disk exclusive open (DM_TABLE_LOAD → EBUSY) when any partition of - # the disk is open by another process (e.g. the container overlay FS is - # backed by nvme1n1p1). Detect this and fall back to a loop device backed - # by a file on /mnt/stateful_partition (which IS the SSD partition). - raw_check_out, _ = _pod_exec( - pod, + raw_check_out, _ = daemonset.PodExec( textwrap.dedent(f""" for dev in {device_list} do @@ -2494,28 +1230,24 @@ def _setup_gke_lssd_swap(pod: str) -> None: if not raw_devices: logging.info( - "[swap_encryption] GKE: all LSSD devices are partitioned — " - "falling back to loop device on /mnt/stateful_partition" + '[swap_encryption] GKE: all LSSD devices are partitioned — ' + 'falling back to loop device on /mnt/stateful_partition' ) - _setup_gke_lssd_stateful_loop_swap(pod) + _setup_gke_lssd_stateful_loop_swap(daemonset) return - # Use only raw (unpartitioned) devices going forward. devices = raw_devices - device_list = " ".join(devices) + device_list = ' '.join(devices) n = len(devices) logging.info( - "[swap_encryption] GKE: using %d raw LSSD device(s): %s dmcrypt=%s", + '[swap_encryption] GKE: using %d raw LSSD device(s): %s dmcrypt=%s', n, device_list, _ENABLE_DMCRYPT.value, ) - # For N=1 LSSD, skip mdadm entirely and target the raw device directly. - # For N>1 we stripe across multiple NVMe devices. if n > 1: - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" mdadm --create /dev/md0 --force \\ --level=0 --raid-devices={n} \\ @@ -2523,19 +1255,17 @@ def _setup_gke_lssd_swap(pod: str) -> None: test -b /dev/md0 || {{ echo "mdadm: /dev/md0 not created" >&2; exit 1; }} """), ) - swap_block_dev = "/dev/md0" + swap_block_dev = '/dev/md0' else: swap_block_dev = devices[0] logging.info( - "[swap_encryption] GKE: single LSSD — skipping mdadm, " - "using %s directly", + '[swap_encryption] GKE: single LSSD — skipping mdadm, ' + 'using %s directly', swap_block_dev, ) if _ENABLE_DMCRYPT.value: - # Same dmsetup --noudevrules --noudevsync approach as _setup_gke_hyperdisk_swap. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) @@ -2554,37 +1284,27 @@ def _setup_gke_lssd_swap(pod: str) -> None: """), ) logging.info( - "[swap_encryption] GKE: LSSD dm-crypt swap active on %s", + '[swap_encryption] GKE: LSSD dm-crypt swap active on %s', swap_block_dev, ) else: - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" mkswap {swap_block_dev} swapon {swap_block_dev} """), ) logging.info( - "[swap_encryption] GKE: LSSD plain swap active on %s", + '[swap_encryption] GKE: LSSD plain swap active on %s', swap_block_dev, ) -def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: - """Set up swap on the LSSD partition via a loop device. +def _setup_gke_lssd_stateful_loop_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Set up swap on the LSSD partition via a loop device.""" + img_path = '/mnt/stateful_partition/pkb_swap.img' - Used when the local NVMe device is partitioned by GKE startup scripts - and cannot be opened as a whole raw block device (DM_TABLE_LOAD EBUSY). - The DaemonSet mounts /mnt/stateful_partition (hostPath) from the host's - nvme1n1p1 — which is still local SSD storage. We create a large file - there and layer loop → dm-crypt → swap on top of it. - """ - img_path = "/mnt/stateful_partition/pkb_swap.img" - - # Clean up any previous run artifacts. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" swapoff -a 2>/dev/null || true dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true @@ -2594,23 +1314,19 @@ def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: ignore_failure=True, ) - # Determine file size: 80% of available space, at least 16 GB. - size_out, _ = _pod_exec( - pod, - f"df -P /mnt/stateful_partition | awk 'NR==2{{print $4}}'", + size_out, _ = daemonset.PodExec( + "df -P /mnt/stateful_partition | awk 'NR==2{print $4}'", ignore_failure=True, ) - avail_kb = int(size_out.strip() or "0") + avail_kb = int(size_out.strip() or '0') swap_gb = max(16, int(avail_kb * 0.8 / 1024 / 1024)) logging.info( - "[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s", + '[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s', swap_gb, img_path, ) - # Allocate file (fallocate is instant on ext4; dd fallback for others). - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" fallocate -l {swap_gb}G {img_path} 2>/dev/null || \\ dd if=/dev/zero of={img_path} bs=1G count={swap_gb} @@ -2620,24 +1336,22 @@ def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: timeout=300, ) - loop_out, _ = _pod_exec( - pod, + loop_out, _ = daemonset.PodExec( f"losetup -j {img_path} | awk -F: '{{print $1}}' | head -1", ignore_failure=True, ) loop_dev = loop_out.strip() - if not loop_dev.startswith("/dev/loop"): + if not loop_dev.startswith('/dev/loop'): raise RuntimeError( - f"[swap_encryption] losetup failed for {img_path} — got:" - f" {loop_out!r}" + f'[swap_encryption] losetup failed for {img_path} — got:' + f' {loop_out!r}' ) logging.info( - "[swap_encryption] GKE: LSSD stateful-loop device: %s", loop_dev + '[swap_encryption] GKE: LSSD stateful-loop device: %s', loop_dev ) if _ENABLE_DMCRYPT.value: - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" grep -q dm_crypt /proc/modules 2>/dev/null || {{ KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) @@ -2656,156 +1370,48 @@ def _setup_gke_lssd_stateful_loop_swap(pod: str) -> None: """), ) logging.info( - "[swap_encryption] GKE: LSSD stateful-loop dm-crypt swap active " - "on %s → %s", + '[swap_encryption] GKE: LSSD stateful-loop dm-crypt swap active ' + 'on %s → %s', img_path, loop_dev, ) else: - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" mkswap {loop_dev} swapon {loop_dev} """), ) logging.info( - "[swap_encryption] GKE: LSSD stateful-loop plain swap active " - "on %s → %s", + '[swap_encryption] GKE: LSSD stateful-loop plain swap active ' + 'on %s → %s', img_path, loop_dev, ) -_IO2_VOLUME_ID = "" # set by _ensure_io2_volume; serial-based detection - - -def _ensure_io2_volume() -> None: - """Create + attach a dedicated io2 EBS volume to the benchmark node so the - io2 test-matrix row swaps on real io2 hardware-encrypted storage. - - No-op unless --swap_encryption_swap_type=io2 on an AWS/EKS cluster. - Best-effort: logs and returns on failure. Stashes the created volume id in - _IO2_VOLUME_ID for serial-based device detection in _setup_eks_io2_swap. - """ - global _IO2_VOLUME_ID - if _SWAP_TYPE.value != "io2": - return - out, _, rc = kubectl.RunKubectlCommand( - ["get", "nodes", "-o", "jsonpath={.items[0].spec.providerID}"], - raise_on_failure=False, - ) - provider = (out or "").strip() # aws:///us-east-1a/i-0abc... - if rc != 0 or "aws://" not in provider: - logging.warning( - "[swap_encryption] io2 attach skipped: could not resolve " - "EC2 instance from providerID=%r", - provider, - ) - return - parts = [p for p in provider.split("/") if p] - instance_id, az = parts[-1], parts[-2] - region = az[:-1] - base = ["aws", "ec2", "--region", region] - try: - create_args = [ - "create-volume", - "--volume-type", - "io2", - "--size", - "500", - "--iops", - "16000", - "--availability-zone", - az, - "--tag-specifications", - "ResourceType=volume,Tags=[{Key=pkb,Value=swap_encryption}]", - ] - if _IO2_ENCRYPTED.value: - create_args.append("--encrypted") - if _IO2_KMS_KEY_ID.value: - create_args += ["--kms-key-id", _IO2_KMS_KEY_ID.value] - logging.info( - "[swap_encryption] io2 volume will be EBS-encrypted " - "(row: hardware encryption)" - ) - else: - logging.info( - "[swap_encryption] io2 volume UNENCRYPTED (baseline row)" - ) - create_args += ["--query", "VolumeId", "--output", "text"] - vol_id, _, vrc = vm_util.IssueCommand( - base + create_args, raise_on_failure=False - ) - vol_id = (vol_id or "").strip() - if vrc != 0 or not vol_id.startswith("vol-"): - logging.warning( - "[swap_encryption] io2 create-volume failed: %r", vol_id - ) - return - vm_util.IssueCommand( - base + ["wait", "volume-available", "--volume-ids", vol_id], - raise_on_failure=False, - ) - vm_util.IssueCommand( - base - + [ - "attach-volume", - "--volume-id", - vol_id, - "--instance-id", - instance_id, - "--device", - "/dev/sdf", - ], - raise_on_failure=False, - ) - vm_util.IssueCommand( - base + ["wait", "volume-in-use", "--volume-ids", vol_id], - raise_on_failure=False, - ) - _IO2_VOLUME_ID = vol_id - logging.info( - "[swap_encryption] Attached io2 volume %s to %s as /dev/sdf", - vol_id, - instance_id, - ) - time.sleep(15) # allow the NVMe device node to appear - except Exception as e: # pylint: disable=broad-except - logging.warning( - "[swap_encryption] io2 attach error (continuing): %s", e - ) - +_IO2_VOLUME_ID = '' # set by _ensure_io2_volume; serial-based detection -def _setup_eks_swap(pod: str) -> None: - """Configure swap on EKS nodes — Instance Store OR io2 root disk. - Swap type is selected by --swap_encryption_swap_type: - instance_store (default) – NVMe SSD attached by Nitro (i4i, m6id, c6id). - Nitro encrypts all block-device writes at hardware level; no extra - cryptsetup needed. - io2 – EBS io2 volume provisioned as the node root/data disk. - Used for apples-to-apples comparison against GKE hyperdisk-balanced. - """ +def _setup_eks_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Configure swap on EKS nodes — Instance Store OR io2 root disk.""" swap_type = _SWAP_TYPE.value - if swap_type in ("auto", "instance_store"): - _setup_eks_instance_store_swap(pod) - elif swap_type == "io2": - _setup_eks_io2_swap(pod) + if swap_type in ('auto', 'instance_store'): + _setup_eks_instance_store_swap(daemonset) + elif swap_type == 'io2': + _setup_eks_io2_swap(daemonset) else: logging.warning( - "[swap_encryption] Unknown EKS swap type %s – fallback", swap_type + '[swap_encryption] Unknown EKS swap type %s – fallback', swap_type ) - _setup_eks_instance_store_swap(pod) + _setup_eks_instance_store_swap(daemonset) -def _setup_eks_instance_store_swap(pod: str) -> None: +def _setup_eks_instance_store_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: """Swap on AWS NVMe Instance Store (Nitro hardware-offloaded encryption).""" - logging.info("[swap_encryption] EKS: setting up Instance Store swap") + logging.info('[swap_encryption] EKS: setting up Instance Store swap') - # Find the Instance Store NVMe device (not the root EBS volume) - nvme_out, _ = _pod_exec( - pod, + nvme_out, _ = daemonset.PodExec( "nvme list 2>/dev/null | awk '/Instance Storage/{print $1}' | head -1" " || lsblk -d -o NAME,MODEL | grep -i 'instance\\|nvme' | grep -v" " 'nvme0' | awk '{print \"/dev/\"$1}' | head -1", @@ -2813,125 +1419,96 @@ def _setup_eks_instance_store_swap(pod: str) -> None: ) device = nvme_out.strip() if not device: - # Common Instance Store device paths on AWS - for candidate in ["/dev/nvme1n1", "/dev/nvme2n1", "/dev/xvdb"]: - exists_out, _ = _pod_exec( - pod, - f"test -b {candidate} && echo yes || echo no", + for candidate in ['/dev/nvme1n1', '/dev/nvme2n1', '/dev/xvdb']: + exists_out, _ = daemonset.PodExec( + f'test -b {candidate} && echo yes || echo no', ignore_failure=True, ) - if exists_out.strip() == "yes": + if exists_out.strip() == 'yes': device = candidate break if not device: logging.warning( - "[swap_encryption] No Instance Store NVMe found – creating swapfile" + '[swap_encryption] No Instance Store NVMe found – creating swapfile' ) - _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value) return - logging.info("[swap_encryption] EKS: Instance Store device: %s", device) + logging.info('[swap_encryption] EKS: Instance Store device: %s', device) - # Nitro encrypts all Instance Store writes automatically. - # No additional cryptsetup required. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" mkswap {device} && \\ swapon {device} """), ) logging.info( - "[swap_encryption] EKS: Instance Store swap active on %s", device + '[swap_encryption] EKS: Instance Store swap active on %s', device ) -def _setup_eks_io2_swap(pod: str) -> None: - """Swap on AWS EBS io2 volume – apples-to-apples comparison vs GKE hyperdisk. - - EBS io2 volumes on Nitro instances are encrypted at rest by AWS KMS (if - enabled on the volume) or via Nitro-level hardware encryption. No additional - cryptsetup is needed here; we simply format the attached data disk as swap. +def _setup_eks_io2_swap(daemonset: _ds_mod.SwapDaemonSet) -> None: + """Swap on AWS EBS io2 volume.""" + logging.info('[swap_encryption] EKS: setting up io2 EBS swap') - Device discovery order: - 1. Match the io2 volume created by _ensure_io2_volume() by its NVMe serial - (serial == volume id without the dash). This is unambiguous and never - picks the root disk or the instance store regardless of nvmeXn1 - enumeration order on Nitro. - 2. First non-root EBS ("Elastic Block Store") block device that is not - currently mounted. - """ - logging.info("[swap_encryption] EKS: setting up io2 EBS swap") - - # Identify root device so we can exclude it. - root_out, _ = _pod_exec( - pod, - "lsblk -no pkname $(findmnt -n -o SOURCE /) 2>/dev/null || echo" - " nvme0n1", + root_out, _ = daemonset.PodExec( + 'lsblk -no pkname $(findmnt -n -o SOURCE /) 2>/dev/null || echo' + ' nvme0n1', ignore_failure=True, ) - root_base = root_out.strip() or "nvme0n1" + root_base = root_out.strip() or 'nvme0n1' - # Identify the io2 volume UNAMBIGUOUSLY by its NVMe serial == volume id. - # An EBS NVMe device's serial equals the volume id minus the dash - # (vol-0abc... -> serial vol0abc...). - device = "" - target = _IO2_VOLUME_ID.replace("-", "") + device = '' + target = _IO2_VOLUME_ID.replace('-', '') if target: - ser_out, _ = _pod_exec( - pod, - "for d in /sys/block/nvme*n1; do " + ser_out, _ = daemonset.PodExec( + 'for d in /sys/block/nvme*n1; do ' '[ -e "$d" ] || continue; ' 's=$(cat "$d/device/serial" 2>/dev/null | tr -d "-" | tr -d " "); ' f'[ "$s" = "{target}" ] && {{ echo "/dev/$(basename "$d")"; break;' - " }; " - "done", + ' }; ' + 'done', ignore_failure=True, ) device = ser_out.strip() if device: logging.info( - "[swap_encryption] EKS: io2 matched by serial %s -> %s", + '[swap_encryption] EKS: io2 matched by serial %s -> %s', target, device, ) if not device: - # Fallback: first non-root EBS device, excluding any device that is - # currently mounted (root) or already active swap. - disk_out, _ = _pod_exec( - pod, - "for d in /sys/block/nvme*n1 /sys/block/xvd[b-z]" - " /sys/block/sd[b-z];" + disk_out, _ = daemonset.PodExec( + 'for d in /sys/block/nvme*n1 /sys/block/xvd[b-z]' + ' /sys/block/sd[b-z];' ' do [ -e "$d" ] || continue; n=$(basename "$d"); [ "$n" =' f' "{root_base}" ] && continue; m=$(cat "$d/device/model"' - " 2>/dev/null);" + ' 2>/dev/null);' ' echo "$m" | grep -qi "Elastic Block Store" || continue;' - " mnt=$(lsblk" + ' mnt=$(lsblk' ' -no MOUNTPOINT "/dev/$n" 2>/dev/null | tr -d " "); [ -n "$mnt"' - " ] &&" + ' ] &&' ' continue; echo "/dev/$n"; break; done', ignore_failure=True, ) device = disk_out.strip() if device: logging.info( - "[swap_encryption] EKS: io2 fallback EBS device: %s", device + '[swap_encryption] EKS: io2 fallback EBS device: %s', device ) if not device: logging.warning( - "[swap_encryption] No io2 EBS disk found – creating plain swapfile" + '[swap_encryption] No io2 EBS disk found – creating plain swapfile' ) - _setup_plain_swap_file(pod, _SWAP_SIZE_GB.value) + _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value) return - logging.info("[swap_encryption] EKS: io2 EBS device: %s", device) + logging.info('[swap_encryption] EKS: io2 EBS device: %s', device) - # EBS io2 encryption is handled at the AWS level (Nitro / KMS). - out, _ = _pod_exec( - pod, + out, _ = daemonset.PodExec( textwrap.dedent(f""" swapoff {device} 2>/dev/null || true wipefs -a {device} 2>/dev/null || true @@ -2942,23 +1519,17 @@ def _setup_eks_io2_swap(pod: str) -> None: ) if device not in out: raise RuntimeError( - f"[swap_encryption] io2 swap did not activate on {device}; " - f"swapon --show output: {out!r}. The device may be busy/mounted " - "(wrong device picked) or mkswap failed." + f'[swap_encryption] io2 swap did not activate on {device}; ' + f'swapon --show output: {out!r}. The device may be busy/mounted ' + '(wrong device picked) or mkswap failed.' ) - logging.info("[swap_encryption] EKS: io2 EBS swap active on %s", device) - + logging.info('[swap_encryption] EKS: io2 EBS swap active on %s', device) -def _setup_plain_swap_file(pod: str, size_gb: int) -> None: - """Fallback: create a loop-device-backed swapfile. - A plain file on overlayfs (the container root) cannot be used as swap — - the kernel rejects it with EINVAL. Routing it through a loop device - presents a proper block device to the mm subsystem and succeeds. - """ - logging.info("[swap_encryption] Creating %dGB loop-device swap", size_gb) - _pod_exec( - pod, +def _setup_plain_swap_file(daemonset: _ds_mod.SwapDaemonSet, size_gb: int) -> None: + """Fallback: create a loop-device-backed swapfile.""" + logging.info('[swap_encryption] Creating %dGB loop-device swap', size_gb) + daemonset.PodExec( textwrap.dedent(f""" fallocate -l {size_gb}G /tmp/pkb_swapfile && \\ chmod 600 /tmp/pkb_swapfile && \\ @@ -2971,20 +1542,20 @@ def _setup_plain_swap_file(pod: str, size_gb: int) -> None: ) -def _enable_zswap(pod: str) -> None: +def _enable_zswap(daemonset: _ds_mod.SwapDaemonSet) -> None: """Enable zswap with lz4 compressor and 20% pool limit inside the pod.""" - logging.info("[swap_encryption] Enabling zswap (lz4, 20%% pool)") + logging.info('[swap_encryption] Enabling zswap (lz4, 20%% pool)') for cmd in [ - "echo 1 > /sys/module/zswap/parameters/enabled", - "echo lz4 > /sys/module/zswap/parameters/compressor", - "echo 20 > /sys/module/zswap/parameters/max_pool_percent", - "echo z3fold > /sys/module/zswap/parameters/zpool", + 'echo 1 > /sys/module/zswap/parameters/enabled', + 'echo lz4 > /sys/module/zswap/parameters/compressor', + 'echo 20 > /sys/module/zswap/parameters/max_pool_percent', + 'echo z3fold > /sys/module/zswap/parameters/zpool', ]: - _pod_exec(pod, cmd, ignore_failure=True) + daemonset.PodExec(cmd, ignore_failure=True) def _phase1_fio( - pod: str, swap_dev: str, base_meta: dict + daemonset: _ds_mod.SwapDaemonSet, swap_dev: str, base_meta: dict ) -> list[sample.Sample]: """Run fio directly on the swap block device for raw I/O characterisation. @@ -2994,111 +1565,82 @@ def _phase1_fio( boot_disk target (--swap_encryption_swap_type=boot_disk, methodology rows 1-4), the loop over the boot disk IS the device under test, so fio runs and characterises it. - - For dedicated second disks (hyperdisk, LSSD, NVMe) direct I/O is always - used and swap is restored (mkswap + swapon) after the fio run. - To get fio results use c4-*-lssd (local NVMe) or - --swap_encryption_add_swap_disk to provision a dedicated second disk. """ - if swap_dev.startswith("/dev/loop") and _SWAP_TYPE.value != "boot_disk": + if swap_dev.startswith('/dev/loop') and _SWAP_TYPE.value != 'boot_disk': logging.warning( - "[swap_encryption] Phase 1 (fio) SKIPPED for plain loop device %s" - " (unintentional single-disk fallback). fio on a loop-backed device" - " measures the underlying ext4 filesystem (stateful_partition), not" - " the swap stack. Use c4-*-lssd, --swap_encryption_add_swap_disk," - " or --swap_encryption_swap_type=boot_disk for fio data.", + '[swap_encryption] Phase 1 (fio) SKIPPED for plain loop device %s' + ' (unintentional single-disk fallback). fio on a loop-backed device' + ' measures the underlying ext4 filesystem (stateful_partition), not' + ' the swap stack. Use c4-*-lssd, --swap_encryption_add_swap_disk,' + ' or --swap_encryption_swap_type=boot_disk for fio data.', swap_dev, ) return [] results = [] - _pod_exec(pod, f"swapoff {swap_dev}", ignore_failure=True) + daemonset.PodExec(f'swapoff {swap_dev}', ignore_failure=True) # Pre-fill device so read tests have real data (avoids zero-block optimisation # by the storage controller skewing read latency measurements). - # Cap at 20 GiB — enough to warm up the dm-crypt pipeline and cover the fio - # runtime window. Writing 100% of a 500 GiB hyperdisk takes ~500+ seconds - # at provisioned throughput, which exceeds the PKB command timeout. - # Timeout: 20 GiB / ~150 MB/s (conservative dm-crypt write rate) + 60 s buffer. _PREFILL_GIB = 20 prefill_timeout = ( _PREFILL_GIB * 1024 // 150 + 60 - ) # ~197 s, rounds up to ~200 s - prefill_timeout = max(prefill_timeout, 300) # floor at 5 min + ) + prefill_timeout = max(prefill_timeout, 300) logging.info( - "[swap_encryption] Pre-filling %d GiB of %s", _PREFILL_GIB, swap_dev + '[swap_encryption] Pre-filling %d GiB of %s', _PREFILL_GIB, swap_dev ) - # No --output-format=json for prefill; we only care that it completes. - # Still use --output to avoid streaming large stdout over the websocket. - _pod_exec( - pod, + daemonset.PodExec( ( - f"fio --name=prefill --filename={swap_dev} --ioengine=libaio" - f" --direct=1 --rw=write --bs=1m --size={_PREFILL_GIB}g --verify=0" - " --output=/tmp/pkb_fio_prefill.log" + f'fio --name=prefill --filename={swap_dev} --ioengine=libaio' + f' --direct=1 --rw=write --bs=1m --size={_PREFILL_GIB}g --verify=0' + ' --output=/tmp/pkb_fio_prefill.log' ), timeout=prefill_timeout, ignore_failure=True, ) - # Each fio job: runtime + 90 s buffer (run + JSON write + file read). - # We write fio output to a file inside the pod and retrieve it in a second - # short-lived kubectl exec, because: - # - A single 120 s kubectl exec session over GKE websocket can be reset - # by the control-plane load balancer mid-stream ("connection reset by - # peer"), losing the output. - # - Separating the long run from the short file-read gives each exec a - # much shorter window, avoiding the keepalive timeout. fio_run_timeout = _FIO_RUNTIME_SEC.value + 90 - fio_read_timeout = 60 # just a cat of the JSON file + fio_read_timeout = 60 for name, rw, bs, depth, label in _FIO_JOBS: - logging.info("[swap_encryption] fio: %s", name) - out_file = f"/tmp/pkb_fio_{name}.json" - # Remove any stale output first so a parse can never silently reuse a - # previous job's/run's result (rules out byte-identical results between - # runs being a caching artifact rather than a true device ceiling). - _pod_exec( - pod, - f"rm -f {out_file}", + logging.info('[swap_encryption] fio: %s', name) + out_file = f'/tmp/pkb_fio_{name}.json' + daemonset.PodExec( + f'rm -f {out_file}', ignore_failure=True, _retries=0, timeout=15, ) run_cmd = ( - f"fio --name={name} --filename={swap_dev} " - "--ioengine=libaio --direct=1 --verify=0 --randrepeat=0 " - f"--bs={bs} --iodepth={depth} --rw={rw} " - f"--time_based --runtime={_FIO_RUNTIME_SEC.value}s " - f"--output-format=json --output={out_file}" + f'fio --name={name} --filename={swap_dev} ' + '--ioengine=libaio --direct=1 --verify=0 --randrepeat=0 ' + f'--bs={bs} --iodepth={depth} --rw={rw} ' + f'--time_based --runtime={_FIO_RUNTIME_SEC.value}s ' + f'--output-format=json --output={out_file}' ) - _, err = _pod_exec( - pod, + _, err = daemonset.PodExec( run_cmd, timeout=fio_run_timeout, ignore_failure=True, _retries=0, ) - if "connection reset by peer" in err: + if 'connection reset by peer' in err: logging.warning( - "[swap_encryption] fio %s: kubectl exec connection " - "reset; result may be incomplete", + '[swap_encryption] fio %s: kubectl exec connection ' + 'reset; result may be incomplete', name, ) - out, _ = _pod_exec( - pod, + out, _ = daemonset.PodExec( f'cat {out_file} 2>/dev/null || echo ""', timeout=fio_read_timeout, ignore_failure=True, ) results += _parse_fio_json(out, name, label, base_meta) - # fio prefill overwrites the entire device, destroying the mkswap header. - # Re-stamp and re-enable before the remaining phases need active swap. - _pod_exec( - pod, - f"mkswap {swap_dev} && swapon {swap_dev}", + daemonset.PodExec( + f'mkswap {swap_dev} && swapon {swap_dev}', ignore_failure=True, timeout=120, ) @@ -3114,41 +1656,41 @@ def _parse_fio_json( data = json.loads(stdout) except (json.JSONDecodeError, ValueError): logging.warning( - "[swap_encryption] fio JSON parse failed for %s", job_name + '[swap_encryption] fio JSON parse failed for %s', job_name ) return results meta = dict(base_meta, fio_job=job_name, fio_label=label) - for job in data.get("jobs", []): - for direction in ("read", "write"): + for job in data.get('jobs', []): + for direction in ('read', 'write'): d = job.get(direction, {}) - if not d or d.get("io_bytes", 0) == 0: + if not d or d.get('io_bytes', 0) == 0: continue - iops = float(d.get("iops", 0)) - bw_kib = float(d.get("bw", 0)) - clat = d.get("clat_ns", {}) - pct = clat.get("percentile", {}) - lat_mean = float(clat.get("mean", 0)) / 1000.0 - lat_p50 = float(pct.get("50.000000", 0)) / 1000.0 - lat_p99 = float(pct.get("99.000000", 0)) / 1000.0 - lat_p999 = float(pct.get("99.900000", 0)) / 1000.0 + iops = float(d.get('iops', 0)) + bw_kib = float(d.get('bw', 0)) + clat = d.get('clat_ns', {}) + pct = clat.get('percentile', {}) + lat_mean = float(clat.get('mean', 0)) / 1000.0 + lat_p50 = float(pct.get('50.000000', 0)) / 1000.0 + lat_p99 = float(pct.get('99.000000', 0)) / 1000.0 + lat_p999 = float(pct.get('99.900000', 0)) / 1000.0 m = dict(meta, direction=direction) results += [ - sample.Sample(f"{job_name}_{direction}_iops", iops, "iops", m), + sample.Sample(f'{job_name}_{direction}_iops', iops, 'iops', m), sample.Sample( - f"{job_name}_{direction}_bw_mbps", bw_kib / 1024, "MB/s", m + f'{job_name}_{direction}_bw_mbps', bw_kib / 1024, 'MB/s', m ), sample.Sample( - f"{job_name}_{direction}_lat_mean", lat_mean, "us", m + f'{job_name}_{direction}_lat_mean', lat_mean, 'us', m ), sample.Sample( - f"{job_name}_{direction}_lat_p50", lat_p50, "us", m + f'{job_name}_{direction}_lat_p50', lat_p50, 'us', m ), sample.Sample( - f"{job_name}_{direction}_lat_p99", lat_p99, "us", m + f'{job_name}_{direction}_lat_p99', lat_p99, 'us', m ), sample.Sample( - f"{job_name}_{direction}_lat_p999", lat_p999, "us", m + f'{job_name}_{direction}_lat_p999', lat_p999, 'us', m ), ] return results @@ -3164,16 +1706,15 @@ def _parse_vm_bytes_to_mb(vm_bytes: str) -> float: value = float(vm_bytes[:-1]) except ValueError: return 0.0 - if suffix == "G": + if suffix == 'G': return value * 1024.0 - elif suffix == "M": + elif suffix == 'M': return value - elif suffix == "K": + elif suffix == 'K': return value / 1024.0 - elif suffix == "T": + elif suffix == 'T': return value * 1024.0 * 1024.0 else: - # Assume bytes try: return float(vm_bytes) / (1024.0 * 1024.0) except ValueError: @@ -3187,12 +1728,8 @@ def _per_worker_vm_bytes(total_vm_bytes: str, workers: int) -> str: touches ``N * B`` of memory. Every vm_bytes value in this benchmark (the --swap_encryption_stress_vm_bytes flag and the _autoscale_vm_bytes result) represents the intended *combined* footprint, as documented on - --swap_encryption_stress_vm_workers ("workers divide vm_bytes equally ... - the combined in-flight footprint equals vm_bytes"). We therefore divide by - the worker count before handing the value to stress-ng; otherwise N>1 - workers allocate N x the target and the kernel OOM-kills the whole pod - (observed as stress-ng rc=137, after which all later phases fail with - "pods not found"). + --swap_encryption_stress_vm_workers. We therefore divide by the worker + count before handing the value to stress-ng. Returns a stress-ng-friendly ``M`` string (megabytes), floored to at least 1M. @@ -3200,32 +1737,23 @@ def _per_worker_vm_bytes(total_vm_bytes: str, workers: int) -> str: workers = max(1, int(workers)) total_mb = _parse_vm_bytes_to_mb(total_vm_bytes) if total_mb <= 0: - # Unparseable — fall back to letting stress-ng divide nothing rather than - # silently changing behaviour; the caller's value is passed through. return total_vm_bytes per_worker_mb = max(1, int(total_mb / workers)) - return f"{per_worker_mb}M" + return f'{per_worker_mb}M' -def _cgroup_swap_limit_mb(pod: str) -> float: +def _cgroup_swap_limit_mb(daemonset: _ds_mod.SwapDaemonSet) -> float: """Return the swap budget (in MB) that the benchmark cgroup can actually use. - GKE sets the per-container cgroup v2 ``memory.swap.max`` to 0, so even though - the node advertises a large swap device the container cannot page anything - out. Sizing stress-ng against the *node* swap total in that case guarantees - an OOM kill. This probe finds the swap budget of *our* cgroup so the caller - can size against reality. - - We locate our own cgroup from the host-mounted /sys by finding the - ``cgroup.procs`` file that lists this shell's PID — ``hostPID: true`` means - ``$$`` is a host-namespace PID that appears in those files, and the - kubectl-exec'd shell shares the container's cgroup with stress-ng. + GKE sets the per-container cgroup v2 ``memory.swap.max`` to 0, so even + though the node advertises a large swap device the container cannot page + anything out. This probe finds the swap budget of *our* cgroup so the + caller can size against reality. Returns: ``float('inf')`` when swap is uncapped (``max``); the limit in MB when capped to a finite value; ``0.0`` when swap is fully locked - (``memory.swap.max == 0``); ``-1.0`` when the limit could not be read (the - caller then falls back to the legacy node-total behaviour). + (``memory.swap.max == 0``); ``-1.0`` when the limit could not be read. """ probe = textwrap.dedent(""" mypid=$$ @@ -3246,39 +1774,41 @@ def _cgroup_swap_limit_mb(pod: str) -> float: done """) try: - out, _ = _pod_exec(pod, probe, timeout=20, ignore_failure=True) + out, _ = daemonset.PodExec(probe, timeout=20, ignore_failure=True) except Exception as e: # pylint: disable=broad-except logging.warning( - "[swap_encryption] cgroup swap-limit probe failed: %s", e + '[swap_encryption] cgroup swap-limit probe failed: %s', e ) return -1.0 - text = (out or "").strip() - m = re.search(r"V2=(\S+)", text) + text = (out or '').strip() + m = re.search(r'V2=(\S+)', text) if m: val = m.group(1) - if val == "max": - return float("inf") + if val == 'max': + return float('inf') try: return int(val) / (1024.0 * 1024.0) except ValueError: return -1.0 - # cgroup v1: the combined RAM+swap ceiling is memsw; swap budget = memsw-mem. - m = re.search(r"MEMSW=(\S+)\s+MEM=(\S+)", text) + m = re.search(r'MEMSW=(\S+)\s+MEM=(\S+)', text) if m: try: memsw = int(m.group(1)) mem = int(m.group(2)) except ValueError: return -1.0 - # A near-2^63 sentinel means "unlimited" in cgroup v1. if memsw >= (1 << 62): - return float("inf") + return float('inf') return max(0.0, (memsw - mem) / (1024.0 * 1024.0)) return -1.0 -def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: +def _autoscale_vm_bytes( + daemonset: _ds_mod.SwapDaemonSet, + vm_bytes: str, + degraded_reasons: list[str], +) -> str: """Ensure vm_bytes forces real swap I/O without hard-crashing the container. Strategy @@ -3287,42 +1817,24 @@ def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: overhead is actually measured. Two competing constraints apply: 1. vm_bytes must exceed available RAM so that anonymous pages are paged out - to the swap device. A value below ~95 % of RAM fits entirely in memory - and produces swap_out_pages_per_sec = 0 (benchmark defeats itself). + to the swap device. 2. vm_bytes must not be so large that the kernel OOM-kills the whole container before any meaningful swap activity is recorded. - Target formula - -------------- - target = RAM + min(swap_size × 0.25, 64 GB) - - This guarantees at least 25 % of the swap device is actively exercised - (measured swap I/O) while keeping the allocation safely within what the - kernel can page out given the available swap space. The 64 GB cap prevents - extremely large targets on machines with huge swap devices. - - On large-RAM machines (e.g. n4-highmem-32, 252 GB) the old 110%-of-RAM - formula only overflowed by ~25 GB; with sequential write64 patterns the - kernel handled that via LRU page eviction without actually hitting the swap - device, yielding swap_out = 0. The new formula forces a much larger working - set into swap. - - Hard ceiling - ------------ - Regardless of the formula, cap at RAM + swap_size - 4 GB (4 GB headroom) - to avoid exhausting the swap device and triggering kernel panics. + Target formula: target = RAM + min(swap_size x 0.25, 64 GB) + Hard ceiling: RAM + swap_size - 4 GB headroom. """ try: - meminfo_out, _ = _pod_exec(pod, "cat /proc/meminfo", timeout=15) + meminfo_out, _ = daemonset.PodExec('cat /proc/meminfo', timeout=15) node_ram_kb = 0 swap_total_kb = 0 for line in meminfo_out.splitlines(): - if line.startswith("MemTotal:"): + if line.startswith('MemTotal:'): parts = line.split() if len(parts) >= 2: node_ram_kb = int(parts[1]) - elif line.startswith("SwapTotal:"): + elif line.startswith('SwapTotal:'): parts = line.split() if len(parts) >= 2: swap_total_kb = int(parts[1]) @@ -3331,7 +1843,7 @@ def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: if node_ram_kb <= 0: logging.warning( - "[swap_encryption] Could not read MemTotal; using vm_bytes=%s", + '[swap_encryption] Could not read MemTotal; using vm_bytes=%s', vm_bytes, ) return vm_bytes @@ -3342,57 +1854,39 @@ def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: if requested_mb <= 0: return vm_bytes - # The node may advertise a large SwapTotal while THIS cgroup is forbidden - # from using it (GKE sets memory.swap.max=0 per container). Size against - # the swap the cgroup can actually reach, not the node total — otherwise a - # value like 32G OOM-kills the pod the instant it exceeds RAM. - cgroup_swap_mb = _cgroup_swap_limit_mb(pod) - usable_swap_mb = ( - swap_total_mb # default / legacy when probe is inconclusive - ) + cgroup_swap_mb = _cgroup_swap_limit_mb(daemonset) + usable_swap_mb = swap_total_mb if cgroup_swap_mb == 0.0: - # Swap is fully locked. Cap the working set just under RAM so the pod - # survives, and mark the run degraded: swap-encryption overhead cannot be - # measured when the cgroup cannot page out. safe_gb = max(1, int(node_ram_mb * 0.9 / 1024)) msg = ( - "cgroup swap is locked (memory.swap.max=0); the" - f" {swap_total_mb/1024:.0f} GB node swap device is unreachable." - f" Capping stress-ng vm_bytes {vm_bytes} → {safe_gb}G (0.9 x" - " RAM) to keep the pod alive — swap-encryption overhead will" - " NOT be measured this run" + 'cgroup swap is locked (memory.swap.max=0); the' + f' {swap_total_mb/1024:.0f} GB node swap device is unreachable.' + f' Capping stress-ng vm_bytes {vm_bytes} → {safe_gb}G (0.9 x' + ' RAM) to keep the pod alive — swap-encryption overhead will' + ' NOT be measured this run' ) - logging.error("[swap_encryption] %s", msg) - _degraded_reasons.append(msg) - return f"{safe_gb}G" - if 0.0 < cgroup_swap_mb < float("inf"): - # cgroup permits a finite swap budget smaller than the device. + logging.error('[swap_encryption] %s', msg) + degraded_reasons.append(msg) + return f'{safe_gb}G' + if 0.0 < cgroup_swap_mb < float('inf'): usable_swap_mb = min(swap_total_mb, cgroup_swap_mb) - # cgroup_swap_mb == inf -> swap fully usable (node total stands) - # cgroup_swap_mb == -1 -> undetermined; fall back to node total (legacy) - # Desired overflow: 25% of usable swap capped at 64 GB, minimum 4 GB. overflow_mb = max(min(usable_swap_mb * 0.25, 64.0 * 1024), 4.0 * 1024) target_mb = node_ram_mb + overflow_mb - # Hard ceiling: never exceed RAM + usable swap − 4 GB headroom. if usable_swap_mb > 0: ceiling_mb = node_ram_mb + usable_swap_mb - 4096.0 target_mb = min(target_mb, ceiling_mb) else: - # No usable swap at all (and not the locked-at-0 case handled above): - # keep the working set just under RAM. target_mb = min(target_mb, node_ram_mb * 0.9) - target_gb = max( - 1, int(target_mb / 1024) - ) # floor to GB for a clean flag + target_gb = max(1, int(target_mb / 1024)) if requested_mb < node_ram_mb * 0.95: - new_vm_bytes = f"{target_gb}G" + new_vm_bytes = f'{target_gb}G' logging.warning( - "[swap_encryption] Auto-scaling vm_bytes UP: %s → %s (RAM %.0f" - " GB, swap %.0f GB; original value would not trigger swap)", + '[swap_encryption] Auto-scaling vm_bytes UP: %s → %s (RAM %.0f' + ' GB, swap %.0f GB; original value would not trigger swap)', vm_bytes, new_vm_bytes, node_ram_mb / 1024, @@ -3401,10 +1895,10 @@ def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: return new_vm_bytes if requested_mb > target_mb: - new_vm_bytes = f"{target_gb}G" + new_vm_bytes = f'{target_gb}G' logging.warning( - "[swap_encryption] Capping vm_bytes DOWN: %s → %s (RAM %.0f GB," - " swap %.0f GB; original value risks swap exhaustion)", + '[swap_encryption] Capping vm_bytes DOWN: %s → %s (RAM %.0f GB,' + ' swap %.0f GB; original value risks swap exhaustion)', vm_bytes, new_vm_bytes, node_ram_mb / 1024, @@ -3415,118 +1909,85 @@ def _autoscale_vm_bytes(pod: str, vm_bytes: str) -> str: return vm_bytes except Exception as e: # pylint: disable=broad-except logging.warning( - "[swap_encryption] _autoscale_vm_bytes failed (%s); using %s", + '[swap_encryption] _autoscale_vm_bytes failed (%s); using %s', e, vm_bytes, ) return vm_bytes -def _get_stress_vm_method(pod: str) -> str: +def _get_stress_vm_method(daemonset: _ds_mod.SwapDaemonSet) -> str: """Detect the best --vm-method argument for stress-ng on this node. - stress-ng vm-method support varies by version and distro: - - Older Ubuntu / some GKE images: supports 'mmap' - - Newer Ubuntu on n4-highmem-32 (kernel 6.8+ GKE): 'mmap' removed; supports - 'write64', 'rand-set', etc. - - We prefer 'mmap' (lowest overhead, no kernel structure cycling), fall back to - 'write64' (simple sequential writes, universally available), then 'rand-set', - and if none are listed we return '' so callers omit the --vm-method flag - entirely (stress-ng then uses its compiled-in default). - - NOTE on forcing swap (two independent requirements): - (a) The working set must exceed RAM. Without --vm-keep each worker re-mmaps - and re-touches its full slice every iteration, so all - --swap_encryption_stress_vm_workers slices are simultaneously resident and - the combined footprint exceeds RAM (run 910c8da5 swapped ~10k pages/s with - write64 and no --vm-keep). Adding --vm-keep made stress-ng reuse one - quiescent mapping, the resident set plateaued below RAM, and the gate - fired — so we must NOT pass --vm-keep. - (b) The workers must stay BUSY for the whole phase. Do NOT pass --vm-hang 0: - stress-ng documents "--vm-hang 0" as "sleep for an INFINITE time before - unmapping", so each worker wrote its slice once and then slept for the - rest of the run — usr+sys CPU was ~10 s out of 300 s and si/so stayed 0 - (runs 14907cff, config1/111, even with KSM disabled and rand-set). - Omitting --vm-hang entirely lets the workers loop continuously, keeping - the slices hot so the over-RAM remainder pages to swap throughout. - - Result is cached in _stress_vm_method so the detection kubectl exec only runs - once per benchmark run. + Result is cached in _stress_vm_method so the detection kubectl exec only + runs once per benchmark run. """ if _stress_vm_method: return _stress_vm_method[0] try: - # stress-ng prints its valid vm-methods to stdout when given an invalid one. out, _, _ = kubectl.RunKubectlCommand( [ - "exec", - (_active_pod[0] if _active_pod else pod), - "-n", + 'exec', + daemonset.pod_name, + '-n', _DS_NAMESPACE, - "--", - "bash", - "-c", + '--', + 'bash', + '-c', ( - "stress-ng --vm 1 --vm-bytes 1M --vm-method __invalid__" - " --timeout 1s 2>&1 || true" + 'stress-ng --vm 1 --vm-bytes 1M --vm-method __invalid__' + ' --timeout 1s 2>&1 || true' ), ], raise_on_failure=False, timeout=15, ) combined = out.lower() - # Prefer rand-set: random access keeps every page of each worker's slice - # hot (no cold pages behind a sequential write pointer to reclaim) and - # writes non-identical data (so KSM cannot merge the workers' regions). - # write64 is sequential and was empirically reclaimed / merged, leaving the - # resident set below RAM and swap_out ~0. - if "rand-set" in combined: - method = "rand-set" - elif "mmap" in combined: - method = "mmap" - elif "write64" in combined: - method = "write64" + if 'rand-set' in combined: + method = 'rand-set' + elif 'mmap' in combined: + method = 'mmap' + elif 'write64' in combined: + method = 'write64' else: - method = "" # omit flag; use stress-ng default + method = '' logging.info( - "[swap_encryption] stress-ng vm-method detected: %r", - method or "(default)", + '[swap_encryption] stress-ng vm-method detected: %r', + method or '(default)', ) except Exception as e: # pylint: disable=broad-except logging.warning( - "[swap_encryption] vm-method detection failed (%s); using rand-set", + '[swap_encryption] vm-method detection failed (%s); using rand-set', e, ) - method = "rand-set" + method = 'rand-set' _stress_vm_method.append(method) return method -def _stress_vm_method_flag(pod: str) -> str: +def _stress_vm_method_flag(daemonset: _ds_mod.SwapDaemonSet) -> str: """Return the --vm-method flag string, or empty string if none.""" - method = _get_stress_vm_method(pod) - return f"--vm-method {method}" if method else "" + method = _get_stress_vm_method(daemonset) + return f'--vm-method {method}' if method else '' -def _phase2a_cpu_overhead(pod: str, base_meta: dict) -> list[sample.Sample]: +def _phase2a_cpu_overhead( + daemonset: _ds_mod.SwapDaemonSet, + base_meta: dict, + degraded_reasons: list[str], +) -> list[sample.Sample]: """Measure CPU cost of dm-crypt / Nitro while stress-ng drives swap I/O. If --swap_encryption_stress_vm_bytes_list is set the phase is run once per listed intensity value so that a full pressure-curve is captured (gap 5). Otherwise the single value from --swap_encryption_stress_vm_bytes is used. - - Auto-scaling: if the requested vm_bytes is less than 95% of node RAM, it is - automatically increased to 110% of node RAM so that swap is actually - triggered on large-RAM machines (e.g. n4-highmem-32 with 256 GB). """ - # Build the list of vm-bytes intensities to sweep (gap 5) if _STRESS_VM_BYTES_LIST.value.strip(): intensities = [ v.strip() - for v in _STRESS_VM_BYTES_LIST.value.split(",") + for v in _STRESS_VM_BYTES_LIST.value.split(',') if v.strip() ] else: @@ -3534,16 +1995,21 @@ def _phase2a_cpu_overhead(pod: str, base_meta: dict) -> list[sample.Sample]: results = [] for vm_bytes in intensities: - scaled = _autoscale_vm_bytes(pod, vm_bytes) + scaled = _autoscale_vm_bytes(daemonset, vm_bytes, degraded_reasons) logging.info( - "[swap_encryption] Phase 2a: stress-ng intensity %s", scaled + '[swap_encryption] Phase 2a: stress-ng intensity %s', scaled + ) + results += _run_cpu_overhead_sweep( + daemonset, base_meta, scaled, degraded_reasons ) - results += _run_cpu_overhead_sweep(pod, base_meta, scaled) return results def _run_cpu_overhead_sweep( - pod: str, base_meta: dict, vm_bytes: str + daemonset: _ds_mod.SwapDaemonSet, + base_meta: dict, + vm_bytes: str, + degraded_reasons: list[str], ) -> list[sample.Sample]: """Phase 2a stressor sweep, WITH RETRY for flaky swap. @@ -3554,23 +2020,22 @@ def _run_cpu_overhead_sweep( reclaim memory and re-run, keeping the BEST attempt. An OOM, or a peak at/above threshold, ends the retries immediately. """ - meta = dict(base_meta, phase="cpu_overhead", stress_vm_bytes=vm_bytes) + meta = dict(base_meta, phase='cpu_overhead', stress_vm_bytes=vm_bytes) timeout = _STRESS_TIMEOUT_SEC.value interval = 2 n_samples = timeout // interval + 10 - vmstat_log = f"/tmp/pkb_vmstat_{vm_bytes}.log" - pidstat_log = f"/tmp/pkb_pidstat_{vm_bytes}.log" + vmstat_log = f'/tmp/pkb_vmstat_{vm_bytes}.log' + pidstat_log = f'/tmp/pkb_pidstat_{vm_bytes}.log' workers = max(1, _STRESS_VM_WORKERS.value) per_worker = _per_worker_vm_bytes(vm_bytes, workers) min_so = _MIN_SWAP_OUT_PAGES.value - method_flag = _stress_vm_method_flag(pod) + method_flag = _stress_vm_method_flag(daemonset) max_attempts = 3 best = None for attempt in range(1, max_attempts + 1): t0 = time.time() - stress_out, _ = _pod_exec( - pod, + stress_out, _ = daemonset.PodExec( textwrap.dedent(f""" echo 2 > /sys/kernel/mm/ksm/run 2>/dev/null || true echo 0 > /sys/kernel/mm/ksm/run 2>/dev/null || true @@ -3594,14 +2059,16 @@ def _run_cpu_overhead_sweep( elapsed = time.time() - t0 completed_cleanly = ( - "successful run completed" in stress_out.lower() - or "metrics-brief" in stress_out.lower() - or "bogo-ops" in stress_out.lower() + 'successful run completed' in stress_out.lower() + or 'metrics-brief' in stress_out.lower() + or 'bogo-ops' in stress_out.lower() ) oom_killed = (not completed_cleanly) and elapsed < timeout * 0.8 - vmstat_out, _ = _pod_exec(pod, f"cat {vmstat_log}", ignore_failure=True) - pidstat_out, _ = _pod_exec( - pod, f"cat {pidstat_log}", ignore_failure=True + vmstat_out, _ = daemonset.PodExec( + f'cat {vmstat_log}', ignore_failure=True + ) + pidstat_out, _ = daemonset.PodExec( + f'cat {pidstat_log}', ignore_failure=True ) vmstat_samples = _parse_vmstat(vmstat_out, meta) swap_out_max = max( @@ -3609,26 +2076,26 @@ def _run_cpu_overhead_sweep( s.value for s in vmstat_samples if s.metric - in ("swap_out_pages_per_sec", "swap_out_pages_per_sec_max") + in ('swap_out_pages_per_sec', 'swap_out_pages_per_sec_max') ), default=0.0, ) bogo = None for line in stress_out.splitlines(): - mm = re.search(r"vm\s+\d+\s+(\d+)\s+\S+\s+bogo-ops", line) + mm = re.search(r'vm\s+\d+\s+(\d+)\s+\S+\s+bogo-ops', line) if mm: bogo = float(mm.group(1)) break logging.info( - "[swap_encryption] Phase 2a attempt %d/%d: peak swap-out " - "%.0f pages/s (completed=%s, oom=%s)", + '[swap_encryption] Phase 2a attempt %d/%d: peak swap-out ' + '%.0f pages/s (completed=%s, oom=%s)', attempt, max_attempts, swap_out_max, completed_cleanly, oom_killed, ) - if best is None or swap_out_max > best["swap_out_max"]: + if best is None or swap_out_max > best['swap_out_max']: best = dict( elapsed=elapsed, oom_killed=oom_killed, @@ -3641,15 +2108,14 @@ def _run_cpu_overhead_sweep( break if attempt < max_attempts: logging.warning( - "[swap_encryption] Phase 2a swap-out %.0f < %d threshold " - "— reclaiming and retrying (%d/%d)", + '[swap_encryption] Phase 2a swap-out %.0f < %d threshold ' + '— reclaiming and retrying (%d/%d)', swap_out_max, min_so, attempt + 1, max_attempts, ) - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(""" echo -1000 > /proc/self/oom_score_adj 2>/dev/null || true pkill -9 stress-ng 2>/dev/null || true @@ -3661,40 +2127,40 @@ def _run_cpu_overhead_sweep( # Emit samples from the BEST attempt. results = [ - sample.Sample("stress_ng_duration_sec", best["elapsed"], "s", meta), + sample.Sample('stress_ng_duration_sec', best['elapsed'], 's', meta), sample.Sample( - "stress_ng_completed", - 0.0 if best["oom_killed"] else 1.0, - "status", + 'stress_ng_completed', + 0.0 if best['oom_killed'] else 1.0, + 'status', meta, ), ] - if best["bogo"] is not None: + if best['bogo'] is not None: results.append( - sample.Sample("stress_ng_bogo_ops", best["bogo"], "ops", meta) + sample.Sample('stress_ng_bogo_ops', best['bogo'], 'ops', meta) ) - results += best["vmstat_samples"] - results += _parse_pidstat(best["pidstat_out"], meta) + results += best['vmstat_samples'] + results += _parse_pidstat(best['pidstat_out'], meta) # Swap-activity gate: a completed run that moved ~no pages to swap never - # exercised the encrypted swap path (the headline numbers would be hollow). - if best["oom_killed"]: + # exercised the encrypted swap path. + if best['oom_killed']: msg = ( - f"stress-ng (vm_bytes={vm_bytes}) was OOM-killed — the cgroup could" - " not page anonymous memory out to swap; swap-encryption overhead" - " was not measured" + f'stress-ng (vm_bytes={vm_bytes}) was OOM-killed — the cgroup could' + ' not page anonymous memory out to swap; swap-encryption overhead' + ' was not measured' ) - logging.error("[swap_encryption] %s", msg) - _degraded_reasons.append(msg) - elif best["swap_out_max"] < min_so: + logging.error('[swap_encryption] %s', msg) + degraded_reasons.append(msg) + elif best['swap_out_max'] < min_so: msg = ( - f"stress-ng (vm_bytes={vm_bytes}) peak swap-out was only " + f'stress-ng (vm_bytes={vm_bytes}) peak swap-out was only ' f'{best["swap_out_max"]:.0f} pages/s (< {min_so} threshold) after ' - f"{max_attempts} attempts — the working set never meaningfully " - f"paged to swap. Check vm_bytes vs RAM and the swap device" + f'{max_attempts} attempts — the working set never meaningfully ' + f'paged to swap. Check vm_bytes vs RAM and the swap device' ) - logging.error("[swap_encryption] %s", msg) - _degraded_reasons.append(msg) + logging.error('[swap_encryption] %s', msg) + degraded_reasons.append(msg) return results @@ -3708,10 +2174,9 @@ def _parse_vmstat(output: str, base_meta: dict) -> list[sample.Sample]: si=6, so=7 – swap-in / swap-out pages/s us=12 – user CPU % - sy=13 – system (kernel) CPU % ← gap 2: system time % + sy=13 – system (kernel) CPU % id=14 – idle CPU % wa=15 – I/O wait CPU % - total_active = us + sy + wa ← gap 1: total CPU utilisation """ si_vals, so_vals = [], [] us_vals, sy_vals, wa_vals = [], [], [] @@ -3732,7 +2197,7 @@ def _parse_vmstat(output: str, base_meta: dict) -> list[sample.Sample]: if not si_vals: return [] - meta = dict(base_meta, metric_source="vmstat") + meta = dict(base_meta, metric_source='vmstat') def _mean(lst): return sum(lst) / len(lst) if lst else 0.0 @@ -3743,26 +2208,22 @@ def _peak(lst): total_active = [u + s + w for u, s, w in zip(us_vals, sy_vals, wa_vals)] return [ - # Swap rates - sample.Sample("swap_in_pages_per_sec", _mean(si_vals), "pages/s", meta), + sample.Sample('swap_in_pages_per_sec', _mean(si_vals), 'pages/s', meta), sample.Sample( - "swap_in_pages_per_sec_max", _peak(si_vals), "pages/s", meta + 'swap_in_pages_per_sec_max', _peak(si_vals), 'pages/s', meta ), sample.Sample( - "swap_out_pages_per_sec", _mean(so_vals), "pages/s", meta + 'swap_out_pages_per_sec', _mean(so_vals), 'pages/s', meta ), sample.Sample( - "swap_out_pages_per_sec_max", _peak(so_vals), "pages/s", meta + 'swap_out_pages_per_sec_max', _peak(so_vals), 'pages/s', meta ), - # Total CPU utilisation (gap 1) - sample.Sample("total_cpu_pct_avg", _mean(total_active), "%", meta), - sample.Sample("total_cpu_pct_max", _peak(total_active), "%", meta), - # System (kernel) time % – encryption overhead signal (gap 2) - sample.Sample("system_time_pct_avg", _mean(sy_vals), "%", meta), - sample.Sample("system_time_pct_max", _peak(sy_vals), "%", meta), - # User and I/O-wait for completeness - sample.Sample("user_cpu_pct_avg", _mean(us_vals), "%", meta), - sample.Sample("iowait_cpu_pct_avg", _mean(wa_vals), "%", meta), + sample.Sample('total_cpu_pct_avg', _mean(total_active), '%', meta), + sample.Sample('total_cpu_pct_max', _peak(total_active), '%', meta), + sample.Sample('system_time_pct_avg', _mean(sy_vals), '%', meta), + sample.Sample('system_time_pct_max', _peak(sy_vals), '%', meta), + sample.Sample('user_cpu_pct_avg', _mean(us_vals), '%', meta), + sample.Sample('iowait_cpu_pct_avg', _mean(wa_vals), '%', meta), ] @@ -3781,39 +2242,26 @@ def _parse_pidstat(output: str, base_meta: dict) -> list[sample.Sample]: except (ValueError, IndexError): pass results = [] - meta = dict(base_meta, metric_source="pidstat") + meta = dict(base_meta, metric_source='pidstat') for proc, vals in cpu_by_proc.items(): m = dict(meta, process=proc) results += [ - sample.Sample(f"cpu_pct_avg_{proc}", sum(vals) / len(vals), "%", m), - sample.Sample(f"cpu_pct_max_{proc}", max(vals), "%", m), + sample.Sample(f'cpu_pct_avg_{proc}', sum(vals) / len(vals), '%', m), + sample.Sample(f'cpu_pct_max_{proc}', max(vals), '%', m), ] return results -def _launch_confined_bg_stress(pod: str, timeout_s: int, logfile: str) -> None: - """Launch the Phase 2b/3a background swap stressor confined to its OWN +def _launch_confined_bg_stress( + daemonset: _ds_mod.SwapDaemonSet, timeout_s: int, logfile: str +) -> None: + """Launch the Phase 2b background swap stressor confined to its OWN memory-capped cgroup, so it drives swap pressure WITHOUT starving the - concurrent foreground workload (fio / Redis) or OOM-killing the pod. - - On a small node (config1, 30 GB) a flat 32 GB stressor plus a concurrent - workload exhausts RAM faster than the kernel pages out, and the OOM killer - takes the foreground process (the under-pressure app_io fio died with - rc=137). Confining the stressor to memory.max = 60% of RAM (with unlimited - swap) makes it page within its own budget; the other ~40% of RAM stays free - for the workload, and if the stressor overruns its cap only IT is killed — - never the pod or the workload. - - Config-2 safety: on a 256 GB node, 60% = ~150 GB, far above the 32 GB - stressor, so the cap is never reached and behaviour is unchanged. - Best-effort: if the cgroup can't be created the stressor still runs in the - main cgroup (degrades to prior behaviour, not worse). MemTotal is read with - grep/cut (no awk) to keep this clear of f-string brace escaping. + concurrent foreground workload (fio) or OOM-killing the pod. """ - method = _stress_vm_method_flag(pod) + method = _stress_vm_method_flag(daemonset) vm_bytes = _STRESS_VM_BYTES.value - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" nohup bash -c ' BG=/sys/fs/cgroup/pkb_bgstress @@ -3832,25 +2280,19 @@ def _launch_confined_bg_stress(pod: str, timeout_s: int, logfile: str) -> None: ) -def _set_memory_high_guard(pod: str, fraction: float = 0.9) -> None: +def _set_memory_high_guard( + daemonset: _ds_mod.SwapDaemonSet, fraction: float = 0.9 +) -> None: """Cap the container cgroup ``memory.high`` at `fraction` x RAM. - Phases 2b (I/O interference) and 3a (Redis) run a background stressor *and* a - concurrent foreground workload (an 8 GB fio file / a Redis dataset). On a - small-RAM node (config1, 30 GB) their combined footprint exceeds RAM and the - hard OOM killer (``memory.max``) terminates the pod (rc=137), wiping out both - phases. ``memory.high`` is a soft limit: when the cgroup crosses it the - kernel reclaims and *swaps* aggressively (throttling the cgroup) instead of - killing it — which is exactly the swap pressure these phases want to create. - - Config-2 safety: this is a no-op in effect on large-RAM nodes. On - n4-highmem-32 (256 GB) the 32 GB background workload never approaches 0.9 x - 256 GB = 230 GB, so the soft limit is never crossed and behaviour is - unchanged. Phase 2a is deliberately NOT guarded (it works on both configs). + Phases 2b run a background stressor AND a concurrent foreground workload. + On a small-RAM node their combined footprint exceeds RAM and the hard OOM + killer terminates the pod. ``memory.high`` is a soft limit: when the + cgroup crosses it the kernel reclaims and swaps aggressively (throttling + the cgroup) instead of killing it. Best-effort; any failure is ignored. """ - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(f""" PKB_MCG=$(awk -F: '/^0::/{{print $3}}' /proc/self/cgroup 2>/dev/null) MT_KB=$(awk '/MemTotal/{{print $2}}' /proc/meminfo) @@ -3867,10 +2309,9 @@ def _set_memory_high_guard(pod: str, fraction: float = 0.9) -> None: ) -def _reset_memory_high_guard(pod: str) -> None: +def _reset_memory_high_guard(daemonset: _ds_mod.SwapDaemonSet) -> None: """Restore ``memory.high`` to ``max`` after a guarded phase.""" - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(""" PKB_MCG=$(awk -F: '/^0::/{print $3}' /proc/self/cgroup 2>/dev/null) if [ -n "$PKB_MCG" ] && [ -f "/sys/fs/cgroup$PKB_MCG/memory.high" ]; then @@ -3883,26 +2324,18 @@ def _reset_memory_high_guard(pod: str) -> None: ) -def _phase2b_io_interference(pod: str, base_meta: dict) -> list[sample.Sample]: +def _phase2b_io_interference( + daemonset: _ds_mod.SwapDaemonSet, base_meta: dict +) -> list[sample.Sample]: """Quantify drop in application I/O when swap is under simultaneous pressure.""" results = [] - # IMPORTANT: keep this OFF tmpfs. /tmp is RAM-backed (tmpfs/overlay), so an - # 8 GB fio file there consumes 8 GB of RAM and OOM-kills the pod on a small - # node (config1, rc=137 at "Laying out IO file") before any swap pressure is - # even applied. /mnt/stateful_partition is the node's persistent boot disk - # (hostPath mount) — the file lives on disk, not RAM, and the fio results - # then measure real disk I/O under swap pressure, which is the intent. - app_file = "/mnt/stateful_partition/pkb_app_io" + app_file = '/mnt/stateful_partition/pkb_app_io' timeout = _STRESS_TIMEOUT_SEC.value - meta = dict(base_meta, phase="io_interference") + meta = dict(base_meta, phase='io_interference') - # Relieve memory pressure via swap rather than the OOM killer (see helper). - # No-op on large-RAM nodes; prevents the config1 Phase 2b OOM (rc=137). - _set_memory_high_guard(pod) + _set_memory_high_guard(daemonset) - # Ensure fio is available — apt-get may have failed during DaemonSet init. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(""" command -v fio >/dev/null 2>&1 || { apt-get install -y -qq fio 2>/dev/null || true @@ -3912,15 +2345,7 @@ def _phase2b_io_interference(pod: str, base_meta: dict) -> list[sample.Sample]: timeout=120, ) - # Reclaim node memory BEFORE creating the test file. By this point Phase 2a - # has hard-swapped the node and Phase 3c's OpenSearch (which runs first) may - # have left a multi-GB JVM footprint; on a 30 GB node the file create then - # gets OOM-killed (rc=137) at the NODE level — which neither --direct=1 nor - # the cgroup memory.high guard can prevent (those are cgroup/page-cache - # tools, not node-eviction controls). Kill any leftover stressors/servers, - # flush dirty pages, and drop caches so the node starts Phase 2b clean. - _pod_exec( - pod, + daemonset.PodExec( textwrap.dedent(""" pkill -9 stress-ng 2>/dev/null || true pkill -9 -f 'opensearch|elasticsearch' 2>/dev/null || true @@ -3934,294 +2359,117 @@ def _phase2b_io_interference(pod: str, base_meta: dict) -> list[sample.Sample]: timeout=60, ) - # Create the test file on the persistent disk (see app_file note above). - # --direct=1 (O_DIRECT, ext4 supports it) bypasses the page cache. Size is - # kept at 4 GB (not 8) so the create + the concurrent background stressor - # cannot exhaust a 30 GB node even with swap already in use. - _pod_exec( - pod, + daemonset.PodExec( ( - f"fio --name=create --filename={app_file} " - "--rw=write --bs=1m --size=4G --verify=0 --direct=1" + f'fio --name=create --filename={app_file} ' + '--rw=write --bs=1m --size=4G --verify=0 --direct=1' ), timeout=600, ignore_failure=True, ) def _run_app_fio(pressure_label: str) -> list[sample.Sample]: - # --direct=1 (O_DIRECT) avoids page-cache buildup; ext4 on the persistent - # disk supports it. --size=4G matches the file created above. This - # measures the disk's I/O under swap pressure directly. cmd = ( - f"fio --name=app_io --filename={app_file} " - "--ioengine=libaio --direct=1 " - "--rw=randrw --bs=4k --iodepth=32 --size=4G --verify=0 " - "--time_based --runtime=60s --output-format=json" + f'fio --name=app_io --filename={app_file} ' + '--ioengine=libaio --direct=1 ' + '--rw=randrw --bs=4k --iodepth=32 --size=4G --verify=0 ' + '--time_based --runtime=60s --output-format=json' ) - # ignore_failure=True: fio rc=137 is expected when the pod is OOM-evicted - # under heavy swap pressure. _pod_exec handles recovery; callers rely on - # _parse_fio_json returning [] on empty/bad output rather than an exception. - out, _ = _pod_exec(pod, cmd, ignore_failure=True) + out, _ = daemonset.PodExec(cmd, ignore_failure=True) return _parse_fio_json( out, - "app_io", - f"App I/O ({pressure_label})", + 'app_io', + f'App I/O ({pressure_label})', dict(meta, pressure=pressure_label), ) - # 1. Baseline – no swap pressure - logging.info("[swap_encryption] I/O interference: baseline (no pressure)") - results += _run_app_fio("no_pressure") - - # 2. Under swap pressure - # Use nohup + disown so bash exits immediately after launching stress-ng; - # otherwise kubectl exec keeps the session alive until stress-ng finishes - # (300 s) and PKB's IssueCommand times out. - logging.info("[swap_encryption] I/O interference: under swap pressure") - # Confined background stressor: pages within a 60%-RAM cgroup so it can't - # OOM the concurrent app_io fio on a small node (see helper). - _launch_confined_bg_stress(pod, timeout, "/tmp/pkb_stress_io.log") - time.sleep(10) # let swap pressure build - results += _run_app_fio("with_swap_pressure") - - # Stop background stress-ng. If the pod was OOM-evicted while fio ran, - # stress-ng is already dead — kill is a no-op and we skip the long wait. - # _retries=0: no recovery here; the first Phase 3a command will recover - # the pod properly if needed (and it already waits for /tmp/pkb_ready). - _pod_exec( - pod, - "pkill -9 stress-ng 2>/dev/null || true", + logging.info('[swap_encryption] I/O interference: baseline (no pressure)') + results += _run_app_fio('no_pressure') + + logging.info('[swap_encryption] I/O interference: under swap pressure') + _launch_confined_bg_stress(daemonset, timeout, '/tmp/pkb_stress_io.log') + time.sleep(10) + results += _run_app_fio('with_swap_pressure') + + daemonset.PodExec( + 'pkill -9 stress-ng 2>/dev/null || true', ignore_failure=True, _retries=0, timeout=15, ) - _reset_memory_high_guard(pod) + _reset_memory_high_guard(daemonset) return results _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) - "c4-standard-8-lssd": 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD - "c4-standard-8": 0.5008, # 8 vCPU, 32 GB RAM, no LSSD - "n4-highmem-32": 3.0256, # 32 vCPU, 256 GB RAM - "n2-highmem-32": 2.5216, # 32 vCPU, 256 GB RAM - "n2-standard-32": 1.5264, # 32 vCPU, 120 GB RAM - "z3-highmem-8": 2.7248, # 8 vCPU + 4× LSSD + 'c4-standard-8-lssd': 0.5888, + 'c4-standard-8': 0.5008, + 'n4-highmem-32': 3.0256, + 'n2-highmem-32': 2.5216, + 'n2-standard-32': 1.5264, + 'z3-highmem-8': 2.7248, # AWS - "i4i.4xlarge": 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store - "i4i.2xlarge": 0.7480, - "m6id.4xlarge": 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store - "m6i.4xlarge": 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store - "r6i.4xlarge": 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store + 'i4i.4xlarge': 1.4960, + 'i4i.2xlarge': 0.7480, + 'm6id.4xlarge': 0.9072, + 'm6i.4xlarge': 0.7680, + 'r6i.4xlarge': 1.0080, } def _collect_cost_sample( - pod: str, elapsed_sec: float, base_meta: dict + daemonset: _ds_mod.SwapDaemonSet, + elapsed_sec: float, + base_meta: dict, ) -> list[sample.Sample]: - """Emit a cost_estimate_usd sample for the benchmark run (gap 7). - - Instance type is read from cloud metadata inside the pod. Price is looked - up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and - a warning is logged. - - Args: - pod: Benchmark pod name. - elapsed_sec: Wall-clock seconds the benchmark phases took. - base_meta: Shared metadata dict. + """Emit a cost_estimate_usd sample for the benchmark run.""" + instance_type = '' - Returns: - A list of zero or one sample.Sample. - """ - # Detect instance type from cloud metadata - instance_type = "" - - # GCP: machine type is the last segment of the metadata URL value - gcp_type_out, _ = _pod_exec( - pod, - "curl -s -m 3 --fail" - " http://metadata.google.internal/computeMetadata/v1/instance/machine-type" + gcp_type_out, _ = daemonset.PodExec( + 'curl -s -m 3 --fail' + ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) if gcp_type_out.strip(): - instance_type = gcp_type_out.strip().split("/")[-1] + instance_type = gcp_type_out.strip().split('/')[-1] if not instance_type: - # AWS: instance-type is a plain string - aws_type_out, _ = _pod_exec( - pod, - "curl -s -m 3 --fail " - "http://169.254.169.254/latest/meta-data/instance-type " + aws_type_out, _ = daemonset.PodExec( + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' '2>/dev/null || echo ""', ignore_failure=True, ) instance_type = aws_type_out.strip() - # Allow explicit override (useful when running on custom/renamed machine - # types or when the pod was unavailable during cost collection). if _INSTANCE_SIZE_LABEL.value: instance_type = _INSTANCE_SIZE_LABEL.value - # Last resort: fall back to the benchmark machine type flag. This ensures - # cost tracking works even when the pod was evicted before cost collection - # ran (in which case the metadata curl above returned empty). if not instance_type and _BENCHMARK_MACHINE_TYPE.value: instance_type = _BENCHMARK_MACHINE_TYPE.value logging.info( - "[swap_encryption] Instance type from metadata unavailable; using" - " --swap_encryption_benchmark_machine_type=%s for cost tracking", + '[swap_encryption] Instance type from metadata unavailable; using' + ' --swap_encryption_benchmark_machine_type=%s for cost tracking', instance_type, ) price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) if price is None: logging.warning( - '[swap_encryption] Unknown instance type "%s" – skipping cost' - " sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost" - " tracking.", + '[swap_encryption] Unknown instance type "%s" — skipping cost' + ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost' + ' tracking.', instance_type, ) return [] hours = elapsed_sec / 3600.0 - cost = hours * price meta = dict( base_meta, instance_type=instance_type, price_usd_per_hr=price, benchmark_elapsed_sec=round(elapsed_sec, 1), ) - return [sample.Sample("cost_estimate_usd", cost, "USD", meta)] - - -def _detect_swap_device(pod: str) -> str: - """Return the active swap device path on the cluster node.""" - if _SWAP_DEVICE.value: - return _SWAP_DEVICE.value - - # /proc/swaps is the source of truth: it lists the swap device that is - # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, - # because a stale dm-crypt mapping from a previous run on a reused node can - # still exist as a /dev node while being non-functional (fio/swapoff then - # fail with "No such device or address"). So read the active device from - # /proc/swaps first; only fall back to the mapper path if /proc/swaps is - # somehow empty but the mapper is genuinely present. - dm_out, _ = _pod_exec( - pod, - textwrap.dedent(""" - ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) - if [ -n "$ACTIVE" ] - then - echo "$ACTIVE" - elif test -e /dev/mapper/swap_encrypted - then - echo /dev/mapper/swap_encrypted - fi - """), - ignore_failure=True, - ) - dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else "" - if dev: - return dev - raise ValueError( - "No active swap device found in the benchmark pod. " - "Use --swap_encryption_device to specify one." - ) - - -def _build_metadata(pod: str, swap_dev: str) -> dict: - """Collect node environment, encryption type, and config into a dict.""" - - kernel_out, _ = _pod_exec(pod, "uname -r", ignore_failure=True) - mem_out, _ = _pod_exec( - pod, - "awk '/MemTotal/{print $2}' /proc/meminfo", - ignore_failure=True, - ) - swap_out, _ = _pod_exec( - pod, - "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", - ignore_failure=True, - ) - - try: - mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) - except ValueError: - mem_gb = 0 - try: - swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) - except ValueError: - swap_gb = 0 - - # Encryption type — key off dm-crypt presence + the swap target, NOT the - # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- - # encrypted; only the AWS targets (instance_store / io2) are. - enc = "unknown" - if "/dev/mapper/" in swap_dev: - table_out, _ = _pod_exec( - pod, - f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', - ignore_failure=True, - ) - enc = "dm-crypt-plain" if "crypt" in table_out.lower() else "dm-other" - elif _SWAP_TYPE.value in ("instance_store", "io2"): - enc = "nitro_hardware_offload" # AWS: encrypted by the Nitro card - elif not _ENABLE_DMCRYPT.value: - enc = "none" # GKE plain swap (encryption OFF) - - cloud = _detect_cloud(pod) - - # Gap 6: instance size label for multi-size comparison runs. - # If the flag is set use it directly; otherwise try to read it from - # cloud metadata so that the field is always populated. - instance_label = _INSTANCE_SIZE_LABEL.value - if not instance_label: - gcp_type_out, _ = _pod_exec( - pod, - "curl -s -m 3 --fail" - " http://metadata.google.internal/computeMetadata/v1/instance/machine-type" - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_type_out.strip(): - instance_label = gcp_type_out.strip().split("/")[-1] - if not instance_label: - aws_type_out, _ = _pod_exec( - pod, - "curl -s -m 3 --fail " - "http://169.254.169.254/latest/meta-data/instance-type " - '2>/dev/null || echo ""', - ignore_failure=True, - ) - instance_label = aws_type_out.strip() - - return { - "benchmark": BENCHMARK_NAME, - "execution_mode": "kubernetes_privileged_pod", - "cloud": cloud, - "instance_size": instance_label, - "kernel_version": kernel_out.strip(), - "host_memory_gb": mem_gb, - "swap_device": swap_dev, - "swap_size_gb": swap_gb, - "swap_encryption": enc, - # Test-matrix columns: storage target, encryption on/off, image, IOPS - "storage_target": _SWAP_TYPE.value, - "boot_disk_type": _BOOT_DISK_TYPE.value, - "dmcrypt_enabled": _ENABLE_DMCRYPT.value, - "node_image_type": _NODE_IMAGE_TYPE.value, - "boot_disk_iops_target": _BOOT_DISK_IOPS.value, - "benchmark_machine_type": _BENCHMARK_MACHINE_TYPE.value, - # Other config - "zswap_enabled": _ENABLE_ZSWAP.value, - "min_free_kbytes": _MIN_FREE_KBYTES.value, - "fio_runtime_sec": _FIO_RUNTIME_SEC.value, - # Requested config value only. The *effective* stress-ng footprint may - # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the - # actual value it ran with as 'stress_vm_bytes' so the two never conflict. - "stress_vm_bytes_requested": _STRESS_VM_BYTES.value, - "stress_vm_bytes_list": _STRESS_VM_BYTES_LIST.value, - "stress_timeout_sec": _STRESS_TIMEOUT_SEC.value, - "nodepool": _NODEPOOL.value, - } + return [sample.Sample('cost_estimate_usd', hours * price, 'USD', meta)] From bce82f79270916378106f03e7e19b14d76a1d54f Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 17:30:41 +0530 Subject: [PATCH 8/8] refactor(swap_encryption/pr4): thin benchmark + Phase 2 CPU/IO overhead Inherits PR1 framework changes (swap_config as NodepoolSpec field). - Prepare(): deploy SwapDaemonSet + _delete_default_pool(cluster) - Phase 2: stress-ng vmstat/pidstat sampling (kswapd/kcryptd CPU%) - DaemonSet: adds stress-ng + build tools, pre-fetches kernel source --- .../swap_encryption_benchmark.py | 147 ++++++++---------- 1 file changed, 62 insertions(+), 85 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index cd867b8234..f4399dc8a6 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -39,7 +39,6 @@ Infrastructure lifecycle lives in two BaseResource subclasses: - SwapNodePool (perfkitbenchmarker/resources/container_service/swap_nodepool.py) _Create(): gcloud container node-pools create with linuxConfig.swapConfig + sysctl via --system-config-from-file; waits for node Ready; optionally creates and attaches a dedicated swap disk. @@ -88,7 +87,6 @@ from perfkitbenchmarker import sample from perfkitbenchmarker.resources.container_service import kubectl from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod -from perfkitbenchmarker.resources.container_service import swap_nodepool as _np_mod FLAGS = flags.FLAGS @@ -105,26 +103,34 @@ BENCHMARK_CONFIG = """ swap_encryption: description: > - GKE vs. EKS swap encryption and LSSD performance comparison. - Two-step nodepool setup: PKB provisions a minimal cluster with a cheap - default nodepool (Step 1), then Prepare() adds the real benchmark - nodepool (n4-highmem-32 / c4-*-lssd, UBUNTU_CONTAINERD, 80k IOPS) with a - node-level startup script that configures dm-crypt swap before any pod - is scheduled, then removes the default nodepool (Step 2). All benchmark - phases run inside a privileged DaemonSet pinned to the benchmark nodepool. - flags: {} + CPU/IO overhead benchmarks (Tier 2) on swap-encrypted GKE/EKS nodes. Swap-enabled 'benchmark' nodepool declared in BENCHMARK_CONFIG; + GKE cluster creation applies --system-config-from-file (dm-crypt swapConfig) + automatically via swap_config field on NodepoolSpec. container_cluster: + cloud: GCP type: Kubernetes vm_count: 1 vm_spec: GCP: - # Cheap placeholder — the benchmark nodepool is created in Prepare(). machine_type: e2-medium boot_disk_size: 20 - AWS: - # Cheap placeholder — the benchmark nodegroup is added in Prepare(). - machine_type: t3.medium - boot_disk_size: 20 + zone: us-central1-a + nodepools: + benchmark: + vm_count: 1 + vm_spec: + GCP: + machine_type: n4-highmem-32 + boot_disk_type: hyperdisk-balanced + boot_disk_size: 500 + zone: us-central1-a + swap_config: + enabled: true + swappiness: 100 + min_free_kbytes: 200 + watermark_scale_factor: 500 + boot_disk_iops: 160000 + boot_disk_throughput: 2400 """ @@ -465,60 +471,20 @@ def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: def Prepare(spec: _BenchmarkSpec) -> None: """Two-step nodepool setup then DaemonSet deployment. - Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap - e2-medium default nodepool. - - Step 2 (this function): - a. GCP: Create SwapNodePool (benchmark nodepool + optional swap disk). - EKS: label existing nodes with pkb_nodepool=benchmark. - b. Create SwapDaemonSet: deploy manifest + wait for Running + sentinel. - c. GCP: DeleteDefaultPool() — safe now that DaemonSet pod is Running. - d. GCP: re-resolve pod name in case default-pool deletion evicts the pod. + PKB cluster creation automatically provisions the swap-enabled 'benchmark' + nodepool (swap_config in BENCHMARK_CONFIG). This function only: + 1. Deploys the privileged SwapDaemonSet and waits for Running. + 2. Deletes the cheap e2-medium default-pool (required at cluster create). - Both resources are appended to spec.resources for auto-cleanup. + DaemonSet is appended to spec.resources for PKB auto-cleanup. """ cluster = spec.container_cluster - is_gcp = getattr(cluster, 'project', None) is not None - - if is_gcp: - # ── Step 2a (GCP): create benchmark nodepool + wait for node ────────── - logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') - nodepool = _np_mod.SwapNodePool( - cluster=cluster, - machine_type=_BENCHMARK_MACHINE_TYPE.value, - node_image_type=_NODE_IMAGE_TYPE.value, - disk_type=_BOOT_DISK_TYPE.value, - disk_size_gb=_BOOT_DISK_SIZE_GB.value, - disk_iops=_BOOT_DISK_IOPS.value, - disk_throughput=_BOOT_DISK_THROUGHPUT.value, - lssd=_BENCHMARK_LSSD.value, - lssd_count=_LSSD_COUNT.value, - add_swap_disk=_ADD_SWAP_DISK.value, - swap_disk_size_gb=_SWAP_DISK_SIZE_GB.value, - ) - nodepool.Create() - spec.resources.append(nodepool) - else: - # ── Step 2a (EKS): label existing nodes to match DaemonSet selector ── - logging.info( - '[swap_encryption] EKS cluster — labelling existing nodes with' - ' pkb_nodepool=%s so the DaemonSet nodeSelector matches.', - _BENCHMARK_NODEPOOL, - ) - kubectl.RunKubectlCommand([ - 'label', - 'nodes', - '--all', - '--overwrite', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - ]) - _ensure_io2_volume() - - # ── Step 2b: deploy DaemonSet and wait for pod ──────────────────────────── - # Deploy BEFORE deleting the default pool: deleting the default pool while - # the benchmark node is still joining causes a brief API-server I/O timeout. - # The pod being Running means the cluster is fully stable. - logging.info('[swap_encryption] Step 2b: deploying privileged DaemonSet') + + # The swap-enabled 'benchmark' nodepool is already provisioned by GKE + # cluster creation (swap_config declared in BENCHMARK_CONFIG). + # Prepare() only deploys the privileged DaemonSet + deletes the cheap + # e2-medium default pool that GKE requires at cluster creation time. + logging.info('[swap_encryption] Deploying privileged DaemonSet') daemonset = _ds_mod.SwapDaemonSet( name=_DS_NAME, namespace=_DS_NAMESPACE, @@ -528,28 +494,13 @@ def Prepare(spec: _BenchmarkSpec) -> None: ) daemonset.Create() spec.resources.append(daemonset) + logging.info('[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name) + _delete_default_pool(cluster) + daemonset.WaitForPod() logging.info( - '[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name + '[swap_encryption] Benchmark pod (post-deletion): %s', daemonset.pod_name ) - # ── Step 2c+d (GCP): delete dummy default nodepool, re-resolve pod name ── - if is_gcp: - logging.info( - '[swap_encryption] Step 2c: deleting dummy default nodepool' - ) - nodepool.DeleteDefaultPool() - # The pod may be evicted and rescheduled with a new name during the - # default nodepool deletion. Re-resolve to avoid stale references. - logging.info( - '[swap_encryption] Step 2d: re-resolving benchmark pod after' - ' nodepool deletion' - ) - daemonset.WaitForPod() - logging.info( - '[swap_encryption] Benchmark pod (post-deletion): %s', - daemonset.pod_name, - ) - def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: """Execute all benchmark phases with gate logic. @@ -713,6 +664,32 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: return results + +def _delete_default_pool(cluster) -> None: + """Delete the dummy e2-medium default-pool once the benchmark pod is Running. + + GKE requires at least one nodepool at cluster creation time; the e2-medium + default-pool satisfies that requirement. Deleting it before the DaemonSet + pod is Running can trigger a brief API-server timeout while two concurrent + nodepool operations are in progress. + """ + try: + cmd = cluster._GcloudCommand( # pylint: disable=protected-access + 'container', 'node-pools', 'delete', _DEFAULT_POOL, + '--cluster', cluster.name, + ) + cmd.args.append('--quiet') + logging.info('[swap_encryption] Deleting default nodepool: %s', _DEFAULT_POOL) + _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) + if rc != 0: + logging.warning( + '[swap_encryption] Could not delete default nodepool (rc=%d): %s', + rc, stderr, + ) + else: + logging.info('[swap_encryption] Default nodepool deleted') + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] _delete_default_pool failed: %s', e) def Cleanup(spec: _BenchmarkSpec) -> None: """Resources in spec.resources are auto-deleted by the PKB framework.