Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions perfkitbenchmarker/benchmark_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from perfkitbenchmarker.configs import vm_group_decoders
from perfkitbenchmarker.resources import ai_agent_service
from perfkitbenchmarker.resources import base_job
from perfkitbenchmarker.resources import agent_sandbox
from perfkitbenchmarker.resources import example_resource
from perfkitbenchmarker.resources import managed_ai_model
from perfkitbenchmarker.resources.container_service import container_cluster
Expand Down Expand Up @@ -202,6 +203,7 @@ def __init__(
self.base_job = None
self.edw_service = None
self.edw_compute_resource = None
self.agent_sandbox = None
self.example_resource = None
self.multi_attach_disk = None
self.nfs_service = None
Expand Down Expand Up @@ -337,6 +339,7 @@ def ConstructResources(self):
# Put registry first, as it can be needed by cluster.
self.ConstructContainerRegistry()
self.ConstructContainerCluster()
self.ConstructAgentSandbox()
# dpb service needs to go first, because it adds some vms.
self.ConstructDpbService()
self.ConstructCluster()
Expand Down Expand Up @@ -589,6 +592,19 @@ def ConstructExampleResource(self):
) # pytype: disable=not-instantiable
self.resources.append(self.example_resource)

def ConstructAgentSandbox(self):
  """Builds the agent sandbox resource when the config declares one.

  Does nothing if the benchmark config has no agent_sandbox section. The
  sandbox is built on top of the already-constructed container cluster and,
  when the factory returns a truthy object, is registered in self.resources.

  Raises:
    errors.Config.InvalidValue: agent_sandbox is configured but no
      container_cluster has been constructed.
  """
  sandbox_config = self.config.agent_sandbox
  if sandbox_config is None:
    return

  cluster = self.container_cluster
  if cluster is None:
    raise errors.Config.InvalidValue(
        'agent_sandbox requires a container_cluster to be configured.')

  sandbox = agent_sandbox.GetAgentSandbox(sandbox_config, cluster)
  self.agent_sandbox = sandbox
  # The factory result is checked for truthiness before it is registered.
  if sandbox:
    self.resources.append(sandbox)

def ConstructBaseJob(self):
"""Create an instance of the base job.It is also called from pkb.py."""
if self.config.base_job is None:
Expand Down Expand Up @@ -1057,6 +1073,8 @@ def Provision(self):

if self.container_cluster:
self.container_cluster.Create()
if self.agent_sandbox:
self.agent_sandbox.Create()

# do after network setup but before VM created
if self.nfs_service and self.nfs_service.CLOUD != nfs_service.UNMANAGED:
Expand Down Expand Up @@ -1207,6 +1225,8 @@ def Delete(self):
self.edw_service.Delete()
if hasattr(self, 'edw_compute_resource') and self.edw_compute_resource:
self.edw_compute_resource.Delete()
if self.agent_sandbox:
self.agent_sandbox.Delete()
if self.example_resource:
self.example_resource.Delete()
if self.base_job:
Expand Down
5 changes: 5 additions & 0 deletions perfkitbenchmarker/configs/benchmark_config_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from perfkitbenchmarker.configs import spec
from perfkitbenchmarker.configs import vm_group_decoders
from perfkitbenchmarker.resources import ai_agent_service_spec
from perfkitbenchmarker.resources import agent_sandbox_spec
from perfkitbenchmarker.resources import example_resource_spec
from perfkitbenchmarker.resources import jobs_setter
from perfkitbenchmarker.resources import managed_ai_model_spec
Expand Down Expand Up @@ -1488,6 +1489,10 @@ def _GetOptionDecoderConstructions(cls):
'tpu_groups': (_TpuGroupsDecoder, {'default': {}}),
'edw_compute_resource': (_EdwComputeResourceDecoder, {'default': None}),
'edw_service': (_EdwServiceDecoder, {'default': None}),
'agent_sandbox': (
agent_sandbox_spec.AgentSandboxConfigDecoder,
{'default': None, 'none_ok': True},
),
'example_resource': (_ExampleResourceDecoder, {'default': None}),
'base_job': (_BaseJobDecoder, {'default': None}),
'memory_store': (_MemoryStoreDecoder, {'default': None}),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Privileged DaemonSet that runs an init container to install runsc and the
# containerd-runsc shim onto the host, then sleeps as a pause container.
#
# The actual install logic comes from a ConfigMap named gvisor-installer-script
# (created by install_gvisor() in resources/kubernetes/k8s_agent_sandbox.py from
# data/agent_sandbox/gvisor-installer/install.sh before this DaemonSet is
# applied). The ConfigMap key is "install.sh", mounted at /scripts.
#
# nodeSelector and tolerations are injected at apply time (see
# _render_gvisor_daemonset in resources/kubernetes/k8s_agent_sandbox.py) so the
# DaemonSet targets the sandbox node pool via the pkb_nodepool label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: gvisor-installer
namespace: kube-system
labels:
app.kubernetes.io/name: gvisor-installer
spec:
selector:
matchLabels:
app.kubernetes.io/name: gvisor-installer
template:
metadata:
labels:
app.kubernetes.io/name: gvisor-installer
spec:
hostPID: true
initContainers:
- name: install
image: docker.io/library/ubuntu:24.04
imagePullPolicy: IfNotPresent
securityContext:
privileged: true
env:
- name: GVISOR_VERSION
# Pinned for benchmarking. Update in lockstep across all envs.
# Verify available releases at https://gvisor.dev/docs/user_guide/install/
value: "20260511"
command: ["/bin/bash", "/scripts/install.sh"]
volumeMounts:
- name: host
mountPath: /host
- name: script
mountPath: /scripts
readOnly: true
containers:
# Pause container keeps the DaemonSet "Running" after install completes.
- name: pause
image: registry.k8s.io/pause:3.9
resources:
requests:
cpu: 10m
memory: 16Mi
limits:
cpu: 50m
memory: 64Mi
volumes:
- name: host
hostPath:
path: /
- name: script
configMap:
name: gvisor-installer-script
defaultMode: 0755
81 changes: 81 additions & 0 deletions perfkitbenchmarker/data/agent_sandbox/gvisor-installer/install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash
# Installs gVisor (runsc and containerd-shim-runsc-v1) onto the node whose root
# filesystem is mounted at /host, registers the runsc runtime in containerd's
# config, and restarts containerd only when something actually changed.
#
# Required environment:
#   GVISOR_VERSION  gVisor release directory name to download from.
#
# The script is written to be re-runnable: every step checks for its own
# result first, and files are staged and renamed so that a failed run never
# leaves a half-written binary or config for the next run to trust.
set -euxo pipefail

: "${GVISOR_VERSION:?must be set}"
HOST=/host
ARCH=$(uname -m)
URL="https://storage.googleapis.com/gvisor/releases/release/${GVISOR_VERSION}/${ARCH}"

# curl fetches the release artifacts; util-linux provides nsenter.
apt-get update -qq
apt-get install -y -qq curl util-linux

NEEDS_RESTART=0

# On COS nodes /usr/local/bin is read-only; binaries live on the writable
# stateful partition at /home/kubernetes/bin. On all other nodes (Ubuntu,
# Amazon Linux) /usr/local/bin is writable and already on PATH.
if [ -d "${HOST}/home/kubernetes" ]; then
  INSTALL_DIR="${HOST}/home/kubernetes/bin"
  NEEDS_PATH_DROPIN=1
else
  INSTALL_DIR="${HOST}/usr/local/bin"
  NEEDS_PATH_DROPIN=0
fi
mkdir -p "${INSTALL_DIR}"

for bin in runsc containerd-shim-runsc-v1; do
  TARGET="${INSTALL_DIR}/${bin}"
  if [ ! -x "${TARGET}" ]; then
    curl -fsSL "${URL}/${bin}" -o "${TARGET}.new"
    # These binaries end up running on the host, so refuse to install
    # anything that does not match the checksum published next to the
    # release. Only the hash field is taken from the .sha512 file so the check
    # does not depend on the filename recorded inside it. A mismatch (or an
    # empty/missing checksum) aborts the script via `set -e`, leaving only the
    # non-executable ".new" staging file behind.
    EXPECTED_SHA512=$(curl -fsSL "${URL}/${bin}.sha512" | cut -d ' ' -f 1)
    echo "${EXPECTED_SHA512}  ${TARGET}.new" | sha512sum -c -
    chmod +x "${TARGET}.new"
    # Rename within the same directory so the final path is never partial.
    mv "${TARGET}.new" "${TARGET}"
    NEEDS_RESTART=1
  fi
done

# On COS, /home/kubernetes/bin is not on systemd's default PATH; drop in a
# unit override for containerd so the shim is found. Not needed on non-COS
# nodes where /usr/local/bin is already on PATH.
if [ "${NEEDS_PATH_DROPIN}" -eq 1 ]; then
  DROPIN_DIR="${HOST}/etc/systemd/system/containerd.service.d"
  DROPIN="${DROPIN_DIR}/10-runsc-path.conf"
  mkdir -p "${DROPIN_DIR}"
  if [ ! -f "${DROPIN}" ]; then
    cat > "${DROPIN}" <<'EOF'
[Service]
Environment="PATH=/home/kubernetes/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
EOF
    NEEDS_RESTART=1
  fi
fi

# Register the runsc runtime with containerd.
CONFIG="${HOST}/etc/containerd/config.toml"
if [ ! -f "${CONFIG}" ]; then
  mkdir -p "$(dirname "${CONFIG}")"
  # Generate into a staging file and rename. Redirecting straight into
  # ${CONFIG} would create the file even when the command fails, and a retry
  # would then treat the empty file as a valid existing config.
  nsenter -t 1 -m -u -i -n -p -- containerd config default > "${CONFIG}.new"
  mv "${CONFIG}.new" "${CONFIG}"
fi
if ! grep -q 'io.containerd.runsc.v1' "${CONFIG}"; then
  # containerd v2+ uses config version 3 where the CRI runtime plugin moved
  # from io.containerd.grpc.v1.cri to io.containerd.cri.v1.runtime.
  # Appending to the wrong section is silently ignored, leaving runsc
  # unconfigured even though the binary is installed.
  # The pattern is anchored to the whole top-level key so that values such as
  # "version = 30" or keys that merely end in "version" do not match.
  if grep -Eq '^[[:space:]]*version[[:space:]]*=[[:space:]]*3([[:space:]]|$)' "${CONFIG}"; then
    CRI_PLUGIN='io.containerd.cri.v1.runtime'
  else
    CRI_PLUGIN='io.containerd.grpc.v1.cri'
  fi
  cat >>"${CONFIG}" <<TOML

[plugins."${CRI_PLUGIN}".containerd.runtimes.runsc]
runtime_type = "io.containerd.runsc.v1"
TOML
  NEEDS_RESTART=1
fi

# Restart containerd only if this run changed a binary, the drop-in, or the
# config; daemon-reload picks up a newly written unit override first.
if [ "${NEEDS_RESTART}" -eq 1 ]; then
  nsenter -t 1 -m -u -i -n -p -- systemctl daemon-reload
  nsenter -t 1 -m -u -i -n -p -- systemctl restart containerd
fi

echo "gVisor self-install complete on $(uname -n)."
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
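# RuntimeClass that self-managed sandbox pods reference via
# runtimeClassName: runsc. The handler "runsc" matches the runtime name
# registered in containerd's config by the installer DaemonSet. The CRI
# plugin section it is registered under depends on the containerd config
# version:
#   [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc]     (config version < 3)
#   [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runsc]  (config version 3)
#     runtime_type = "io.containerd.runsc.v1"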
#
# Named "runsc" (not "gvisor") because GKE Standard ships a pre-installed
# "gvisor" RuntimeClass with handler "gvisor" and addonmanager mode
# Reconcile, so we can't own that name. Platform-managed scenarios use
# GKE's "gvisor" RC and don't need this manifest.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: runsc
handler: runsc
55 changes: 55 additions & 0 deletions perfkitbenchmarker/data/agent_sandbox/load_runner_job.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
apiVersion: batch/v1
kind: Job
metadata:
name: agent-sandbox-load-runner
namespace: {{ namespace }}
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
metadata:
labels:
app: agent-sandbox-load-runner
spec:
restartPolicy: Never
serviceAccountName: agent-sandbox-load-runner
containers:
- name: runner
image: python:3.12-slim
command: ["bash", "-c"]
args:
- |
set -euo pipefail
mkdir -p /tmp/results
pip install --no-cache-dir --quiet kubernetes
python /opt/load-runner/load_runner.py \
--namespace {{ namespace }} \
--template-name {{ template_name }} \
--max-concurrent {{ max_concurrent }} \
--workload-duration {{ workload_duration }} \
{% if qps is not none %}--qps {{ qps }} \
{% endif %}{% if total is not none %}--total {{ total }} \
{% endif %}{% if duration is not none %}--duration {{ duration }} \
{% endif %}{% if ready_timeout is not none %}--ready-timeout {{ ready_timeout }} \
{% endif %}--output /tmp/results/run.jsonl
echo '---RESULTS---'
cat /tmp/results/run.jsonl
resources:
requests:
cpu: "1"
memory: "1Gi"
limits:
cpu: "2"
memory: "2Gi"
volumeMounts:
- name: script
mountPath: /opt/load-runner
readOnly: true
- name: results
mountPath: /tmp/results
volumes:
- name: script
configMap:
name: agent-sandbox-load-runner-script
- name: results
emptyDir: {}
41 changes: 41 additions & 0 deletions perfkitbenchmarker/data/agent_sandbox/load_runner_rbac.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: agent-sandbox-load-runner
namespace: {{ namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: agent-sandbox-load-runner
namespace: {{ namespace }}
rules:
- apiGroups: ["extensions.agents.x-k8s.io"]
resources: ["sandboxclaims"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["extensions.agents.x-k8s.io"]
resources: ["sandboxtemplates"]
verbs: ["get", "list", "watch"]
- apiGroups: ["agents.x-k8s.io"]
resources: ["sandboxes"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods/exec"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: agent-sandbox-load-runner
namespace: {{ namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: agent-sandbox-load-runner
subjects:
- kind: ServiceAccount
name: agent-sandbox-load-runner
namespace: {{ namespace }}
44 changes: 44 additions & 0 deletions perfkitbenchmarker/data/agent_sandbox/sandbox-template.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Reusable blueprint for the sandboxes that SandboxClaim will provision.
# Pod-shape values come from the sandbox_template config block; the defaults
# below match the original hardcoded template.
#
# The security/placement fields below are REQUIRED by GKE's
# secure-sandbox-policy ValidatingAdmissionPolicy on Agent-Sandbox-enabled
# clusters. They are harmless on clusters that do not run the policy.
apiVersion: extensions.agents.x-k8s.io/v1beta1
kind: SandboxTemplate
metadata:
name: {{ name }}
spec:
podTemplate:
metadata:
labels:
{% for k, v in labels.items() %} {{ k }}: "{{ v }}"
{% endfor %} spec:
runtimeClassName: {{ runtime_class }}
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
containers:
- name: python-runtime
image: {{ image }}
ports:
- containerPort: 8888
readinessProbe:
httpGet:
path: "/"
port: 8888
initialDelaySeconds: 0
periodSeconds: 1
securityContext:
capabilities:
drop: ["ALL"]
resources:
requests:
cpu: "{{ cpu_request }}"
memory: "{{ memory_request }}"
ephemeral-storage: "256Mi"
limits:
cpu: "{{ cpu_limit }}"
memory: "{{ memory_limit }}"
restartPolicy: "OnFailure"
11 changes: 11 additions & 0 deletions perfkitbenchmarker/data/agent_sandbox/sandbox-warmpool.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Pre-warmed pool of sandbox pods. Replenishment latency under contention is
# one of the primary metrics for the benchmark -- keep replicas identical
# across envs so claim throughput is the only variable.
apiVersion: extensions.agents.x-k8s.io/v1beta1
kind: SandboxWarmPool
metadata:
name: {{ name }}
spec:
replicas: {{ replicas }}
sandboxTemplateRef:
name: {{ name }}
Loading