Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
/.idea
/*git_ignore*
.DS_Store
.adk
tmp/
171 changes: 171 additions & 0 deletions perfkitbenchmarker/data/k8s_agents/config/gke-benchmark.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/bin/bash
#
# Agentic Workload Benchmarking configuration file for GKE
# Adapted from nginx DPv2 baseline for Python Sandbox & Chromium Simulation
#
# Override machine type and cluster suffix via environment variables:
# MACHINE_TYPE=c4d-standard-8 CLUSTER_SUFFIX=c4d bash setup_infrastructure_gke.sh
#
# Supported profiles:
# MACHINE_TYPE=c3-standard-192-metal CLUSTER_SUFFIX=c3metal
# MACHINE_TYPE=c4-standard-8 CLUSTER_SUFFIX=c4 (default)
# MACHINE_TYPE=c4d-standard-8 CLUSTER_SUFFIX=c4d
# MACHINE_TYPE=c4a-standard-8 CLUSTER_SUFFIX=c4a (ARM64)

# Per-user prefix for every resource name derived below: the login name up to
# (not including) the first '.', e.g. "jane.doe" -> "jane".
# $USER is often unset or empty in containers and CI; fall back to `id -un`
# there so the derived names (e.g. "<prefix>-agentic-vpc") never begin with a
# bare "-".
USER_NAME_PREFIX="${USER:-$(id -un)}"
USER_NAME_PREFIX="${USER_NAME_PREFIX%%.*}"

# GCP project and location.
# PROJECT_ID is a placeholder: it MUST be replaced with a real project before
# running any script that uses this file.
PROJECT_ID="your-project-id"
ZONE="us-central1-a"
REGION="us-central1"

# Aliases under the names Google/ADK tooling expects, always derived from the
# canonical values above. Used by envsubst when rendering the K8s manifest and
# by the ADK agent.
GOOGLE_CLOUD_LOCATION="${REGION}"
GOOGLE_CLOUD_PROJECT="${PROJECT_ID}"

# Network Configuration
VPC_NAME="${USER_NAME_PREFIX}-agentic-vpc"
SUBNET_NAME="${USER_NAME_PREFIX}-agentic-subnet"
SUBNET_CIDR="10.134.20.0/24"

# LAPTOP_IP: PUBLIC IPv4 used to access the target, expressed as a /32 CIDR.
# A value already present in the environment is kept, so it can be pinned when
# auto-detection is wrong or impossible (VPN, no outbound internet, CI):
#   export LAPTOP_IP="203.0.113.7/32"
# Otherwise it is detected dynamically via ifconfig.me:
#   -4            force IPv4 so the "/32" suffix is a valid prefix length
#   -f            treat HTTP error responses as failures, not as an "IP"
#   --max-time 5  never hang whoever sources this file
# If detection fails, LAPTOP_IP is left empty (instead of the malformed
# "/32") and a warning is written to stderr.
if [ -z "${LAPTOP_IP:-}" ]; then
  _DETECTED_PUBLIC_IP="$(curl -4 -fs --max-time 5 ifconfig.me 2>/dev/null || true)"
  if [ -n "${_DETECTED_PUBLIC_IP}" ]; then
    LAPTOP_IP="${_DETECTED_PUBLIC_IP}/32"
  else
    LAPTOP_IP=""
    echo "WARNING: could not detect public IP via ifconfig.me; set LAPTOP_IP=<ipv4>/32 and retry." >&2
  fi
  unset _DETECTED_PUBLIC_IP
fi

# Cloud Router and NAT Configuration
ROUTER_NAME="${USER_NAME_PREFIX}-agentic-nat-router"
NAT_NAME="${USER_NAME_PREFIX}-agentic-nat-config"

# GKE Cluster Configuration
# CLUSTER_SUFFIX may be supplied by the caller; it defaults to "c4".
: "${CLUSTER_SUFFIX:=c4}"
CLUSTER_NAME="${USER_NAME_PREFIX}-agentic-${CLUSTER_SUFFIX}"
GKE_VERSION="1.35.3-gke.1389000"
# kubectl access path: "true" uses Connect Gateway, "false" uses the direct
# public endpoint. Caller-overridable; defaults to "true".
: "${USE_CONNECT_GATEWAY:=true}"

# =========================================================================
# Machine Type Configuration (overridable via MACHINE_TYPE env var)
# =========================================================================
: "${MACHINE_TYPE:=c4-standard-8}"

# Machine family is everything before the first '-',
# e.g. "c4" from "c4-standard-8".
_MACHINE_FAMILY="${MACHINE_TYPE%%-*}"

# Disk type follows the family:
# C3 gets pd-balanced; C4/C4D/C4A (and anything else) get hyperdisk-balanced.
if [ "${_MACHINE_FAMILY}" = "c3" ]; then
  _DISK_TYPE="pd-balanced"
else
  _DISK_TYPE="hyperdisk-balanced"
fi

# Target architecture follows the family:
# C4A is arm64; every other family is amd64.
if [ "${_MACHINE_FAMILY}" = "c4a" ]; then
  _TARGET_ARCH="arm64"
else
  _TARGET_ARCH="amd64"
fi

# Control-plane (master) CIDR, one /28 per known cluster suffix because each
# private cluster needs its own range:
#   c4 -> 172.16.0.0/28    c4d     -> 172.16.0.16/28
#   c4a -> 172.16.0.32/28  c3metal -> 172.16.0.48/28
# Start from the fallback used for future/unknown suffixes, then override it
# for the known ones. Note that all unlisted suffixes share this one fallback
# range.
MASTER_IPV4_CIDR="172.16.0.64/28"
case "${CLUSTER_SUFFIX}" in
  c4)      MASTER_IPV4_CIDR="172.16.0.0/28" ;;
  c4d)     MASTER_IPV4_CIDR="172.16.0.16/28" ;;
  c4a)     MASTER_IPV4_CIDR="172.16.0.32/28" ;;
  c3metal) MASTER_IPV4_CIDR="172.16.0.48/28" ;;
esac

# Default node pool: uses the selected machine type and the disk type derived
# from its family.
DEFAULT_POOL_DISK_TYPE="${_DISK_TYPE}"
DEFAULT_POOL_MACHINE_TYPE="${MACHINE_TYPE}"
# Number of nodes in the default pool.
DEFAULT_POOL_NODE_COUNT="1"
# Disk size in GB.
DEFAULT_POOL_DISK_SIZE="50"

# =========================================================================
# Agentic Workload NodePools
# =========================================================================

# Sandbox NodePool (Python + Chromium workloads with gVisor)
SANDBOX_NODE_POOL_NAME="agentic-sandbox-pool"
SANDBOX_NODE_COUNT="1"
# Same machine type as the default pool (follows the MACHINE_TYPE override).
SANDBOX_MACHINE_TYPE="${MACHINE_TYPE}"
# Disk type is derived from the machine family.
SANDBOX_DISK_TYPE="${_DISK_TYPE}"
SANDBOX_DISK_SIZE="100"
# Raised from the default of 110 so the GKE pod limit is not the density
# ceiling.
SANDBOX_MAX_PODS_PER_NODE="250"
# Enable GKE Sandbox (gVisor) on this pool.
SANDBOX_ENABLE_GVISOR="true"

AGENT_SANDBOX_VERSION="v0.4.6"

# =========================================================================
# Workload Configuration
# =========================================================================
AGENTIC_NAMESPACE="agentic"

# Python Sandbox Workload
PYTHON_POD_NAME="python-sandbox"
PYTHON_IMAGE="python:3.11-slim"
# Start with 1; sweep for density tests.
PYTHON_REPLICAS="1"
# CPU request / limit.
PYTHON_CPU_REQUEST="1"
PYTHON_CPU_LIMIT="2"
# Memory request / limit.
PYTHON_MEMORY_REQUEST="1Gi"
PYTHON_MEMORY_LIMIT="4Gi"

# Chromium Browser Simulation Workload
# The image tag is the target architecture derived from the machine family.
CHROMIUM_POD_NAME="chromium-sandbox"
CHROMIUM_IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/agent-sandbox/chrome-sandbox:${_TARGET_ARCH}"
# Start with 1; sweep for density tests.
CHROMIUM_REPLICAS="1"

# Mock LLM Coordinator
MOCK_LLM_POD_NAME="mock-llm-coordinator"
MOCK_LLM_IMAGE="python:3.11-slim"
MOCK_LLM_PORT="8080"

# =========================================================================
# Benchmark Parameters
# =========================================================================

# Python Density Benchmark (UC-B)
# Samples per sandbox session.
SAMPLE_COUNT="20"
# Warmup samples (excluded from stats).
SAMPLE_WARMUP="0"

# Payload Transfer Benchmark (UC-D)
# Default payload size in MB.
PAYLOAD_SIZE_MB="1"
# Transfer iterations per session.
PAYLOAD_ITERATIONS="20"

# Chromium Benchmark
# Number of browser tasks per run.
CHROMIUM_TASK_COUNT="10"
CHROMIUM_WARMUP_TASKS="2"

# General
# Duration in seconds per test.
BENCHMARK_DURATION="300"
NOTE="agentic-V0-gVisor-DPv2-baseline"

# =========================================================================
# Logging
# =========================================================================
# Log directory — defaults to tmp/ inside the repo (gitignored).
# Override by setting BASE_LOG_DIR before sourcing this file,
# e.g. export BASE_LOG_DIR="$HOME/agentic-logs" to keep logs outside the repo.
#
# _REPO_ROOT is resolved as four directories above this file. CDPATH is
# cleared for the `cd` and its output is discarded: with CDPATH set, `cd` can
# print the directory it resolved, which would otherwise be captured together
# with the `pwd` output and corrupt the path.
_REPO_ROOT="$(CDPATH='' cd -- "$(dirname "${BASH_SOURCE[0]}")/../../../.." >/dev/null 2>&1 && pwd)"
BASE_LOG_DIR="${BASE_LOG_DIR:-${_REPO_ROOT}/tmp/agentic-logs}"
WRAPPER_LOG_DIR="${BASE_LOG_DIR}/wrapper_logs"

LOG_PATH="logs"
LOG_LEVEL="info"

# =========================================================================
# ADK Agent Deployment
# =========================================================================
GOOGLE_GENAI_USE_VERTEXAI="true"
# Artifact Registry repository name.
ADK_REPO_NAME="adk-repo"
# Container image name.
ADK_IMAGE_NAME="adk-agent"
# Full image reference; the tag is the derived target architecture.
ADK_IMAGE_PATH="${REGION}-docker.pkg.dev/${PROJECT_ID}/${ADK_REPO_NAME}/${ADK_IMAGE_NAME}:${_TARGET_ARCH}"
# Kubernetes service account for the agent.
ADK_K8S_SA="adk-agent-sa"
# Service account for Cloud Build submissions.
CLOUD_BUILD_SA="adk-cloud-build-sa"

# Sandbox Router & Warm Pool
SANDBOX_ROUTER_IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/agent-sandbox/sandbox-router:${_TARGET_ARCH}"
# Number of pre-warmed sandbox pods.
WARMPOOL_REPLICAS="2"

# =========================================================================
# Pod Snapshot Configuration (UC-A: Cold Start & Snapshot Pressure Test)
# =========================================================================
# Enable the pod snapshots feature on the cluster.
ENABLE_POD_SNAPSHOTS="true"
# Snapshot bucket; the name is suffixed with the project ID.
SNAPSHOTS_BUCKET_NAME="agent-sandbox-snapshots-${PROJECT_ID}"
# Managed folder inside the bucket.
SNAPSHOT_FOLDER="benchmark-snapshots"
# KSA for snapshot storage access.
SNAPSHOT_KSA_NAME="pod-snapshot-sa"
# Default memory preload for snapshot sizing.
SNAPSHOT_PRELOAD_MB="10"
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Native PKB Provision Config for Agentic Benchmarks
# Used with --gke_provision_mode=native
#
# Prerequisites (run once before PKB):
# python tools/agentic-benchmark/scripts/prerequisite_setup.py \

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This tool isn't being included & therefore this comment doesn't need to be here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, removed stale references in new commit.

# --project_id=<project> --machine_type=<machine>
#
# IMPORTANT: Do NOT pass --gce_subnet_name on the command line.
# PKB incorrectly resolves it as the --network value. Instead, pass the
# subnet via --gke_additional_flags on the command line.
#
# Usage (provision):
# python pkb.py --benchmarks=gke_python_density \
# --gke_provision_mode=native \
# --benchmark_config_file=k8s_agents/config/native_provision_config.yaml \
# --gce_network_name=<user>-agentic-vpc \
# --gce_subnet_region=us-central1 \
# --zone=us-central1-a \
# --project=<project> \
# --owner=<owner> \
# --container_cluster_version=1.35.3-gke.1389000 \
# --gke_additional_flags="--subnetwork=<user>-agentic-subnet,--workload-pool=<project>.svc.id.goog"
#
# For sweeps (cluster pre-exists, PKB skips provision/teardown):
# The sweep bridge injects --run_stage=run,cleanup automatically.

# Config entry for the gke_python_density benchmark (the name passed to
# --benchmarks in the usage example above).
gke_python_density:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Internally we put a lot of this info but externally it is useful.. it's probably a good addition.

# PKB flag values supplied by this config file.
flags:
# Force gcloud beta for preview features (pod snapshots)
gke_use_beta: true

# Cluster-level additional flags (appended to gcloud [beta] container clusters create)
# NOTE: --subnetwork and --workload-pool are user/project-specific.
# Pass them on the command line via --gke_additional_flags=... (comma-separated).
# NOTE(review): PKB command-line flags normally take precedence over values in
# a config file's `flags:` section — confirm that a command-line
# --gke_additional_flags is merged with this list rather than replacing it.
gke_additional_flags:
- "--enable-pod-snapshots"
- "--enable-dataplane-v2"
- "--enable-private-nodes"
- "--enable-ip-alias"
# Same /28 that gke-benchmark.conf assigns to the "c4" cluster suffix.
- "--master-ipv4-cidr=172.16.0.0/28"

# Node-pool-level additional flags (appended to gcloud container node-pools create)
gke_additional_nodepool_flags:
# Matches SANDBOX_MAX_PODS_PER_NODE in gke-benchmark.conf.
- "--max-pods-per-node=250"

# Standard PKB GKE flags
# Same version string as --container_cluster_version in the usage example above;
# quoted so it is always read as a string.
container_cluster_version: "1.35.3-gke.1389000"
gke_enable_shielded_nodes: false

# Cluster specification: a Kubernetes cluster on GCP.
container_cluster:
cloud: GCP
type: Kubernetes
# Default node pool: one c4-standard-8 node in us-central1-a with a
# hyperdisk-balanced boot disk of size 50.
vm_count: 1
vm_spec:
GCP:
machine_type: c4-standard-8
zone: us-central1-a
boot_disk_type: hyperdisk-balanced
boot_disk_size: 50
# Additional node pools, keyed by pool name.
nodepools:
# "sandbox" pool: same machine type and zone as the default pool, with a
# larger boot disk (100) and a gvisor sandbox_config.
sandbox:
vm_count: 1
vm_spec:
GCP:
machine_type: c4-standard-8
zone: us-central1-a
boot_disk_type: hyperdisk-balanced
boot_disk_size: 100
sandbox_config:
type: gvisor
Loading