Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions perfkitbenchmarker/configs/container_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ def __init__(
self.vm_spec: virtual_machine_spec.BaseVmSpec
self.machine_families: list[str] | None
self.sandbox_config: SandboxSpec | None
self.max_pods_per_node: int | None
self.node_labels: dict[str, str] | None
self.node_taints: list[str] | None

@classmethod
def _GetOptionDecoderConstructions(cls):
Expand Down Expand Up @@ -273,6 +276,18 @@ def _GetOptionDecoderConstructions(cls):
),
'vm_spec': (spec.PerCloudConfigDecoder, {}),
'sandbox_config': (_SandboxDecoder, {'default': None}),
'max_pods_per_node': (
option_decoders.IntDecoder,
{'default': None, 'none_ok': True, 'min': 1},
),
'node_labels': (
option_decoders.TypeVerifier,
{'valid_types': (dict,), 'default': None, 'none_ok': True},
),
'node_taints': (
option_decoders.TypeVerifier,
{'valid_types': (list,), 'default': None, 'none_ok': True},
),
})
return result

Expand Down
95 changes: 92 additions & 3 deletions perfkitbenchmarker/providers/gcp/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,7 @@
'Whether or not we create a Confidential VM Instance',
)
GCE_CONFIDENTIAL_COMPUTE_TYPE = flags.DEFINE_string(
'gce_confidential_compute_type',
'sev',
'Type of Confidential VM Instance'
'gce_confidential_compute_type', 'sev', 'Type of Confidential VM Instance'
)
GCE_NETWORK_NAMES = flags.DEFINE_list(
'gce_network_name',
Expand Down Expand Up @@ -561,6 +559,97 @@
False,
'Whether to enable shielded nodes.',
)
# --- GKE node / control-plane networking flags ---
# gke_enable_ip_access, gke_enable_dns_access, gke_enable_private_nodes and
# gke_master_ipv4_cidr are cross-checked by _ValidateGkePrivateNodeFlags.
GKE_ENABLE_PRIVATE_NODES = flags.DEFINE_boolean(
'gke_enable_private_nodes',
False,
'Whether to create the cluster with private nodes (nodes have only internal'
' IPs).',
)
GKE_ENABLE_DNS_ACCESS = flags.DEFINE_boolean(
'gke_enable_dns_access',
False,
'Whether to enable DNS-based control plane access (replaces the'
' public/private IP endpoint model).',
)
# Note: this is the only boolean in this group that defaults to True.
GKE_ENABLE_IP_ACCESS = flags.DEFINE_boolean(
'gke_enable_ip_access',
True,
'Whether to enable IP-based control plane access. Disabling requires DNS'
' access and is mutually exclusive with public clusters (nodes with public'
' IPs).',
)
# Defaults to None (unset).
GKE_MASTER_IPV4_CIDR = flags.DEFINE_string(
'gke_master_ipv4_cidr',
None,
'CIDR range to use for the hosted master network. Required when private'
' nodes are enabled without DNS access.',
)
GKE_ENABLE_DATAPLANE_V2 = flags.DEFINE_boolean(
'gke_enable_dataplane_v2',
False,
'Whether to enable GKE Dataplane V2 (eBPF-based datapath, Cilium under the'
' hood). Requires cluster recreation; cannot be toggled on an existing'
' cluster.',
)
# --- GKE observability / cost-tracking flags ---
GKE_ENABLE_MANAGED_PROMETHEUS = flags.DEFINE_boolean(
'gke_enable_managed_prometheus',
False,
'Whether to enable Google Cloud Managed Service for Prometheus on the'
' cluster.',
)
GKE_ENABLE_COST_ALLOCATION = flags.DEFINE_boolean(
'gke_enable_cost_allocation',
False,
'Whether to enable GKE cost allocation tracking.',
)
# Defined as a single comma-separated string (not a list flag); the value is
# expected in the form shown in the help text below.
GKE_MONITORING_COMPONENTS = flags.DEFINE_string(
'gke_monitoring_components',
'SYSTEM,API_SERVER,SCHEDULER,CONTROLLER_MANAGER',
'Comma-separated list of GKE monitoring components to enable '
'(e.g. SYSTEM,API_SERVER,SCHEDULER,CONTROLLER_MANAGER,POD,DEPLOYMENT,'
'STATEFULSET,DAEMONSET,HPA,STORAGE,CADVISOR,KUBELET).',
)
# --- GKE Agent Sandbox controller (distinct from the gvisor node runtime) ---
GKE_ENABLE_AGENT_SANDBOX = flags.DEFINE_boolean(
'gke_enable_agent_sandbox',
False,
'Whether to enable the GKE Agent Sandbox controller on the cluster. '
'Installs the managed agent-sandbox controller and CRDs, enabling '
'SandboxClaim/Sandbox/SandboxWarmPool reconciliation by GKE. This is '
'separate from the gvisor sandbox runtime (--sandbox=type=gvisor on a '
'node pool). Requires GKE 1.35.2-gke.1269000 or later. See '
'https://docs.cloud.google.com/kubernetes-engine/docs/how-to/agent-sandbox.',
)


def _ValidateGkePrivateNodeFlags(flags_dict):
if (
not flags_dict['gke_enable_ip_access']
and not flags_dict['gke_enable_dns_access']
):
raise flags.ValidationError(
'--no-gke_enable_ip_access requires --gke_enable_dns_access.'
)
if (
flags_dict['gke_enable_private_nodes']
and not flags_dict['gke_enable_dns_access']
and not flags_dict['gke_master_ipv4_cidr']
):
raise flags.ValidationError(
'--gke_enable_private_nodes without --gke_enable_dns_access requires'
' --gke_master_ipv4_cidr.'
)
return True


# Hook the cross-flag check above up to the four flags it reads. The
# validator is handed a dict keyed by these flag names, which is why the
# names listed here must match the keys _ValidateGkePrivateNodeFlags indexes.
flags.register_multi_flags_validator(
[
'gke_enable_ip_access',
'gke_enable_dns_access',
'gke_enable_private_nodes',
'gke_master_ipv4_cidr',
],
_ValidateGkePrivateNodeFlags,
)
GKE_ADDONS = flags.DEFINE_string(
'gke_addons',
'',
Expand Down
63 changes: 63 additions & 0 deletions perfkitbenchmarker/providers/gcp/gce_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,7 @@ class GceNetwork(network.BaseNetwork):
def __init__(self, network_spec: GceNetworkSpec):
super().__init__(network_spec)
self.project: str | None = network_spec.project
self._zone: str = network_spec.zone
self.vpn_gateway: Dict[str, GceVpnGateway] = {}

# Figuring out the type of network here.
Expand Down Expand Up @@ -1231,6 +1232,60 @@ def _GetNumberVms(self) -> int:
for group_spec in benchmark_spec.config.vm_groups.values()
)

def _CreateCloudNat(self):
  """Creates a Cloud Router and a Cloud NAT attached to it.

  The router and NAT are named after primary_subnet_name and placed in the
  region derived from this network's zone. Both gcloud calls are issued with
  raise_on_failure=False: a failure whose stderr mentions 'already exists'
  is ignored, and any other failure is only logged as a warning.
  """
  base_name = self.primary_subnet_name
  cloud_region = util.GetRegionFromZone(self._zone)
  cloud_router = f'{base_name}-router'
  cloud_nat = f'{base_name}-nat'

  # Step 1: the router the NAT will hang off.
  create_router = util.GcloudCommand(
      self, 'compute', 'routers', 'create', cloud_router
  )
  create_router.flags['network'] = base_name
  create_router.flags['region'] = cloud_region
  # The region is given explicitly, so drop any zone flag the command carries.
  create_router.flags.pop('zone', None)
  _, router_stderr, router_rc = create_router.Issue(raise_on_failure=False)
  if router_rc and 'already exists' not in router_stderr:
    logging.warning('Cloud Router create failed: %s', router_stderr)

  # Step 2: the NAT itself, with auto-allocated external IPs covering every
  # subnet IP range.
  create_nat = util.GcloudCommand(
      self, 'compute', 'routers', 'nats', 'create', cloud_nat
  )
  create_nat.flags['router'] = cloud_router
  create_nat.flags['region'] = cloud_region
  create_nat.flags.pop('zone', None)
  for switch in (
      '--auto-allocate-nat-external-ips',
      '--nat-all-subnet-ip-ranges',
  ):
    create_nat.args.append(switch)
  _, nat_stderr, nat_rc = create_nat.Issue(raise_on_failure=False)
  if nat_rc and 'already exists' not in nat_stderr:
    logging.warning('Cloud NAT create failed: %s', nat_stderr)

def _DeleteCloudNat(self):
  """Deletes the Cloud NAT and then its Cloud Router, ignoring failures.

  Names and region are derived the same way as at creation time (from
  primary_subnet_name and the network's zone). Both gcloud calls use
  raise_on_failure=False and their results are discarded.
  """
  base_name = self.primary_subnet_name
  cloud_region = util.GetRegionFromZone(self._zone)
  cloud_router = f'{base_name}-router'
  cloud_nat = f'{base_name}-nat'

  # The NAT is removed first, then the router it is attached to.
  delete_nat = util.GcloudCommand(
      self, 'compute', 'routers', 'nats', 'delete', cloud_nat
  )
  delete_nat.flags['router'] = cloud_router
  delete_nat.flags['region'] = cloud_region
  # The region is given explicitly, so drop any zone flag the command carries.
  delete_nat.flags.pop('zone', None)
  delete_nat.Issue(raise_on_failure=False)

  delete_router = util.GcloudCommand(
      self, 'compute', 'routers', 'delete', cloud_router
  )
  delete_router.flags['region'] = cloud_region
  delete_router.flags.pop('zone', None)
  delete_router.Issue(raise_on_failure=False)

def Create(self):
"""Creates the actual network."""
if not self.is_existing_network:
Expand All @@ -1244,6 +1299,8 @@ def Create(self):
lambda rule: self.external_nets_rules[rule].Create(),
list(self.external_nets_rules.keys()),
)
if gcp_flags.GKE_ENABLE_PRIVATE_NODES.value:
self._CreateCloudNat()
if getattr(self, 'vpn_gateway', False):
background_tasks.RunThreaded(
lambda gateway: self.vpn_gateway[gateway].Create(),
Expand All @@ -1257,6 +1314,12 @@ def Delete(self):
if self.placement_group:
self.placement_group.Delete()
if not self.is_existing_network:
# Always attempt NAT+router cleanup: both gcloud calls use
# raise_on_failure=False so this is a no-op when none was created.
# Checking the live flag here is wrong because teardown runs from a
# restored pickle and the flag may default to False even when a NAT
# was created at provision time.
self._DeleteCloudNat()
if getattr(self, 'vpn_gateway', False):
background_tasks.RunThreaded(
lambda gateway: self.vpn_gateway[gateway].Delete(),
Expand Down
52 changes: 50 additions & 2 deletions perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,37 @@ def _RunClusterCreateCommand(self, cmd: util.GcloudCommand):
)
cmd.flags['release-channel'] = self.release_channel

if gcp_flags.GKE_ENABLE_PRIVATE_NODES.value:
cmd.args.append('--enable-private-nodes')
# GKE requires VPC-native (alias IPs) when private nodes are enabled.
# Without this gcloud rejects the create with:
# Cannot specify --enable-private-nodes without --enable-ip-alias.
cmd.args.append('--enable-ip-alias')
else:
cmd.args.append('--no-enable-private-nodes')
if gcp_flags.GKE_ENABLE_DNS_ACCESS.value:
cmd.args.append('--enable-dns-access')
else:
cmd.args.append('--no-enable-dns-access')
if gcp_flags.GKE_ENABLE_IP_ACCESS.value:
cmd.args.append('--enable-ip-access')
else:
cmd.args.append('--no-enable-ip-access')
if gcp_flags.GKE_ENABLE_DATAPLANE_V2.value:
cmd.args.append('--enable-dataplane-v2')
if gcp_flags.GKE_ENABLE_AGENT_SANDBOX.value:
cmd.args.append('--enable-agent-sandbox')
if gcp_flags.GKE_MASTER_IPV4_CIDR.value:
cmd.flags['master-ipv4-cidr'] = gcp_flags.GKE_MASTER_IPV4_CIDR.value

if FLAGS.gke_enable_alpha:
cmd.args.append('--enable-kubernetes-alpha')
cmd.args.append('--no-enable-autorepair')
cmd.flags['monitoring'] = 'SYSTEM,API_SERVER,SCHEDULER,CONTROLLER_MANAGER'
cmd.flags['monitoring'] = gcp_flags.GKE_MONITORING_COMPONENTS.value
if gcp_flags.GKE_ENABLE_MANAGED_PROMETHEUS.value:
cmd.args.append('--enable-managed-prometheus')
if gcp_flags.GKE_ENABLE_COST_ALLOCATION.value:
cmd.args.append('--enable-cost-allocation')

user = util.GetDefaultUser()
if FLAGS.gcp_service_account:
Expand Down Expand Up @@ -209,6 +236,10 @@ def _GetKubeconfig(self):
cmd = self._GcloudCommand(
'container', 'clusters', 'get-credentials', self.name
)
if gcp_flags.GKE_ENABLE_DNS_ACCESS.value:
# Private-node clusters are unreachable via the IP endpoint; use the
# DNS-based control plane endpoint instead.
cmd.args.append('--dns-endpoint')
env = os.environ.copy()
env['KUBECONFIG'] = FLAGS.kubeconfig
cmd.IssueRetryable(env=env)
Expand Down Expand Up @@ -377,6 +408,10 @@ def _Create(self):
cmd = self._GcloudCommand('container', 'clusters', 'create', self.name)
if self.default_nodepool.network:
cmd.flags['network'] = self.default_nodepool.network.network_resource.name
if self.default_nodepool.network.subnet_resource:
cmd.flags['subnetwork'] = (
self.default_nodepool.network.subnet_resource.name
)

if gcp_flags.GKE_ENABLE_SHIELDED_NODES.value:
cmd.args.append('--enable-shielded-nodes')
Expand Down Expand Up @@ -576,10 +611,19 @@ def _AddNodeParamsToCmd(
if nodepool_config.sandbox_config is not None:
cmd.flags['sandbox'] = nodepool_config.sandbox_config.ToSandboxFlag()

if nodepool_config.max_pods_per_node is not None:
cmd.flags['max-pods-per-node'] = nodepool_config.max_pods_per_node

if self.image_type:
cmd.flags['image-type'] = self.image_type

cmd.flags['node-labels'] = f'pkb_nodepool={nodepool_config.name}'
labels = {}
if nodepool_config.node_labels:
labels.update(nodepool_config.node_labels)
labels['pkb_nodepool'] = nodepool_config.name
cmd.flags['node-labels'] = ','.join(f'{k}={v}' for k, v in labels.items())
if nodepool_config.node_taints:
cmd.flags['node-taints'] = ','.join(nodepool_config.node_taints)
if nodepool_config.min_nodes != nodepool_config.max_nodes:
cmd.args.append('--enable-autoscaling')
cmd.flags['min-nodes'] = nodepool_config.min_nodes
Expand Down Expand Up @@ -673,6 +717,10 @@ def _Create(self):
)
if self.default_nodepool.network:
cmd.flags['network'] = self.default_nodepool.network.network_resource.name
if self.default_nodepool.network.subnet_resource:
cmd.flags['subnetwork'] = (
self.default_nodepool.network.subnet_resource.name
)
cmd.flags['labels'] = util.MakeFormattedDefaultTags()

if self.enable_aam:
Expand Down
3 changes: 3 additions & 0 deletions perfkitbenchmarker/resources/container_service/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ def __init__(
# Defined by GceVirtualMachineConfig. Used by google_kubernetes_engine
# pylint: disable=g-missing-from-attributes
self.sandbox_config: container_spec_lib.SandboxSpec | None = None
self.node_labels: dict[str, str] | None = None
self.node_taints: list[str] | None = None
self.max_pods_per_node: int | None = None
self.max_local_disks: int | None
self.ssd_interface: str | None
self.threads_per_core: int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ def _InitializeNodePool(
nodepool_spec.machine_families,
)
nodepool_config.sandbox_config = nodepool_spec.sandbox_config
nodepool_config.node_labels = nodepool_spec.node_labels
nodepool_config.node_taints = nodepool_spec.node_taints
nodepool_config.max_pods_per_node = nodepool_spec.max_pods_per_node
nodepool_config.zone = zone
nodepool_config.num_nodes = nodepool_spec.vm_count
if nodepool_spec.min_vm_count is None:
Expand Down
Loading