Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions perfkitbenchmarker/configs/container_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,8 @@ def __init__(
self.vm_spec: virtual_machine_spec.BaseVmSpec
self.machine_families: list[str] | None
self.sandbox_config: SandboxSpec | None
self.node_labels: dict[str, str] | None
self.node_taints: list[str] | None

@classmethod
def _GetOptionDecoderConstructions(cls):
Expand Down Expand Up @@ -273,6 +275,14 @@ def _GetOptionDecoderConstructions(cls):
),
'vm_spec': (spec.PerCloudConfigDecoder, {}),
'sandbox_config': (_SandboxDecoder, {'default': None}),
'node_labels': (
option_decoders.TypeVerifier,
{'valid_types': (dict,), 'default': None, 'none_ok': True},
),
'node_taints': (
option_decoders.TypeVerifier,
{'valid_types': (list,), 'default': None, 'none_ok': True},
),
})
return result

Expand Down
8 changes: 5 additions & 3 deletions perfkitbenchmarker/providers/aws/aws_virtual_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,10 @@ def ImportKeyfile(cls, region):
with cls._lock:
if _GetKeyfileSetKey(region) in cls.imported_keyfile_set:
return
cat_cmd = ['cat', vm_util.GetPublicKeyPath()]
keyfile, _ = vm_util.IssueRetryableCommand(cat_cmd)
# `aws ec2 import-key-pair --public-key-material` rejects a raw
# OpenSSH-formatted key with "Invalid base64" when the value comes
# in as a CLI string. The `fileb://` URI tells the CLI to read the
# file as bytes and send them through, which AWS accepts.
formatted_tags = util.FormatTagSpecifications(
'key-pair', util.MakeDefaultTags()
)
Expand All @@ -451,7 +453,7 @@ def ImportKeyfile(cls, region):
'--region=%s' % region,
'import-key-pair',
'--key-name=%s' % cls.GetKeyNameForRun(),
'--public-key-material=%s' % keyfile,
'--public-key-material=fileb://%s' % vm_util.GetPublicKeyPath(),
'--tag-specifications=%s' % formatted_tags,
]
_, stderr, retcode = vm_util.IssueCommand(
Expand Down
25 changes: 22 additions & 3 deletions perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,22 @@ def ApplyInferenceS3PvAndPvc() -> None:
logging.info('Successfully applied S3 PVC.')


def _ParseTaint(taint_str: str) -> dict[str, str]:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add the matching GKE wiring in _AddNodeParamsToCmd as well? It would help symmetric setup for label/taint-based pinning for cross-cloud benchmarks.. You could merge node_labels
into --node-labels and --node-taints from node_taints in google_kubernetes_engine.py.

"""Parses a taint string into eksctl's dict format.

Args:
taint_str: Taint in 'key=value:Effect' or 'key:Effect' format.

Returns:
Dict with 'key', 'value' (optional), and 'effect' keys for eksctl.
"""
key_value, effect = taint_str.rsplit(':', 1)
if '=' in key_value:
key, value = key_value.split('=', 1)
return {'key': key, 'value': value, 'effect': effect}
return {'key': key_value, 'effect': effect}


class BaseEksCluster(kubernetes_cluster.KubernetesCluster):
"""Shared base class for Elastic Kubernetes Service cluster auto mode & not."""

Expand Down Expand Up @@ -195,16 +211,19 @@ def _RenderNodeGroupJson(
self, nodepool: container.BaseNodePoolConfig
) -> dict[str, Any]:
"""Constructs the node group json dictionary."""
labels = {'pkb_nodepool': nodepool.name}

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will work for eksctl setup, but #6748 uses eks create-group explicitly which cannot leverage this.

@ashishsuneja could we converge on the approach?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks Ajay — agree this is worth converging on. A few things that might help:
The kubernetes_management EKS path (#6748) already sets the pkb_nodepool label on every nodegroup — in both the eksctl-config path and the create-nodegroup payload — and already selects nodes by it (jsonpath on .metadata.labels.pkb_nodepool). So the label-based selection of single-node benchmarks need already works against nodegroups it provisions.
On the divergence you flagged: #6748 uses eks create-nodegroup directly rather than eksctl because the benchmark measures the managed-nodegroup create/upgrade/delete operations themselves — eksctl wraps those in CloudFormation, which adds provisioning latency we can't attribute to the control plane. So the two paths can't share the eksctl-config label mechanism directly, but they converge on the same pkb_nodepool contract.
Your broader point — shared nodepool-provisioning helpers so single-node benchmarks can reuse the logic — is the right direction, but it spans this PR, #6744, and the existing provisioning benchmark. Could we come to a conclusion maybe by having a short discussion or chat to agree on the shared interface before reshaping any one PR? I'm happy to align to whatever shared contract we land on.

if nodepool.node_labels:
labels.update(nodepool.node_labels)
group_json = {
'name': nodepool.name,
'instanceType': nodepool.machine_type,
'desiredCapacity': nodepool.num_nodes,
'amiFamily': 'AmazonLinux2023',
'tags': util.MakeDefaultTags(),
'labels': {
'pkb_nodepool': nodepool.name,
},
'labels': labels,
}
if nodepool.node_taints:
group_json['taints'] = [_ParseTaint(t) for t in nodepool.node_taints]
if nodepool.min_nodes != nodepool.max_nodes:
group_json['minSize'] = nodepool.min_nodes
group_json['maxSize'] = nodepool.max_nodes
Expand Down
2 changes: 2 additions & 0 deletions perfkitbenchmarker/resources/container_service/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ def __init__(
self.disk_size: int = vm_spec.boot_disk_size
self.gpu_type: str | None = vm_spec.gpu_type
self.gpu_count: int | None = vm_spec.gpu_count
self.node_labels: dict[str, str] | None = None
self.node_taints: list[str] | None = None
# Defined by GceVirtualMachineConfig. Used by google_kubernetes_engine
# pylint: disable=g-missing-from-attributes
self.sandbox_config: container_spec_lib.SandboxSpec | None = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ def _InitializeNodePool(
nodepool_spec.machine_families,
)
nodepool_config.sandbox_config = nodepool_spec.sandbox_config
nodepool_config.node_labels = nodepool_spec.node_labels
nodepool_config.node_taints = nodepool_spec.node_taints
nodepool_config.zone = zone
nodepool_config.num_nodes = nodepool_spec.vm_count
if nodepool_spec.min_vm_count is None:
Expand Down
1 change: 0 additions & 1 deletion tests/providers/aws/aws_cluster_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def setUp(self):

def testCreateDependencies(self):
self.mock_issue.side_effect = [
('fake_key', None, None), # Mock 'cat key' from __init__
('', None, None), # Mock import key from ImportKeyfile
# Mock vpc creation
('''Creating CloudFormation stack...
Expand Down
41 changes: 41 additions & 0 deletions tests/providers/aws/elastic_kubernetes_service_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,47 @@ def testEksClusterNodepoolsAutoscaling(self):
self.assertEqual(node_groups[1]['maxSize'], 10)
self.assertEqual(node_groups[1]['desiredCapacity'], 3)

def testEksClusterNodepoolLabels(self):
cluster = elastic_kubernetes_service.EksCluster(EKS_SPEC)
nodepool = cluster.default_nodepool
nodepool.node_labels = {'env': 'prod', 'team': 'ml'}
actual = cluster._RenderNodeGroupJson(nodepool)
self.assertEqual(
actual['labels'],
{'pkb_nodepool': nodepool.name, 'env': 'prod', 'team': 'ml'},
)

def testEksClusterNodepoolTaints(self):
cluster = elastic_kubernetes_service.EksCluster(EKS_SPEC)
nodepool = cluster.default_nodepool
nodepool.node_taints = [
'sandbox.gke.io/runtime=runsc:NoSchedule',
'dedicated:NoExecute',
]
actual = cluster._RenderNodeGroupJson(nodepool)
self.assertEqual(
actual['taints'],
[
{
'key': 'sandbox.gke.io/runtime',
'value': 'runsc',
'effect': 'NoSchedule',
},
{'key': 'dedicated', 'effect': 'NoExecute'},
],
)


def testParseTaint(self):
self.assertEqual(
elastic_kubernetes_service._ParseTaint('key=value:NoSchedule'),
{'key': 'key', 'value': 'value', 'effect': 'NoSchedule'},
)
self.assertEqual(
elastic_kubernetes_service._ParseTaint('dedicated:NoExecute'),
{'key': 'dedicated', 'effect': 'NoExecute'},
)

def testGetNodePoolNames(self):
# Mock the output of the aws cli command
cluster = elastic_kubernetes_service.EksCluster(EKS_SPEC)
Expand Down