Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,43 @@ To deploy a PodCliqueSet with this structure and explore the naming hierarchy th
**You control:** PodCliqueSet name, PodClique template names, PCSG template names
**Grove generates:** All resource instances with hierarchical naming

### Using Generated Names in User-Managed HPAs

Grove can create HPAs for PodCliques and PodCliqueScalingGroups when the target has autoscaling configured. If you prefer to create an HPA yourself, leave autoscaling unset for that target and use the resource name in `scaleTargetRef`.

List the PodCliqueSet and generated targets:

```bash
kubectl get pcs <pcs-name>
kubectl get pcsg -l app.kubernetes.io/part-of=<pcs-name>
kubectl get pclq -l app.kubernetes.io/part-of=<pcs-name>
```

Then use the selected target name in the HPA:

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: example-pcsg-hpa
spec:
scaleTargetRef:
apiVersion: grove.io/v1alpha1
kind: PodCliqueScalingGroup
name: <pcsg-name>
minReplicas: 1
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
```

For the whole workload, use `kind: PodCliqueSet` and the PodCliqueSet name. For standalone PodCliques, use `kind: PodClique` and the selected PodClique name. PodCliques that belong to a PodCliqueScalingGroup should be scaled through the PodCliqueScalingGroup instead.

## Key Takeaways

1. **Self-Documenting Hierarchy**: Pod names encode the complete hierarchy from PodCliqueSet → PCSG (if applicable) → PodClique → Pod, making `kubectl get pods` output immediately understandable.
Expand All @@ -177,4 +214,3 @@ Now that you understand Grove's naming scheme and best practices:
- **See it in action**: Continue to the [Hands-On Example](./03_hands-on-example.md) to deploy an example system and observe the naming hierarchy firsthand.

- **Learn programmatic discovery**: Head to the [Environment Variables guide](../03_environment-variables-for-pod-discovery/01_overview.md) to learn how to use these names programmatically for pod discovery, including how Grove injects environment variables and how to construct FQDNs for pod-to-pod communication.

6 changes: 4 additions & 2 deletions operator/internal/controller/podclique/reconcilestatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,11 @@ func mutateUpdatedReplica(pclq *grovecorev1alpha1.PodClique, existingPods []*cor
}
}

// mutateSelector creates and sets the label selector for autoscaler use when scaling is configured
// mutateSelector publishes the label selector on the PodClique /scale subresource so HPAs can
// target the PodClique. PodCliques that belong to a PodCliqueScalingGroup are scaled via the PCSG
// and must not advertise their own selector.
func mutateSelector(pcsName string, pclq *grovecorev1alpha1.PodClique) error {
if pclq.Spec.ScaleConfig == nil {
if _, isPCSGMember := pclq.Labels[apicommon.LabelPodCliqueScalingGroup]; isPCSGMember {
return nil
}
labels := lo.Assign(
Expand Down
51 changes: 51 additions & 0 deletions operator/internal/controller/podclique/reconcilestatus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
internalconstants "github.com/ai-dynamo/grove/operator/internal/constants"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
Expand Down Expand Up @@ -350,3 +351,53 @@ func TestComputeMinAvailableBreachedConditionPartialScheduleRegression(t *testin
})
}
}

// TestMutateSelector verifies the /scale selector is published for standalone PodCliques (with or
// without ScaleConfig) and suppressed for PodCliques that belong to a PodCliqueScalingGroup,
// regardless of whether the PodClique itself has ScaleConfig set.
func TestMutateSelector(t *testing.T) {
const (
pcsName = "test-pcs"
standaloneFQN = "test-pcs-0-frontend"
pcsgMemberFQN = "test-pcs-0-prefill-0-worker"
pcsgName = "test-pcs-0-prefill"
)
withScale := &grovecorev1alpha1.AutoScalingConfig{MaxReplicas: 5}

tests := []struct {
name string
pclqName string
pcsgMemberLabel string // empty == standalone
scaleConfig *grovecorev1alpha1.AutoScalingConfig
expectSelectorOK bool
}{
{name: "standalone, no ScaleConfig", pclqName: standaloneFQN, expectSelectorOK: true},
{name: "standalone, with ScaleConfig", pclqName: standaloneFQN, scaleConfig: withScale, expectSelectorOK: true},
{name: "PCSG member, no ScaleConfig", pclqName: pcsgMemberFQN, pcsgMemberLabel: pcsgName, expectSelectorOK: false},
// PCSG members are scaled through the PCSG; their own ScaleConfig (if any) must not
// resurrect the selector and re-expose them as an HPA target.
{name: "PCSG member, with ScaleConfig", pclqName: pcsgMemberFQN, pcsgMemberLabel: pcsgName, scaleConfig: withScale, expectSelectorOK: false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
labels := map[string]string{}
if tt.pcsgMemberLabel != "" {
labels[apicommon.LabelPodCliqueScalingGroup] = tt.pcsgMemberLabel
}
pclq := &grovecorev1alpha1.PodClique{
ObjectMeta: metav1.ObjectMeta{Name: tt.pclqName, Labels: labels},
Spec: grovecorev1alpha1.PodCliqueSpec{ScaleConfig: tt.scaleConfig},
}

err := mutateSelector(pcsName, pclq)
assert.NoError(t, err)
if tt.expectSelectorOK {
require.NotNil(t, pclq.Status.Selector, "selector should be populated")
assert.Contains(t, *pclq.Status.Selector, apicommon.LabelPodClique+"="+pclq.Name)
} else {
assert.Nil(t, pclq.Status.Selector, "selector should not be populated for PCSG-member PCLQ")
}
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -312,24 +312,21 @@ func (r *Reconciler) getPodCliquesPerPCSGReplica(ctx context.Context, pcsName st
return pclqsPerPCSGReplica, nil
}

// mutateSelector creates and sets the label selector for autoscaler use when scaling is configured
// mutateSelector publishes the label selector on the PodCliqueScalingGroup /scale subresource so
// HPAs can target the PCSG.
func mutateSelector(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup) error {
pcsReplicaIndex, err := k8sutils.GetPodCliqueSetReplicaIndex(pcsg.ObjectMeta)
if err != nil {
return err
}
matchingPCSGConfig, ok := lo.Find(pcs.Spec.Template.PodCliqueScalingGroupConfigs, func(pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig) bool {
_, ok := lo.Find(pcs.Spec.Template.PodCliqueScalingGroupConfigs, func(pcsgConfig grovecorev1alpha1.PodCliqueScalingGroupConfig) bool {
pcsgFQN := apicommon.GeneratePodCliqueScalingGroupName(apicommon.ResourceNameReplica{Name: pcs.Name, Replica: pcsReplicaIndex}, pcsgConfig.Name)
return pcsgFQN == pcsg.Name
})
if !ok {
// This should ideally never happen but if you find a PCSG that is not defined in PCS then just ignore it.
return nil
}
// No ScaleConfig has been defined of this PCSG, therefore there is no need to add a selector in the status.
if matchingPCSGConfig.ScaleConfig == nil {
return nil
}
labels := lo.Assign(
apicommon.GetDefaultLabelsForPodCliqueSetManagedResources(pcs.Name),
map[string]string{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"testing"
"time"

apicommon "github.com/ai-dynamo/grove/operator/api/common"
"github.com/ai-dynamo/grove/operator/api/common/constants"
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
internalconstants "github.com/ai-dynamo/grove/operator/internal/constants"
Expand Down Expand Up @@ -859,3 +860,62 @@ func assertCondition(t *testing.T, pcsg *grovecorev1alpha1.PodCliqueScalingGroup
isBreached := condition.Status == metav1.ConditionTrue
assert.Equal(t, expectBreached, isBreached, "condition breach status mismatch")
}

// TestMutateSelector verifies the /scale selector is published for PCSGs regardless of whether
// ScaleConfig is set in the parent PodCliqueSet template, and is suppressed for PCSGs whose name
// does not match any config in the PodCliqueSet template.
func TestMutateSelector(t *testing.T) {
const (
pcsName = "test-pcs"
pcsgConfigName = "prefill"
)
pcsgFQN := apicommon.GeneratePodCliqueScalingGroupName(apicommon.ResourceNameReplica{Name: pcsName, Replica: 0}, pcsgConfigName)
withScale := &grovecorev1alpha1.AutoScalingConfig{MaxReplicas: 5}

tests := []struct {
name string
pcsgName string // FQN to put on the PCSG; defaults to pcsgFQN
pcsgConfigScaleConfig *grovecorev1alpha1.AutoScalingConfig
expectSelectorPopulated bool
}{
{name: "no ScaleConfig still publishes selector", expectSelectorPopulated: true},
{name: "ScaleConfig present publishes selector", pcsgConfigScaleConfig: withScale, expectSelectorPopulated: true},
// PCSG whose name does not match any config in the PCS template (e.g. stale object
// during a rename) must not publish a selector.
{name: "unknown PCSG name does not publish selector", pcsgName: "test-pcs-0-stale", expectSelectorPopulated: false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
pcs := &grovecorev1alpha1.PodCliqueSet{
ObjectMeta: metav1.ObjectMeta{Name: pcsName},
Spec: grovecorev1alpha1.PodCliqueSetSpec{
Template: grovecorev1alpha1.PodCliqueSetTemplateSpec{
PodCliqueScalingGroupConfigs: []grovecorev1alpha1.PodCliqueScalingGroupConfig{
{Name: pcsgConfigName, ScaleConfig: tt.pcsgConfigScaleConfig},
},
},
},
}
name := tt.pcsgName
if name == "" {
name = pcsgFQN
}
pcsg := &grovecorev1alpha1.PodCliqueScalingGroup{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{apicommon.LabelPodCliqueSetReplicaIndex: "0"},
},
}

err := mutateSelector(pcs, pcsg)
require.NoError(t, err)
if tt.expectSelectorPopulated {
require.NotNil(t, pcsg.Status.Selector)
assert.Contains(t, *pcsg.Status.Selector, apicommon.LabelPodCliqueScalingGroup+"="+pcsg.Name)
} else {
assert.Nil(t, pcsg.Status.Selector)
}
})
}
}
21 changes: 21 additions & 0 deletions operator/internal/controller/podcliqueset/reconcilestatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"slices"
"strconv"

apicommon "github.com/ai-dynamo/grove/operator/api/common"
apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/grove/operator/internal/clustertopology"
Expand All @@ -36,6 +37,7 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand All @@ -55,6 +57,10 @@ func (r *Reconciler) reconcileStatus(ctx context.Context, logger logr.Logger, pc
return ctrlcommon.ReconcileWithErrors("failed to mutate TopologyLevelsUnavailable condition", err)
}

if err = mutateSelector(pcs); err != nil {
return ctrlcommon.ReconcileWithErrors("failed to update selector for PodCliqueSet", err)
}

// Skip the status update when every mutate* above left status byte-identical to what
// the previous reconcile already persisted. The mutators are the only code writing
// pcs.Status here, so equality means there is nothing for the apiserver to store.
Expand Down Expand Up @@ -91,6 +97,21 @@ func (r *Reconciler) mutateReplicas(ctx context.Context, logger logr.Logger, pcs
return nil
}

// mutateSelector publishes the label selector on the PodCliqueSet /scale subresource so HPAs can
// target the whole PodCliqueSet. PCS is the top-level scope, so the default labels are already
// the complete selector; no per-resource narrowing label like PodClique/PodCliqueScalingGroup is
// needed.
func mutateSelector(pcs *grovecorev1alpha1.PodCliqueSet) error {
selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
MatchLabels: apicommon.GetDefaultLabelsForPodCliqueSetManagedResources(pcs.Name),
})
if err != nil {
return fmt.Errorf("%w: failed to create label selector for PodCliqueSet %v", err, client.ObjectKeyFromObject(pcs))
}
pcs.Status.Selector = ptr.To(selector.String())
return nil
}

// pcsReplicaStats aggregates replica- and child-level update progress derived from a single
// pass over the informer cache.
type pcsReplicaStats struct {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"testing"
"time"

apicommon "github.com/ai-dynamo/grove/operator/api/common"
apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants"
configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
Expand All @@ -32,6 +33,7 @@ import (
"github.com/stretchr/testify/require"
apimeta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -882,3 +884,43 @@ func pcsgName(replicaIndex int) string {
return ""
}
}

// TestMutateSelector verifies the /scale selector is always published for PodCliqueSet, scoped to
// resources managed by Grove for this PodCliqueSet (matched by `app.kubernetes.io/managed-by` and
// `app.kubernetes.io/part-of`). It also asserts the rendered selector parses back into a usable
// label selector that matches a Pod carrying the PCS-managed default labels.
func TestMutateSelector(t *testing.T) {
tests := []struct {
name string
existingSelector *string
}{
{name: "publishes selector for fresh PodCliqueSet"},
// PCS always rewrites Status.Selector, so a stale value from a previous reconcile must
// not survive into the new status.
{name: "overwrites a stale selector", existingSelector: ptr.To("app.kubernetes.io/part-of=stale-pcs")},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
pcs := &grovecorev1alpha1.PodCliqueSet{
ObjectMeta: metav1.ObjectMeta{Name: testPCSName},
Status: grovecorev1alpha1.PodCliqueSetStatus{Selector: tt.existingSelector},
}

err := mutateSelector(pcs)
require.NoError(t, err)
require.NotNil(t, pcs.Status.Selector)

assert.Contains(t, *pcs.Status.Selector, apicommon.LabelManagedByKey+"="+apicommon.LabelManagedByValue)
assert.Contains(t, *pcs.Status.Selector, apicommon.LabelPartOfKey+"="+testPCSName)

parsed, err := labels.Parse(*pcs.Status.Selector)
require.NoError(t, err, "rendered selector must parse as a valid label selector")
podLabels := labels.Set{
apicommon.LabelManagedByKey: apicommon.LabelManagedByValue,
apicommon.LabelPartOfKey: testPCSName,
}
assert.True(t, parsed.Matches(podLabels), "selector should match a Pod carrying the PCS-managed default labels")
})
}
}
Loading