diff --git a/Makefile b/Makefile index fc3cccd84..d66c7c6bc 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ generate-api-docs: $(CRD_REF_DOCS) .PHONY: test-unit test-unit: @echo "> Running tests for operator/api" - @cd operator/api && go test ./... + @make --directory=operator/api test-unit @echo "> Running tests for operator" @make --directory=operator test-unit @echo "> Running tests for operator/client" @@ -109,12 +109,16 @@ test-unit: .PHONY: test-cover test-cover: + @echo "> Running tests with coverage for operator/api" + @make --directory=operator/api test-cover @echo "> Running tests with coverage for operator" @make --directory=operator test-cover # Generates HTML coverage reports for the entire codebase (all modules) .PHONY: cover-html cover-html: + @echo "> Generating HTML coverage report for operator/api" + @make --directory=operator/api cover-html @echo "> Generating HTML coverage report for operator" @make --directory=operator cover-html diff --git a/operator/api/Makefile b/operator/api/Makefile index 52df7a653..0bc1e089e 100644 --- a/operator/api/Makefile +++ b/operator/api/Makefile @@ -35,3 +35,22 @@ generate: $(CONTROLLER_GEN) .PHONY: lint lint: $(GOLANGCI_LINT) @$(GOLANGCI_LINT) run -c $(REPO_ROOT)/.golangci.yaml ./... + +# Make targets for tests +# ------------------------------------------------------------- + +# Run all unit tests +.PHONY: test-unit +test-unit: + @go test ./... + +# Run all unit tests with code coverage +.PHONY: test-cover +test-cover: + @go test ./... -coverprofile=coverage.out + +# Generate HTML coverage report +.PHONY: cover-html +cover-html: test-cover + @go tool cover -html=coverage.out -o coverage.html + @echo "Coverage report generated at coverage.html" diff --git a/operator/api/common/constants/constants.go b/operator/api/common/constants/constants.go index 873efa875..a6a4718fc 100644 --- a/operator/api/common/constants/constants.go +++ b/operator/api/common/constants/constants.go @@ -47,6 +47,8 @@ const ( // AnnotationDisableManagedResourceProtection is an annotation set by an operator on a PodCliqueSet to explicitly // disable protection of managed resources for a PodCliqueSet. AnnotationDisableManagedResourceProtection = "grove.io/disable-managed-resource-protection" + // AnnotationReconcileTrigger is an annotation set on a PodCliqueSet to explicitly trigger a reconcile without changing its spec. + AnnotationReconcileTrigger = "grove.io/reconcile-trigger" // AnnotationTopologyName is an annotation set on PodGang to allow KAI scheduler to discover which topology to use. AnnotationTopologyName = "grove.io/topology-name" ) diff --git a/operator/e2e/diagnostics/collector.go b/operator/e2e/diagnostics/collector.go index 3c2904248..b0cb99e87 100644 --- a/operator/e2e/diagnostics/collector.go +++ b/operator/e2e/diagnostics/collector.go @@ -483,6 +483,9 @@ func createDiagnosticsFile(testName, diagDir string) (*os.File, string, error) { if diagDir != "" { filename := filepath.Join(diagDir, baseFilename) + if err := os.MkdirAll(diagDir, 0755); err != nil { + return nil, "", fmt.Errorf("failed to create diagnostics directory %s: %w", diagDir, err) + } file, err := os.Create(filename) if err != nil { return nil, "", fmt.Errorf("failed to create diagnostics file in %s: %w", diagDir, err) diff --git a/operator/e2e/grove/workload/workload.go b/operator/e2e/grove/workload/workload.go index b048c9bb3..940d15660 100644 --- a/operator/e2e/grove/workload/workload.go +++ b/operator/e2e/grove/workload/workload.go @@ -23,6 +23,7 @@ import ( "fmt" "time" + "github.com/ai-dynamo/grove/operator/api/common/constants" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" "github.com/ai-dynamo/grove/operator/e2e/grove/gvk" "github.com/ai-dynamo/grove/operator/e2e/k8s/k8sclient" @@ -79,7 +80,7 @@ func (wm *WorkloadManager) TriggerPCSReconcile(ctx context.Context, namespace, n Namespace: namespace, }, } - patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{"grove.io/reconcile-trigger":%q}}}`, triggerID)) + patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, constants.AnnotationReconcileTrigger, triggerID)) if err := wm.cl.Patch(ctx, pcs, client.RawPatch(types.MergePatchType, patch)); err != nil { return fmt.Errorf("trigger PCS reconcile %s/%s: %w", namespace, name, err) } diff --git a/operator/e2e/testctx/context.go b/operator/e2e/testctx/context.go index 81a5a1ad1..3ce08daa4 100644 --- a/operator/e2e/testctx/context.go +++ b/operator/e2e/testctx/context.go @@ -190,7 +190,12 @@ func (tc *TestContext) GetLabelSelector() string { // ListPods lists pods matching the current workload's label selector. func (tc *TestContext) ListPods() (*v1.PodList, error) { - return tc.newPodManager().List(tc.Ctx, tc.Namespace, tc.GetLabelSelector()) + return tc.ListPodsWithContext(tc.Ctx) +} + +// ListPodsWithContext lists pods matching the current workload's label selector using ctx. +func (tc *TestContext) ListPodsWithContext(ctx context.Context) (*v1.PodList, error) { + return tc.newPodManager().List(ctx, tc.Namespace, tc.GetLabelSelector()) } // WaitForPods waits for the expected pod count to be ready. diff --git a/operator/e2e/tests/update/legacy_hash_migration_test.go b/operator/e2e/tests/update/legacy_hash_migration_test.go new file mode 100644 index 000000000..90804fd82 --- /dev/null +++ b/operator/e2e/tests/update/legacy_hash_migration_test.go @@ -0,0 +1,730 @@ +//go:build e2e + +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package update + +import ( + "context" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "testing" + "time" + + apicommon "github.com/ai-dynamo/grove/operator/api/common" + grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + "github.com/ai-dynamo/grove/operator/e2e/grove/workload" + "github.com/ai-dynamo/grove/operator/e2e/k8s/pods" + "github.com/ai-dynamo/grove/operator/e2e/setup" + "github.com/ai-dynamo/grove/operator/e2e/testctx" + "github.com/ai-dynamo/grove/operator/e2e/tests" + "github.com/ai-dynamo/grove/operator/e2e/waiter" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" + "gopkg.in/yaml.v3" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/watch" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + legacyHashMigrationWorkloadName = "legacy-hash-migration" + alpha8Version = "v0.1.0-alpha.8" + alpha8ChartRef = "oci://ghcr.io/ai-dynamo/grove/grove-charts" + alpha8ImageRepository = "ghcr.io/ai-dynamo/grove/grove-operator" + alpha8CRDImageRepository = "ghcr.io/ai-dynamo/grove/grove-install-crds" + alpha8InitImageRepository = "ghcr.io/ai-dynamo/grove/grove-initc" +) + +// Test_LegacyHashMigration_UpgradesAlpha8HashesWithoutPodRecreation verifies +// that resources created by v0.1.0-alpha.8 with legacy hashes are migrated to +// canonical hashes by this branch without recreating workload pods. +func Test_LegacyHashMigration_UpgradesAlpha8HashesWithoutPodRecreation(t *testing.T) { + ctx := context.Background() + tc, clusterCleanup := testctx.PrepareTest(ctx, t, 2, + testctx.WithWorkload(&testctx.WorkloadConfig{ + Name: legacyHashMigrationWorkloadName, + YAMLPath: "../../yaml/workload-legacy-hash-migration.yaml", + Namespace: "default", + ExpectedPods: 2, + }), + ) + + localChartDir, err := setup.GetGroveChartDir() + if err != nil { + clusterCleanup() + t.Fatalf("failed to locate local Grove chart: %v", err) + } + localChartVersion, err := readChartVersion(localChartDir) + if err != nil { + clusterCleanup() + t.Fatalf("failed to read local Grove chart version: %v", err) + } + localValues, err := captureCurrentOperatorImageValues(ctx, tc.Client) + if err != nil { + clusterCleanup() + t.Fatalf("failed to capture current operator image values: %v", err) + } + + usingLocalOperator := true + var tracker *updateTracker + defer func() { + if tracker != nil { + tracker.stop() + } + if !usingLocalOperator { + if restoreErr := upgradeOperator(ctx, tc, localChartDir, localChartVersion, true, localValues); restoreErr != nil { + t.Errorf("failed to restore local Grove operator: %v", restoreErr) + } + } + clusterCleanup() + }() + + tests.Logger.Info("1. Downgrade Grove operator to v0.1.0-alpha.8") + if err := upgradeOperator(ctx, tc, alpha8ChartRef, alpha8Version, false, alpha8HelmValues()); err != nil { + t.Fatalf("failed to downgrade Grove operator to %s: %v", alpha8Version, err) + } + usingLocalOperator = false + + tests.Logger.Info("2. Deploy workload and verify alpha.8 stored legacy hashes") + if _, err := tc.ApplyYAMLFile(tc.Workload.YAMLPath); err != nil { + t.Fatalf("failed to apply compatibility workload: %v", err) + } + if err := tc.WaitForPods(tc.Workload.ExpectedPods); err != nil { + t.Fatalf("failed to wait for compatibility workload pods: %v", err) + } + if err := waitForHashState(tc, assertLegacyHashState); err != nil { + t.Fatalf("workload did not converge to expected legacy hash state: %v", err) + } + + before, err := capturePodIdentities(tc) + if err != nil { + t.Fatalf("failed to capture pod identities before upgrade: %v", err) + } + + tests.Logger.Info("3. Watch workload pods, then upgrade Grove operator back to this branch") + tracker = newUpdateTracker() + if err := tracker.start(tc); err != nil { + t.Fatalf("failed to start pod update tracker: %v", err) + } + if err := upgradeOperator(ctx, tc, localChartDir, localChartVersion, true, localValues); err != nil { + t.Fatalf("failed to upgrade Grove operator back to local branch: %v", err) + } + usingLocalOperator = true + + tests.Logger.Info("4. Trigger a no-op PodCliqueSet reconcile and wait for canonical hash migration") + workloadManager := workload.NewWorkloadManager(tc.Client, tests.Logger) + if err := workloadManager.TriggerPCSReconcile(tc.Ctx, tc.Namespace, tc.Workload.Name, fmt.Sprintf("legacy-hash-migration-%d", time.Now().UnixNano())); err != nil { + t.Fatalf("failed to trigger no-op PCS reconcile: %v", err) + } + if err := waitForHashState(tc, assertCanonicalHashState); err != nil { + t.Fatalf("workload did not migrate to canonical hash state: %v", err) + } + if err := tc.WaitForPods(tc.Workload.ExpectedPods); err != nil { + t.Fatalf("failed to wait for pods after hash migration: %v", err) + } + + tracker.stop() + events := tracker.getEvents() + verifyNoPodReplacementsDuringHashMigration(t, events, before) + if err := verifyPodIdentitiesUnchanged(tc, before); err != nil { + t.Fatalf("pod identities changed during hash migration: %v", err) + } + + tests.Logger.Info("Legacy hash migration upgrade test completed successfully!") +} + +// chartYAML captures the Chart.yaml fields needed to restore the local operator. +type chartYAML struct { + Version string `yaml:"version"` +} + +// readChartVersion returns the Helm chart version from the local chart directory. +func readChartVersion(chartDir string) (string, error) { + data, err := os.ReadFile(filepath.Join(chartDir, "Chart.yaml")) + if err != nil { + return "", err + } + var chart chartYAML + if err := yaml.Unmarshal(data, &chart); err != nil { + return "", err + } + if chart.Version == "" { + return "", fmt.Errorf("version missing from Chart.yaml") + } + return chart.Version, nil +} + +// alpha8HelmValues pins every image that alpha.8 expects so the downgrade uses +// the released operator, CRD installer, and init container together. +func alpha8HelmValues() map[string]interface{} { + return map[string]interface{}{ + "image": map[string]interface{}{ + "repository": alpha8ImageRepository, + "tag": alpha8Version, + "pullPolicy": string(corev1.PullIfNotPresent), + }, + "crdInstaller": map[string]interface{}{ + "image": map[string]interface{}{ + "repository": alpha8CRDImageRepository, + "tag": alpha8Version, + "pullPolicy": string(corev1.PullIfNotPresent), + }, + }, + "deployment": map[string]interface{}{ + "env": []map[string]string{ + { + "name": "GROVE_INIT_CONTAINER_IMAGE", + "value": alpha8InitImageRepository, + }, + }, + }, + } +} + +// upgradeOperator installs the requested chart version and waits until the +// operator deployment is ready to reconcile workloads. +func upgradeOperator(ctx context.Context, tc *testctx.TestContext, chartRef, chartVersion string, reuseValues bool, values map[string]interface{}) error { + _, err := setup.UpgradeHelmChart(&setup.HelmInstallConfig{ + RestConfig: tc.Client.RestConfig, + ReleaseName: setup.OperatorDeploymentName, + ChartRef: chartRef, + ChartVersion: chartVersion, + Namespace: setup.OperatorNamespace, + Wait: true, + ReuseValues: reuseValues, + Values: values, + Timeout: 3 * time.Minute, + HelmLoggerFunc: tests.Logger.Debugf, + Logger: tests.Logger, + }) + if err != nil { + return err + } + return waitForOperatorReady(ctx, tc) +} + +// waitForOperatorReady blocks until the Grove operator pod is ready after a Helm upgrade. +func waitForOperatorReady(ctx context.Context, tc *testctx.TestContext) error { + podManager := pods.NewPodManager(tc.Client, tests.Logger) + return podManager.WaitForReady(ctx, []string{setup.OperatorNamespace}, setup.OperatorPodLabelSelector, 1, 3*time.Minute, 5*time.Second) +} + +// captureCurrentOperatorImageValues records the locally built operator images so +// the test can restore this branch after downgrading to alpha.8. +func captureCurrentOperatorImageValues(ctx context.Context, cl client.Client) (map[string]interface{}, error) { + var deployment appsv1.Deployment + if err := cl.Get(ctx, types.NamespacedName{Namespace: setup.OperatorNamespace, Name: setup.OperatorDeploymentName}, &deployment); err != nil { + return nil, err + } + + operatorContainer, ok := findContainer(deployment.Spec.Template.Spec.Containers, "grove-operator") + if !ok { + return nil, fmt.Errorf("grove-operator container not found in deployment") + } + imageValues, err := imageValueMap(operatorContainer.Image, operatorContainer.ImagePullPolicy) + if err != nil { + return nil, fmt.Errorf("parse operator image %q: %w", operatorContainer.Image, err) + } + + values := map[string]interface{}{ + "image": imageValues, + } + + if crdInstaller, ok := findContainer(deployment.Spec.Template.Spec.InitContainers, "crd-installer"); ok { + crdInstallerImage, err := imageValueMap(crdInstaller.Image, crdInstaller.ImagePullPolicy) + if err != nil { + return nil, fmt.Errorf("parse crd-installer image %q: %w", crdInstaller.Image, err) + } + values["crdInstaller"] = map[string]interface{}{ + "enabled": true, + "image": crdInstallerImage, + } + } + + if initImage, ok := findEnvValue(operatorContainer.Env, "GROVE_INIT_CONTAINER_IMAGE"); ok { + values["deployment"] = map[string]interface{}{ + "env": []map[string]string{ + { + "name": "GROVE_INIT_CONTAINER_IMAGE", + "value": initImage, + }, + }, + } + } + + return values, nil +} + +// findContainer returns the named container from a Kubernetes container slice. +func findContainer(containers []corev1.Container, name string) (corev1.Container, bool) { + for _, container := range containers { + if container.Name == name { + return container, true + } + } + return corev1.Container{}, false +} + +// findEnvValue returns an environment variable value from a container env slice. +func findEnvValue(env []corev1.EnvVar, name string) (string, bool) { + for _, item := range env { + if item.Name == name { + return item.Value, true + } + } + return "", false +} + +// imageValueMap converts a deployed image reference into the Helm values schema +// used by the Grove operator chart. +func imageValueMap(image string, pullPolicy corev1.PullPolicy) (map[string]interface{}, error) { + repository, tag, err := splitImageForHelm(image) + if err != nil { + return nil, err + } + return map[string]interface{}{ + "repository": repository, + "tag": tag, + "pullPolicy": string(pullPolicy), + }, nil +} + +// splitImageForHelm separates an image reference into repository and tag values +// while preserving digest-qualified tags when present. +func splitImageForHelm(image string) (string, string, error) { + base, digest, hasDigest := strings.Cut(image, "@") + tagSeparator := strings.LastIndex(base, ":") + lastSlash := strings.LastIndex(base, "/") + if tagSeparator > lastSlash { + tag := base[tagSeparator+1:] + if hasDigest { + tag += "@" + digest + } + return base[:tagSeparator], tag, nil + } + if hasDigest { + return base, digest, nil + } + return "", "", fmt.Errorf("image has no tag or digest") +} + +// podIdentity captures the stable pod fields that must survive hash migration. +type podIdentity struct { + uid types.UID + restartCounts map[string]int32 +} + +// capturePodIdentities records pod UIDs and container restart counts before migration. +func capturePodIdentities(tc *testctx.TestContext) (map[string]podIdentity, error) { + podList, err := tc.ListPods() + if err != nil { + return nil, err + } + identities := make(map[string]podIdentity, len(podList.Items)) + for _, pod := range podList.Items { + identities[pod.Name] = podIdentity{ + uid: pod.UID, + restartCounts: restartCountsByContainer(pod), + } + } + return identities, nil +} + +// restartCountsByContainer returns restart counts keyed by container name. +func restartCountsByContainer(pod corev1.Pod) map[string]int32 { + counts := make(map[string]int32, len(pod.Status.ContainerStatuses)) + for _, status := range pod.Status.ContainerStatuses { + counts[status.Name] = status.RestartCount + } + return counts +} + +// verifyNoPodReplacementsDuringHashMigration fails if migration deletes existing +// workload pods or adds replacement pods. +func verifyNoPodReplacementsDuringHashMigration(t *testing.T, events []podEvent, before map[string]podIdentity) { + t.Helper() + + var deleted []string + var added []string + for _, event := range events { + switch event.eventType { + case watch.Deleted: + if _, existed := before[event.pod.Name]; existed { + deleted = append(deleted, event.pod.Name) + } + case watch.Added: + if _, existed := before[event.pod.Name]; !existed { + added = append(added, event.pod.Name) + } + } + } + if len(deleted) > 0 || len(added) > 0 { + t.Fatalf("hash migration recreated workload pods: deleted=%v added=%v", deleted, added) + } +} + +// verifyPodIdentitiesUnchanged verifies that no pod UID or container restart +// count changed while legacy hashes were migrated. +func verifyPodIdentitiesUnchanged(tc *testctx.TestContext, before map[string]podIdentity) error { + after, err := capturePodIdentities(tc) + if err != nil { + return err + } + if len(after) != len(before) { + return fmt.Errorf("pod count changed: before=%d after=%d", len(before), len(after)) + } + for podName, beforeIdentity := range before { + afterIdentity, ok := after[podName] + if !ok { + return fmt.Errorf("pod %s missing after migration", podName) + } + if afterIdentity.uid != beforeIdentity.uid { + return fmt.Errorf("pod %s UID changed from %s to %s", podName, beforeIdentity.uid, afterIdentity.uid) + } + for containerName, beforeRestarts := range beforeIdentity.restartCounts { + afterRestarts, ok := afterIdentity.restartCounts[containerName] + if !ok { + return fmt.Errorf("pod %s container %s missing after migration", podName, containerName) + } + if afterRestarts != beforeRestarts { + return fmt.Errorf("pod %s container %s restart count changed from %d to %d", podName, containerName, beforeRestarts, afterRestarts) + } + } + } + return nil +} + +// hashAssertion validates a fetched hash state while the waiter polls. +type hashAssertion func(*hashState) error + +// hashState is a point-in-time snapshot of resources whose stored hashes must +// transition from legacy to canonical values. +type hashState struct { + pcs *grovev1alpha1.PodCliqueSet + pclqs []grovev1alpha1.PodClique + pcsgs []grovev1alpha1.PodCliqueScalingGroup + pods []corev1.Pod + pcsHash componentutils.HashCandidates + pclqHashes map[string]componentutils.HashCandidates +} + +// waitForHashState polls until the supplied assertion accepts the observed +// resource hashes, returning the last snapshot when timing out. +func waitForHashState(tc *testctx.TestContext, assertion hashAssertion) error { + var lastAssertionErr error + var lastState *hashState + fetch := waiter.FetchFunc[*hashState](func(ctx context.Context) (*hashState, error) { + state, err := fetchHashState(ctx, tc) + if err != nil { + return nil, err + } + lastState = state + if err := assertion(state); err != nil { + lastAssertionErr = err + tests.Logger.Debugf("[legacy-hash-migration] hash state not ready: %v", err) + return nil, nil + } + lastAssertionErr = nil + return state, nil + }) + w := waiter.New[*hashState](). + WithTimeout(3 * time.Minute). + WithInterval(5 * time.Second). + WithRetryOnError(). + WithLogger(tests.Logger) + _, err := w.WaitFor(tc.Ctx, fetch, func(state *hashState) bool { return state != nil }) + if err != nil { + if lastAssertionErr != nil { + return fmt.Errorf("%w; last assertion mismatch: %v; last observed hash state:\n%s", err, lastAssertionErr, formatHashStateSnapshot(lastState)) + } + if lastState != nil { + return fmt.Errorf("%w; last observed hash state:\n%s", err, formatHashStateSnapshot(lastState)) + } + } + return err +} + +// fetchHashState reads the PodCliqueSet and managed resources, then computes the +// canonical and legacy hash candidates expected for the current desired spec. +func fetchHashState(ctx context.Context, tc *testctx.TestContext) (*hashState, error) { + pcs := &grovev1alpha1.PodCliqueSet{} + if err := tc.Client.Get(ctx, types.NamespacedName{Namespace: tc.Namespace, Name: tc.Workload.Name}, pcs); err != nil { + return nil, err + } + + var pclqList grovev1alpha1.PodCliqueList + if err := tc.Client.List(ctx, &pclqList, client.InNamespace(tc.Namespace), client.MatchingLabels(apicommon.GetDefaultLabelsForPodCliqueSetManagedResources(tc.Workload.Name))); err != nil { + return nil, err + } + var pcsgList grovev1alpha1.PodCliqueScalingGroupList + if err := tc.Client.List(ctx, &pcsgList, client.InNamespace(tc.Namespace), client.MatchingLabels(apicommon.GetDefaultLabelsForPodCliqueSetManagedResources(tc.Workload.Name))); err != nil { + return nil, err + } + podList, err := tc.ListPodsWithContext(ctx) + if err != nil { + return nil, err + } + + pclqHashes := make(map[string]componentutils.HashCandidates, len(pclqList.Items)) + for _, pclq := range pclqList.Items { + candidates, err := componentutils.GetExpectedPCLQPodTemplateHashCandidates(pcs, pclq.ObjectMeta) + if err != nil { + return nil, err + } + pclqHashes[pclq.Name] = candidates + } + + return &hashState{ + pcs: pcs, + pclqs: pclqList.Items, + pcsgs: pcsgList.Items, + pods: podList.Items, + pcsHash: componentutils.ComputePCSGenerationHashCandidates(pcs), + pclqHashes: pclqHashes, + }, nil +} + +// formatHashStateSnapshot renders the last observed hash state for timeout errors. +func formatHashStateSnapshot(state *hashState) string { + if state == nil { + return "hash state was not fetched successfully" + } + + var b strings.Builder + fmt.Fprintf(&b, "PCS %s currentGenerationHash=%s expectedPCS={canonical:%s legacy:%s} updateProgress=%t replicas=%d updated=%d available=%d\n", + state.pcs.Name, + stringValue(state.pcs.Status.CurrentGenerationHash), + state.pcsHash.Canonical, + state.pcsHash.Legacy, + state.pcs.Status.UpdateProgress != nil, + state.pcs.Status.Replicas, + state.pcs.Status.UpdatedReplicas, + state.pcs.Status.AvailableReplicas) + + pclqs := append([]grovev1alpha1.PodClique(nil), state.pclqs...) + sort.Slice(pclqs, func(i, j int) bool { return pclqs[i].Name < pclqs[j].Name }) + for _, pclq := range pclqs { + candidates := state.pclqHashes[pclq.Name] + fmt.Fprintf(&b, "PCLQ %s labelHash=%s currentPodTemplateHash=%s currentPCSGenerationHash=%s expectedTemplate={canonical:%s legacy:%s} updateProgress=%t replicas=%d ready=%d updated=%d\n", + pclq.Name, + labelValue(pclq.Labels, apicommon.LabelPodTemplateHash), + stringValue(pclq.Status.CurrentPodTemplateHash), + stringValue(pclq.Status.CurrentPodCliqueSetGenerationHash), + candidates.Canonical, + candidates.Legacy, + pclq.Status.UpdateProgress != nil, + pclq.Status.Replicas, + pclq.Status.ReadyReplicas, + pclq.Status.UpdatedReplicas) + } + + pcsgs := append([]grovev1alpha1.PodCliqueScalingGroup(nil), state.pcsgs...) + sort.Slice(pcsgs, func(i, j int) bool { return pcsgs[i].Name < pcsgs[j].Name }) + for _, pcsg := range pcsgs { + fmt.Fprintf(&b, "PCSG %s currentPCSGenerationHash=%s expectedPCS={canonical:%s legacy:%s} updateProgress=%t replicas=%d scheduled=%d available=%d updated=%d\n", + pcsg.Name, + stringValue(pcsg.Status.CurrentPodCliqueSetGenerationHash), + state.pcsHash.Canonical, + state.pcsHash.Legacy, + pcsg.Status.UpdateProgress != nil, + pcsg.Status.Replicas, + pcsg.Status.ScheduledReplicas, + pcsg.Status.AvailableReplicas, + pcsg.Status.UpdatedReplicas) + } + + pods := append([]corev1.Pod(nil), state.pods...) + sort.Slice(pods, func(i, j int) bool { return pods[i].Name < pods[j].Name }) + for _, pod := range pods { + fmt.Fprintf(&b, "Pod %s pclq=%s labelHash=%s phase=%s ready=%t node=%s\n", + pod.Name, + labelValue(pod.Labels, apicommon.LabelPodClique), + labelValue(pod.Labels, apicommon.LabelPodTemplateHash), + pod.Status.Phase, + isPodReady(pod), + pod.Spec.NodeName) + } + + return strings.TrimSpace(b.String()) +} + +// stringValue formats optional status hash fields in diagnostic output. +func stringValue(value *string) string { + if value == nil { + return "" + } + return *value +} + +// labelValue formats optional label values in diagnostic output. +func labelValue(labels map[string]string, key string) string { + if labels == nil { + return "" + } + value, ok := labels[key] + if !ok { + return "" + } + return value +} + +// isPodReady reports whether the pod Ready condition is true. +func isPodReady(pod corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { + return true + } + } + return false +} + +// assertLegacyHashState verifies that alpha.8 populated every stored hash with +// the legacy value for the desired spec. +func assertLegacyHashState(state *hashState) error { + if err := requireLegacyHash("PCS status currentGenerationHash", state.pcs.Status.CurrentGenerationHash, state.pcsHash); err != nil { + return err + } + if len(state.pclqs) != 2 { + return fmt.Errorf("expected 2 PodCliques, got %d", len(state.pclqs)) + } + for _, pclq := range state.pclqs { + candidates := state.pclqHashes[pclq.Name] + if err := requireLegacyHash(fmt.Sprintf("PCLQ %s label", pclq.Name), stringPtrFromMap(pclq.Labels, apicommon.LabelPodTemplateHash), candidates); err != nil { + return err + } + if err := requireLegacyHash(fmt.Sprintf("PCLQ %s status currentPodTemplateHash", pclq.Name), pclq.Status.CurrentPodTemplateHash, candidates); err != nil { + return err + } + if err := requireLegacyHash(fmt.Sprintf("PCLQ %s status currentPodCliqueSetGenerationHash", pclq.Name), pclq.Status.CurrentPodCliqueSetGenerationHash, state.pcsHash); err != nil { + return err + } + } + if len(state.pcsgs) != 1 { + return fmt.Errorf("expected 1 PodCliqueScalingGroup, got %d", len(state.pcsgs)) + } + for _, pcsg := range state.pcsgs { + if err := requireLegacyHash(fmt.Sprintf("PCSG %s status currentPodCliqueSetGenerationHash", pcsg.Name), pcsg.Status.CurrentPodCliqueSetGenerationHash, state.pcsHash); err != nil { + return err + } + } + if len(state.pods) != 2 { + return fmt.Errorf("expected 2 pods, got %d", len(state.pods)) + } + for _, pod := range state.pods { + pclqName := pod.Labels[apicommon.LabelPodClique] + candidates, ok := state.pclqHashes[pclqName] + if !ok { + return fmt.Errorf("pod %s references unexpected PodClique %q", pod.Name, pclqName) + } + if err := requireLegacyHash(fmt.Sprintf("Pod %s label", pod.Name), stringPtrFromMap(pod.Labels, apicommon.LabelPodTemplateHash), candidates); err != nil { + return err + } + } + return nil +} + +// assertCanonicalHashState verifies that this branch migrated every stored hash +// to the canonical value without starting a rolling update. +func assertCanonicalHashState(state *hashState) error { + if err := requireCanonicalHash("PCS status currentGenerationHash", state.pcs.Status.CurrentGenerationHash, state.pcsHash); err != nil { + return err + } + if state.pcs.Status.UpdateProgress != nil { + return fmt.Errorf("PCS UpdateProgress was initialized during legacy hash migration") + } + if len(state.pclqs) != 2 { + return fmt.Errorf("expected 2 PodCliques, got %d", len(state.pclqs)) + } + for _, pclq := range state.pclqs { + candidates := state.pclqHashes[pclq.Name] + if err := requireCanonicalHash(fmt.Sprintf("PCLQ %s label", pclq.Name), stringPtrFromMap(pclq.Labels, apicommon.LabelPodTemplateHash), candidates); err != nil { + return err + } + if err := requireCanonicalHash(fmt.Sprintf("PCLQ %s status currentPodTemplateHash", pclq.Name), pclq.Status.CurrentPodTemplateHash, candidates); err != nil { + return err + } + if err := requireCanonicalHash(fmt.Sprintf("PCLQ %s status currentPodCliqueSetGenerationHash", pclq.Name), pclq.Status.CurrentPodCliqueSetGenerationHash, state.pcsHash); err != nil { + return err + } + } + if len(state.pcsgs) != 1 { + return fmt.Errorf("expected 1 PodCliqueScalingGroup, got %d", len(state.pcsgs)) + } + for _, pcsg := range state.pcsgs { + if err := requireCanonicalHash(fmt.Sprintf("PCSG %s status currentPodCliqueSetGenerationHash", pcsg.Name), pcsg.Status.CurrentPodCliqueSetGenerationHash, state.pcsHash); err != nil { + return err + } + } + if len(state.pods) != 2 { + return fmt.Errorf("expected 2 pods, got %d", len(state.pods)) + } + for _, pod := range state.pods { + pclqName := pod.Labels[apicommon.LabelPodClique] + candidates, ok := state.pclqHashes[pclqName] + if !ok { + return fmt.Errorf("pod %s references unexpected PodClique %q", pod.Name, pclqName) + } + if err := requireCanonicalHash(fmt.Sprintf("Pod %s label", pod.Name), stringPtrFromMap(pod.Labels, apicommon.LabelPodTemplateHash), candidates); err != nil { + return err + } + } + return nil +} + +// requireLegacyHash asserts that a stored hash equals the legacy candidate and +// not the canonical candidate. +func requireLegacyHash(name string, value *string, candidates componentutils.HashCandidates) error { + if candidates.Canonical == candidates.Legacy { + return fmt.Errorf("%s canonical and legacy hashes unexpectedly match: %s", name, candidates.Canonical) + } + if value == nil { + return fmt.Errorf("%s is nil", name) + } + if *value != candidates.Legacy { + return fmt.Errorf("%s = %s, want legacy %s and not canonical %s", name, *value, candidates.Legacy, candidates.Canonical) + } + return nil +} + +// requireCanonicalHash asserts that a stored hash equals the canonical candidate. +func requireCanonicalHash(name string, value *string, candidates componentutils.HashCandidates) error { + if candidates.Canonical == candidates.Legacy { + return fmt.Errorf("%s canonical and legacy hashes unexpectedly match: %s", name, candidates.Canonical) + } + if value == nil { + return fmt.Errorf("%s is nil", name) + } + if *value != candidates.Canonical { + return fmt.Errorf("%s = %s, want canonical %s", name, *value, candidates.Canonical) + } + return nil +} + +// stringPtrFromMap returns a pointer to a map value so hash assertion helpers can +// distinguish missing labels from present empty labels. +func stringPtrFromMap(values map[string]string, key string) *string { + value, ok := values[key] + if !ok { + return nil + } + return &value +} diff --git a/operator/e2e/tests/update/utils.go b/operator/e2e/tests/update/utils.go index 0471ee55e..433e0b3cb 100644 --- a/operator/e2e/tests/update/utils.go +++ b/operator/e2e/tests/update/utils.go @@ -381,8 +381,7 @@ func waitForRollingUpdateComplete(tc *testctx.TestContext, expectedReplicas int3 w := waiter.New[*grovev1alpha1.PodCliqueSet](). WithTimeout(tc.Timeout). WithInterval(tc.Interval) - err := w.WaitUntil(tc.Ctx, fetchPCS, predicate) - return err + return w.WaitUntil(tc.Ctx, fetchPCS, predicate) } // scalePodCliqueInPCS scales all PodClique instances for a given clique name across all PCS replicas. diff --git a/operator/e2e/waiter/waiter.go b/operator/e2e/waiter/waiter.go index 6efe1c6ab..261769d8d 100644 --- a/operator/e2e/waiter/waiter.go +++ b/operator/e2e/waiter/waiter.go @@ -131,6 +131,11 @@ func (w *Waiter[T]) WaitFor(ctx context.Context, fetchFn FetchFunc[T], predicate case <-ticker.C: } } + select { + case <-timeoutCtx.Done(): + return zero, fmt.Errorf("condition not met within timeout of %v", w.timeout) + default: + } result, err := fetchFn(timeoutCtx) if err != nil { diff --git a/operator/e2e/yaml/workload-legacy-hash-migration.yaml b/operator/e2e/yaml/workload-legacy-hash-migration.yaml new file mode 100644 index 000000000..c2674418f --- /dev/null +++ b/operator/e2e/yaml/workload-legacy-hash-migration.yaml @@ -0,0 +1,94 @@ +# Workload for the legacy-to-canonical hash migration upgrade test. +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: legacy-hash-migration + labels: + app: legacy-hash-migration +spec: + replicas: 1 + template: + cliques: + - name: z-standalone + labels: + compat.grove.io/hash-role: standalone + kai.scheduler/queue: test + spec: + roleName: standalone + replicas: 1 + minAvailable: 1 + podSpec: + terminationGracePeriodSeconds: 5 + containers: + - name: sidecar + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + volumeMounts: + - name: shared + mountPath: /shared + - name: cache + mountPath: /cache + resources: + requests: + memory: 16Mi + - name: main + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + volumeMounts: + - name: shared + mountPath: /shared + - name: cache + mountPath: /cache + resources: + requests: + memory: 16Mi + volumes: + - name: shared + emptyDir: {} + - name: cache + emptyDir: {} + - name: a-grouped + labels: + compat.grove.io/hash-role: grouped + kai.scheduler/queue: test + spec: + roleName: grouped + replicas: 1 + minAvailable: 1 + podSpec: + terminationGracePeriodSeconds: 5 + containers: + - name: sidecar + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + volumeMounts: + - name: shared + mountPath: /shared + - name: cache + mountPath: /cache + resources: + requests: + memory: 16Mi + - name: main + image: registry:5001/busybox:latest + command: ["sleep", "infinity"] + volumeMounts: + - name: shared + mountPath: /shared + - name: cache + mountPath: /cache + resources: + requests: + memory: 16Mi + volumes: + - name: shared + emptyDir: {} + - name: cache + emptyDir: {} + podCliqueScalingGroups: + - name: sg + replicas: 1 + minAvailable: 1 + cliqueNames: + - a-grouped diff --git a/operator/internal/controller/common/component/utils/hash.go b/operator/internal/controller/common/component/utils/hash.go new file mode 100644 index 000000000..70010c49f --- /dev/null +++ b/operator/internal/controller/common/component/utils/hash.go @@ -0,0 +1,145 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package utils + +import ( + "sort" + + grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" + + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// HashCandidates captures the two hashes accepted during the canonical-hash +// transition window: the canonical hash emitted by this version, and the +// legacy hash emitted by v0.1.0-alpha.8 for the same desired spec. +type HashCandidates struct { + Canonical string + Legacy string +} + +// Matches returns true when storedHash is either the canonical hash or the +// legacy hash for the current desired spec. +func (h HashCandidates) Matches(storedHash string) bool { + return storedHash == h.Canonical || storedHash == h.Legacy +} + +// IsLegacy returns true when storedHash is the legacy hash and differs from the +// canonical hash. When both hash functions produce the same value, there is +// nothing to migrate. +func (h HashCandidates) IsLegacy(storedHash string) bool { + return h.Legacy != h.Canonical && storedHash == h.Legacy +} + +// ComputePCLQPodTemplateHashLegacy computes the legacy pod-template hash for a +// PodClique template. It preserves the exact pre-canonical behavior by hashing +// the PodTemplateSpec without sorting Kubernetes API map-list fields. +func ComputePCLQPodTemplateHashLegacy(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, priorityClassName string) string { + return k8sutils.ComputeHashLegacy(newPCLQPodTemplateSpecForHash(pclqTemplateSpec, priorityClassName)) +} + +// ComputePCLQPodTemplateHashCandidates returns the canonical and legacy +// pod-template hashes for the current desired PodClique template. +func ComputePCLQPodTemplateHashCandidates(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, priorityClassName string) HashCandidates { + return HashCandidates{ + Canonical: ComputePCLQPodTemplateHash(pclqTemplateSpec, priorityClassName), + Legacy: ComputePCLQPodTemplateHashLegacy(pclqTemplateSpec, priorityClassName), + } +} + +// GetExpectedPCLQPodTemplateHashCandidates finds the matching +// PodCliqueTemplateSpec from the PodCliqueSet and computes its hash candidates. +func GetExpectedPCLQPodTemplateHashCandidates(pcs *grovecorev1alpha1.PodCliqueSet, pclqObjectMeta metav1.ObjectMeta) (HashCandidates, error) { + pclqTemplateSpec, err := GetExpectedPCLQTemplateSpec(pcs, pclqObjectMeta) + if err != nil { + return HashCandidates{}, err + } + return ComputePCLQPodTemplateHashCandidates(pclqTemplateSpec, pcs.Spec.Template.PriorityClassName), nil +} + +// ComputePCSGenerationHash calculates the canonical PodCliqueSet generation +// hash. AnyOrder and Explicit startup modes sort cliques by name because the +// slice represents a name-keyed map-list. InOrder preserves clique order and +// includes the clique names as order keys because the sequence is part of the +// startup contract. +func ComputePCSGenerationHash(pcs *grovecorev1alpha1.PodCliqueSet) string { + preserveCliqueOrder := isInOrderStartup(pcs) + cliquesForHash := append([]*grovecorev1alpha1.PodCliqueTemplateSpec(nil), pcs.Spec.Template.Cliques...) + if !preserveCliqueOrder { + sort.SliceStable(cliquesForHash, func(i, j int) bool { + return cliquesForHash[i].Name < cliquesForHash[j].Name + }) + } + + podTemplateSpecs := podTemplateSpecsForPCLQTemplates(cliquesForHash, pcs.Spec.Template.PriorityClassName) + if preserveCliqueOrder { + orderKeys := lo.Map(cliquesForHash, func(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, _ int) string { + return pclqTemplateSpec.Name + }) + return k8sutils.ComputeHashWithOrderKeys(orderKeys, podTemplateSpecs...) + } + + return k8sutils.ComputeHash(podTemplateSpecs...) +} + +// ComputePCSGenerationHashLegacy calculates the v0.1.0-alpha.8 PodCliqueSet +// generation hash: cliques are hashed in stored slice order and each +// PodTemplateSpec is fed directly into the legacy hash function. +func ComputePCSGenerationHashLegacy(pcs *grovecorev1alpha1.PodCliqueSet) string { + podTemplateSpecs := podTemplateSpecsForPCLQTemplates(pcs.Spec.Template.Cliques, pcs.Spec.Template.PriorityClassName) + return k8sutils.ComputeHashLegacy(podTemplateSpecs...) +} + +// ComputePCSGenerationHashCandidates returns the canonical and legacy +// generation hashes for the current desired PodCliqueSet spec. +func ComputePCSGenerationHashCandidates(pcs *grovecorev1alpha1.PodCliqueSet) HashCandidates { + return HashCandidates{ + Canonical: ComputePCSGenerationHash(pcs), + Legacy: ComputePCSGenerationHashLegacy(pcs), + } +} + +func podTemplateSpecsForPCLQTemplates(pclqTemplateSpecs []*grovecorev1alpha1.PodCliqueTemplateSpec, priorityClassName string) []*corev1.PodTemplateSpec { + return lo.Map(pclqTemplateSpecs, func(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, _ int) *corev1.PodTemplateSpec { + return newPCLQPodTemplateSpecForHash(pclqTemplateSpec, priorityClassName) + }) +} + +func newPCLQPodTemplateSpecForHash(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, priorityClassName string) *corev1.PodTemplateSpec { + podTemplateSpec := &corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: pclqTemplateSpec.Labels, + Annotations: pclqTemplateSpec.Annotations, + }, + Spec: pclqTemplateSpec.Spec.PodSpec, + } + podTemplateSpec.Spec.PriorityClassName = priorityClassName + return podTemplateSpec +} + +// isInOrderStartup returns true when the original clique slice order is part +// of the desired state. AnyOrder and Explicit are order-independent here. +func isInOrderStartup(pcs *grovecorev1alpha1.PodCliqueSet) bool { + st := grovecorev1alpha1.CliqueStartupTypeAnyOrder + if pcs.Spec.Template.StartupType != nil { + st = *pcs.Spec.Template.StartupType + } + return st == grovecorev1alpha1.CliqueStartupTypeInOrder +} diff --git a/operator/internal/controller/common/component/utils/hash_test.go b/operator/internal/controller/common/component/utils/hash_test.go new file mode 100644 index 000000000..b980df83d --- /dev/null +++ b/operator/internal/controller/common/component/utils/hash_test.go @@ -0,0 +1,196 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package utils + +import ( + "testing" + + apicommon "github.com/ai-dynamo/grove/operator/api/common" + grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" +) + +func TestHashCandidates(t *testing.T) { + t.Run("matches_canonical_or_legacy_hash", func(t *testing.T) { + candidates := HashCandidates{ + Canonical: "canonical-hash", + Legacy: "legacy-hash", + } + + assert.True(t, candidates.Matches("canonical-hash")) + assert.True(t, candidates.Matches("legacy-hash")) + assert.False(t, candidates.Matches("other-hash")) + assert.False(t, candidates.IsLegacy("canonical-hash")) + assert.True(t, candidates.IsLegacy("legacy-hash")) + assert.False(t, candidates.IsLegacy("other-hash")) + }) + + t.Run("legacy_is_false_when_hashes_are_equal", func(t *testing.T) { + candidates := HashCandidates{ + Canonical: "same-hash", + Legacy: "same-hash", + } + + assert.True(t, candidates.Matches("same-hash")) + assert.False(t, candidates.IsLegacy("same-hash")) + }) +} + +func TestComputePCLQPodTemplateHashCandidates(t *testing.T) { + template := hashTestPCLQTemplate("worker", corev1.Container{Name: "sidecar"}, corev1.Container{Name: "main"}) + + candidates := ComputePCLQPodTemplateHashCandidates(template, "high-priority") + + assert.Equal(t, ComputePCLQPodTemplateHash(template, "high-priority"), candidates.Canonical) + assert.Equal(t, ComputePCLQPodTemplateHashLegacy(template, "high-priority"), candidates.Legacy) + assert.NotEqual(t, candidates.Canonical, candidates.Legacy, + "the fixture stores a name-keyed slice out of canonical order so the legacy compatibility hash should diverge") +} + +func TestGetExpectedPCLQPodTemplateHashCandidates(t *testing.T) { + pcs := hashTestPCSWithCliques(nil, + hashTestPCLQTemplate("worker", corev1.Container{Name: "sidecar"}, corev1.Container{Name: "main"}), + ) + pcs.Spec.Template.PriorityClassName = "high-priority" + + candidates, err := GetExpectedPCLQPodTemplateHashCandidates(pcs, hashTestStandalonePCLQObjectMeta(pcs, "worker")) + require.NoError(t, err) + assert.Equal(t, ComputePCLQPodTemplateHashCandidates(pcs.Spec.Template.Cliques[0], "high-priority"), candidates) + assert.NotEqual(t, + ComputePCLQPodTemplateHashCandidates(pcs.Spec.Template.Cliques[0], "").Canonical, + candidates.Canonical, + "PCS priorityClassName must be included in the expected pod-template hash") + + _, err = GetExpectedPCLQPodTemplateHashCandidates(pcs, hashTestStandalonePCLQObjectMeta(pcs, "missing")) + assert.Error(t, err) +} + +func TestComputePCSGenerationHash_StartupOrderPolicy(t *testing.T) { + tests := []struct { + name string + startup *grovecorev1alpha1.CliqueStartupType + wantEqual bool + }{ + { + name: "nil_startup_defaults_to_any_order", + startup: nil, + wantEqual: true, + }, + { + name: "any_order_sorts_cliques_by_name", + startup: ptr.To(grovecorev1alpha1.CliqueStartupTypeAnyOrder), + wantEqual: true, + }, + { + name: "explicit_startup_sorts_cliques_by_name", + startup: ptr.To(grovecorev1alpha1.CliqueStartupTypeExplicit), + wantEqual: true, + }, + { + name: "in_order_preserves_clique_sequence", + startup: ptr.To(grovecorev1alpha1.CliqueStartupTypeInOrder), + wantEqual: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + first := hashTestPCSWithIdenticalCliqueSpecs(tc.startup, "app", "database") + reordered := hashTestPCSWithIdenticalCliqueSpecs(tc.startup, "database", "app") + + hashFirst := ComputePCSGenerationHash(first) + hashReordered := ComputePCSGenerationHash(reordered) + if tc.wantEqual { + assert.Equal(t, hashFirst, hashReordered) + } else { + assert.NotEqual(t, hashFirst, hashReordered, + "InOrder hashes must include clique names as order keys even when the pod templates are otherwise identical") + } + }) + } +} + +func TestComputePCSGenerationHashLegacy_PreservesStoredCliqueOrder(t *testing.T) { + first := hashTestPCSWithCliques(nil, + hashTestPCLQTemplate("app", corev1.Container{Name: "main", Image: "app:v1"}), + hashTestPCLQTemplate("database", corev1.Container{Name: "main", Image: "database:v1"}), + ) + reordered := hashTestPCSWithCliques(nil, + hashTestPCLQTemplate("database", corev1.Container{Name: "main", Image: "database:v1"}), + hashTestPCLQTemplate("app", corev1.Container{Name: "main", Image: "app:v1"}), + ) + + assert.Equal(t, ComputePCSGenerationHash(first), ComputePCSGenerationHash(reordered), + "canonical default startup hashes should sort cliques by name") + assert.NotEqual(t, ComputePCSGenerationHashLegacy(first), ComputePCSGenerationHashLegacy(reordered), + "legacy hashes preserve the stored clique slice order for compatibility with v0.1.0-alpha.8") + + candidates := ComputePCSGenerationHashCandidates(reordered) + assert.Equal(t, ComputePCSGenerationHash(reordered), candidates.Canonical) + assert.Equal(t, ComputePCSGenerationHashLegacy(reordered), candidates.Legacy) +} + +func hashTestPCLQTemplate(name string, containers ...corev1.Container) *grovecorev1alpha1.PodCliqueTemplateSpec { + return &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: name, + Spec: grovecorev1alpha1.PodCliqueSpec{ + PodSpec: corev1.PodSpec{ + Containers: containers, + }, + }, + } +} + +func hashTestPCSWithIdenticalCliqueSpecs(startup *grovecorev1alpha1.CliqueStartupType, cliqueNames ...string) *grovecorev1alpha1.PodCliqueSet { + cliques := make([]*grovecorev1alpha1.PodCliqueTemplateSpec, 0, len(cliqueNames)) + for _, cliqueName := range cliqueNames { + cliques = append(cliques, hashTestPCLQTemplate(cliqueName, corev1.Container{Name: "main", Image: "worker:v1"})) + } + return hashTestPCSWithCliques(startup, cliques...) +} + +func hashTestPCSWithCliques(startup *grovecorev1alpha1.CliqueStartupType, cliques ...*grovecorev1alpha1.PodCliqueTemplateSpec) *grovecorev1alpha1.PodCliqueSet { + return &grovecorev1alpha1.PodCliqueSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pcs", + Namespace: "default", + }, + Spec: grovecorev1alpha1.PodCliqueSetSpec{ + Replicas: 1, + Template: grovecorev1alpha1.PodCliqueSetTemplateSpec{ + StartupType: startup, + Cliques: cliques, + }, + }, + } +} + +func hashTestStandalonePCLQObjectMeta(pcs *grovecorev1alpha1.PodCliqueSet, cliqueName string) metav1.ObjectMeta { + return metav1.ObjectMeta{ + Name: apicommon.GeneratePodCliqueName(apicommon.ResourceNameReplica{Name: pcs.Name, Replica: 0}, cliqueName), + Namespace: pcs.Namespace, + Labels: map[string]string{ + apicommon.LabelPartOfKey: pcs.Name, + apicommon.LabelPodCliqueSetReplicaIndex: "0", + }, + } +} diff --git a/operator/internal/controller/common/component/utils/podclique.go b/operator/internal/controller/common/component/utils/podclique.go index 971631560..d2f5eea8f 100644 --- a/operator/internal/controller/common/component/utils/podclique.go +++ b/operator/internal/controller/common/component/utils/podclique.go @@ -29,7 +29,6 @@ import ( k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" "github.com/samber/lo" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -143,15 +142,7 @@ func groupPCLQsByLabel(pclqs []grovecorev1alpha1.PodClique, labelKey string) map // ComputePCLQPodTemplateHash computes the pod template hash for the PCLQ pod spec. func ComputePCLQPodTemplateHash(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, priorityClassName string) string { - podTemplateSpec := corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: pclqTemplateSpec.Labels, - Annotations: pclqTemplateSpec.Annotations, - }, - Spec: pclqTemplateSpec.Spec.PodSpec, - } - podTemplateSpec.Spec.PriorityClassName = priorityClassName - return k8sutils.ComputeHash(&podTemplateSpec) + return k8sutils.ComputeHash(newPCLQPodTemplateSpecForHash(pclqTemplateSpec, priorityClassName)) } // IsPCLQAutoUpdateInProgress checks if PodClique is under an auto-orchestrated update. @@ -168,15 +159,24 @@ func IsLastPCLQUpdateCompleted(pclq *grovecorev1alpha1.PodClique) bool { // GetExpectedPCLQPodTemplateHash finds the matching PodCliqueTemplateSpec from the PodCliqueSet and computes the pod template hash for the PCLQ pod spec. func GetExpectedPCLQPodTemplateHash(pcs *grovecorev1alpha1.PodCliqueSet, pclqObjectMeta metav1.ObjectMeta) (string, error) { - cliqueName, err := utils.GetPodCliqueNameFromPodCliqueFQN(pclqObjectMeta) + pclqTemplateSpec, err := GetExpectedPCLQTemplateSpec(pcs, pclqObjectMeta) if err != nil { return "", err } + return ComputePCLQPodTemplateHash(pclqTemplateSpec, pcs.Spec.Template.PriorityClassName), nil +} + +// GetExpectedPCLQTemplateSpec finds the matching PodCliqueTemplateSpec from the PodCliqueSet for a PodClique ObjectMeta. +func GetExpectedPCLQTemplateSpec(pcs *grovecorev1alpha1.PodCliqueSet, pclqObjectMeta metav1.ObjectMeta) (*grovecorev1alpha1.PodCliqueTemplateSpec, error) { + cliqueName, err := utils.GetPodCliqueNameFromPodCliqueFQN(pclqObjectMeta) + if err != nil { + return nil, err + } matchingPCLQTemplateSpec := FindPodCliqueTemplateSpecByName(pcs, cliqueName) if matchingPCLQTemplateSpec == nil { - return "", fmt.Errorf("pod clique template not found for cliqueName: %s", cliqueName) + return nil, fmt.Errorf("pod clique template not found for cliqueName: %s", cliqueName) } - return ComputePCLQPodTemplateHash(matchingPCLQTemplateSpec, pcs.Spec.Template.PriorityClassName), nil + return matchingPCLQTemplateSpec, nil } // FindPodCliqueTemplateSpecByName retrieves the PodCliqueTemplateSpec from the PodCliqueSet by its name. diff --git a/operator/internal/controller/common/component/utils/podclique_test.go b/operator/internal/controller/common/component/utils/podclique_test.go index a03df6e89..f0374993a 100644 --- a/operator/internal/controller/common/component/utils/podclique_test.go +++ b/operator/internal/controller/common/component/utils/podclique_test.go @@ -25,6 +25,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -412,3 +413,475 @@ func TestIsLastPCLQUpdateCompleted(t *testing.T) { }) } } + +// TestComputePCLQPodTemplateHash_PodSpecSliceOrderInvariants pins the +// per-PCLQ pod-template hash behavior with respect to slice order in the +// embedded PodSpec. +// +// The fix at internal/utils/kubernetes/pod.go canonicalizes name-keyed +// (+listType=map) slices before hashing so that two PodSpecs that represent +// the same desired state always produce the same hash, regardless of how +// the upstream controller happened to serialize the slices. +// +// Slices whose order has runtime semantic meaning remain order-sensitive +// regardless of their listType: Container.Env (+listType=map by name, but +// order participates in $(VAR) substitution); InitContainers +// (+listType=map by name, but executed in slice order); EnvFrom and +// Tolerations (+listType=atomic). Reordering any of these is a real spec +// change and must continue to flip the hash. +func TestComputePCLQPodTemplateHash_PodSpecSliceOrderInvariants(t *testing.T) { + envByName := map[string]corev1.EnvVar{ + "FOO": {Name: "FOO", Value: "foo-value"}, + "BAR": {Name: "BAR", Value: "bar-value"}, + "BAZ": {Name: "BAZ", Value: "baz-value"}, + "QUX": {Name: "QUX", Value: "qux-value"}, + "QUUX": {Name: "QUUX", Value: "quux-value"}, + "CORGE": {Name: "CORGE", Value: "corge-value"}, + } + makeContainerWithEnv := func(name string, envOrder []string) corev1.Container { + envs := make([]corev1.EnvVar, 0, len(envOrder)) + for _, n := range envOrder { + envs = append(envs, envByName[n]) + } + return corev1.Container{ + Name: name, + Image: name + ":latest", + Env: envs, + } + } + makeTemplate := func(spec corev1.PodSpec) *grovecorev1alpha1.PodCliqueTemplateSpec { + return &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "worker", + Spec: grovecorev1alpha1.PodCliqueSpec{ + PodSpec: spec, + }, + } + } + + t.Run("container_reorder_does_not_change_hash", func(t *testing.T) { + envs := []string{"BAR", "BAZ", "CORGE", "FOO", "QUUX", "QUX"} + main := makeContainerWithEnv("main", envs) + side := corev1.Container{Name: "sidecar", Image: "sidecar:latest"} + + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{Containers: []corev1.Container{main, side}}), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{Containers: []corev1.Container{side, main}}), "") + + assert.Equal(t, hashA, hashB, + "Containers is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("volume_reorder_does_not_change_hash", func(t *testing.T) { + volA := corev1.Volume{Name: "model-cache", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} + volB := corev1.Volume{Name: "shared-memory", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} + volC := corev1.Volume{Name: "scratch", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} + + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + Volumes: []corev1.Volume{volA, volB, volC}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + Volumes: []corev1.Volume{volC, volA, volB}, + }), "") + + assert.Equal(t, hashA, hashB, + "Volumes is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("image_pull_secret_reorder_does_not_change_hash", func(t *testing.T) { + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + ImagePullSecrets: []corev1.LocalObjectReference{ + {Name: "docker-imagepullsecret"}, + {Name: "nvcr-imagepullsecret"}, + }, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + ImagePullSecrets: []corev1.LocalObjectReference{ + {Name: "nvcr-imagepullsecret"}, + {Name: "docker-imagepullsecret"}, + }, + }), "") + + assert.Equal(t, hashA, hashB, + "ImagePullSecrets is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("container_port_reorder_does_not_change_hash", func(t *testing.T) { + makeWithPorts := func(ports []corev1.ContainerPort) *grovecorev1alpha1.PodCliqueTemplateSpec { + return makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:latest", Ports: ports}}, + }) + } + hashA := ComputePCLQPodTemplateHash(makeWithPorts([]corev1.ContainerPort{ + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeWithPorts([]corev1.ContainerPort{ + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + }), "") + + assert.Equal(t, hashA, hashB, + "Container.Ports is +listType=map keyed by (containerPort, protocol) — order must not affect the hash") + }) + + t.Run("env_var_reorder_changes_hash", func(t *testing.T) { + envAlpha := []string{"BAR", "BAZ", "CORGE", "FOO", "QUUX", "QUX"} + envShuffled := []string{"FOO", "CORGE", "BAR", "QUUX", "BAZ", "QUX"} + + hashAlpha := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{makeContainerWithEnv("main", envAlpha)}, + }), "") + hashShuffled := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{makeContainerWithEnv("main", envShuffled)}, + }), "") + + assert.NotEqual(t, hashAlpha, hashShuffled, + "Container.Env is +listType=map by name but order participates in $(VAR) substitution — reorder is a real spec change and must change the hash") + }) + + t.Run("init_container_reorder_changes_hash", func(t *testing.T) { + initA := corev1.Container{Name: "init-a", Image: "init-a:latest"} + initB := corev1.Container{Name: "init-b", Image: "init-b:latest"} + + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + InitContainers: []corev1.Container{initA, initB}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + InitContainers: []corev1.Container{initB, initA}, + }), "") + + assert.NotEqual(t, hashA, hashB, + "InitContainers run in slice order — reorder is a real spec change and must change the hash") + }) + + // Container.ResizePolicy is +listType=atomic in the API. Each entry is + // keyed by resourceName at runtime, so order has no semantic effect, but + // pod.go intentionally treats the whole slice as opaque (atomic). If + // canonicalization is ever extended to ResizePolicy, this assertion will + // need to flip to assert.Equal. + t.Run("resize_policy_reorder_changes_hash", func(t *testing.T) { + rpCPU := corev1.ContainerResizePolicy{ResourceName: corev1.ResourceCPU, RestartPolicy: corev1.NotRequired} + rpMem := corev1.ContainerResizePolicy{ResourceName: corev1.ResourceMemory, RestartPolicy: corev1.RestartContainer} + + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", ResizePolicy: []corev1.ContainerResizePolicy{rpCPU, rpMem}}}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", ResizePolicy: []corev1.ContainerResizePolicy{rpMem, rpCPU}}}, + }), "") + + assert.NotEqual(t, hashA, hashB, + "Container.ResizePolicy is +listType=atomic — reorder is treated as a real spec change and must change the hash") + }) + + t.Run("identical_input_produces_identical_hash", func(t *testing.T) { + envs := []string{"BAR", "BAZ", "CORGE", "FOO", "QUUX", "QUX"} + containers := []corev1.Container{makeContainerWithEnv("main", envs), {Name: "sidecar", Image: "sidecar:latest"}} + + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{Containers: containers}), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{Containers: containers}), "") + + assert.Equal(t, hashA, hashB, "sanity: identical input must produce identical hash") + }) +} + +// TestComputePCLQPodTemplateHash_RealisticDynamoLikePodSpec is the +// regression test that maps directly back to the latency-mode bug. It +// builds a PodSpec mirroring what the Dynamo operator emits for a vLLM +// prefill worker (multiple containers, many env vars, multiple volumes, +// multiple imagePullSecrets, multiple ports), then permutes every +// +listType=map slice across two builds. The hash must be stable. +// +// Without the canonicalization fix in +// internal/utils/kubernetes/pod.go this test would fail and the gang would +// roll on every Dynamo operator-driven PCS update. +func TestComputePCLQPodTemplateHash_RealisticDynamoLikePodSpec(t *testing.T) { + mainContainer := func(envs []corev1.EnvVar, ports []corev1.ContainerPort, volumeMounts []corev1.VolumeMount) corev1.Container { + return corev1.Container{ + Name: "main", + Image: "nvcr.io/nvstaging/ai-dynamo/vllm-runtime:1.1.0-rc5", + Env: envs, + Ports: ports, + VolumeMounts: volumeMounts, + } + } + volumeMountsAlpha := []corev1.VolumeMount{ + {Name: "model-cache", MountPath: "/opt/model-cache"}, + {Name: "shared-memory", MountPath: "/dev/shm"}, + } + volumeMountsShuffled := []corev1.VolumeMount{volumeMountsAlpha[1], volumeMountsAlpha[0]} + + envsAlpha := []corev1.EnvVar{ + {Name: "DYN_FORWARDPASS_METRIC_PORT", Value: "8081"}, + {Name: "HF_HOME", Value: "/opt/model-cache"}, + {Name: "NATS_SERVER", Value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"}, + } + portsAlpha := []corev1.ContainerPort{ + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + } + + templateAsEmittedFirst := &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "vllmprefillworker", + Spec: grovecorev1alpha1.PodCliqueSpec{ + PodSpec: corev1.PodSpec{ + ImagePullSecrets: []corev1.LocalObjectReference{ + {Name: "docker-imagepullsecret"}, + {Name: "nvcr-imagepullsecret"}, + }, + Volumes: []corev1.Volume{ + {Name: "model-cache", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}}, + {Name: "shared-memory", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}}, + }, + Containers: []corev1.Container{ + mainContainer(envsAlpha, portsAlpha, volumeMountsAlpha), + {Name: "sidecar", Image: "sidecar:latest"}, + }, + }, + }, + } + + // templateAsEmittedSecond is the SAME desired state but with every + // +listType=map slice in a different order. This mimics the + // Dynamo-operator scenario where Go map iteration produces a different + // sequence on each PCS write. + portsShuffled := []corev1.ContainerPort{portsAlpha[1], portsAlpha[0]} + templateAsEmittedSecond := &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "vllmprefillworker", + Spec: grovecorev1alpha1.PodCliqueSpec{ + PodSpec: corev1.PodSpec{ + ImagePullSecrets: []corev1.LocalObjectReference{ + {Name: "nvcr-imagepullsecret"}, + {Name: "docker-imagepullsecret"}, + }, + Volumes: []corev1.Volume{ + {Name: "shared-memory", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}}, + {Name: "model-cache", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}}, + }, + Containers: []corev1.Container{ + {Name: "sidecar", Image: "sidecar:latest"}, + mainContainer(envsAlpha, portsShuffled, volumeMountsShuffled), + }, + }, + }, + } + + hashFirst := ComputePCLQPodTemplateHash(templateAsEmittedFirst, "") + hashSecond := ComputePCLQPodTemplateHash(templateAsEmittedSecond, "") + assert.Equal(t, hashFirst, hashSecond, + "two PodSpecs that represent the same desired state but differ only in +listType=map slice order must hash identically; otherwise the gang rolls on every operator reconcile") +} + +// TestComputePCLQPodTemplateHash_AdditionalListTypeMapSlices covers the +// +listType=map slices that aren't exercised by the older invariants test: +// VolumeMounts, VolumeDevices, HostAliases, TopologySpreadConstraints, +// ResourceClaims, SchedulingGates, Container.Resources.Claims, and +// EphemeralContainers. These were called out in the bug-report analysis as +// additional places where non-deterministic upstream serialization could flip +// the per-PCLQ hash. +func TestComputePCLQPodTemplateHash_AdditionalListTypeMapSlices(t *testing.T) { + makeTemplate := func(spec corev1.PodSpec) *grovecorev1alpha1.PodCliqueTemplateSpec { + return &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "worker", + Spec: grovecorev1alpha1.PodCliqueSpec{PodSpec: spec}, + } + } + + t.Run("volume_mount_reorder_does_not_change_hash", func(t *testing.T) { + mounts := []corev1.VolumeMount{ + {Name: "model-cache", MountPath: "/opt/model-cache"}, + {Name: "shared-memory", MountPath: "/dev/shm"}, + } + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", VolumeMounts: mounts}}, + }), "") + shuffled := []corev1.VolumeMount{mounts[1], mounts[0]} + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", VolumeMounts: shuffled}}, + }), "") + assert.Equal(t, hashA, hashB, + "Container.VolumeMounts order must not affect the per-PCLQ hash — this is the explicit case from the bug report") + }) + + t.Run("volume_device_reorder_does_not_change_hash", func(t *testing.T) { + devs := []corev1.VolumeDevice{ + {Name: "raw-a", DevicePath: "/dev/xvda"}, + {Name: "raw-b", DevicePath: "/dev/xvdb"}, + } + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", VolumeDevices: devs}}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", VolumeDevices: []corev1.VolumeDevice{devs[1], devs[0]}}}, + }), "") + assert.Equal(t, hashA, hashB, "Container.VolumeDevices order must not affect the per-PCLQ hash") + }) + + t.Run("host_alias_reorder_does_not_change_hash", func(t *testing.T) { + aliases := []corev1.HostAlias{ + {IP: "10.0.0.1", Hostnames: []string{"a"}}, + {IP: "10.0.0.2", Hostnames: []string{"b"}}, + } + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + HostAliases: aliases, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + HostAliases: []corev1.HostAlias{aliases[1], aliases[0]}, + }), "") + assert.Equal(t, hashA, hashB, "PodSpec.HostAliases order must not affect the per-PCLQ hash") + }) + + t.Run("topology_spread_constraint_reorder_does_not_change_hash", func(t *testing.T) { + tsc := []corev1.TopologySpreadConstraint{ + {MaxSkew: 1, TopologyKey: "topology.kubernetes.io/zone", WhenUnsatisfiable: corev1.DoNotSchedule}, + {MaxSkew: 1, TopologyKey: "kubernetes.io/hostname", WhenUnsatisfiable: corev1.ScheduleAnyway}, + } + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + TopologySpreadConstraints: tsc, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{tsc[1], tsc[0]}, + }), "") + assert.Equal(t, hashA, hashB, "PodSpec.TopologySpreadConstraints order must not affect the per-PCLQ hash") + }) + + t.Run("resource_claim_reorder_does_not_change_hash", func(t *testing.T) { + claims := []corev1.PodResourceClaim{{Name: "claim-a"}, {Name: "claim-b"}} + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + ResourceClaims: claims, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + ResourceClaims: []corev1.PodResourceClaim{claims[1], claims[0]}, + }), "") + assert.Equal(t, hashA, hashB, "PodSpec.ResourceClaims order must not affect the per-PCLQ hash") + }) + + t.Run("scheduling_gate_reorder_does_not_change_hash", func(t *testing.T) { + gates := []corev1.PodSchedulingGate{{Name: "gate-a"}, {Name: "gate-b"}} + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + SchedulingGates: gates, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + SchedulingGates: []corev1.PodSchedulingGate{gates[1], gates[0]}, + }), "") + assert.Equal(t, hashA, hashB, "PodSpec.SchedulingGates order must not affect the per-PCLQ hash") + }) + + t.Run("container_resource_claim_reorder_does_not_change_hash", func(t *testing.T) { + claims := []corev1.ResourceClaim{{Name: "claim-a"}, {Name: "claim-b"}} + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", Resources: corev1.ResourceRequirements{Claims: claims}}}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", Resources: corev1.ResourceRequirements{Claims: []corev1.ResourceClaim{claims[1], claims[0]}}}}, + }), "") + assert.Equal(t, hashA, hashB, "Container.Resources.Claims order must not affect the per-PCLQ hash") + }) + + t.Run("ephemeral_container_reorder_does_not_change_hash", func(t *testing.T) { + ec := []corev1.EphemeralContainer{ + {EphemeralContainerCommon: corev1.EphemeralContainerCommon{Name: "debug-a", Image: "busybox"}}, + {EphemeralContainerCommon: corev1.EphemeralContainerCommon{Name: "debug-b", Image: "busybox"}}, + } + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + EphemeralContainers: ec, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + EphemeralContainers: []corev1.EphemeralContainer{ec[1], ec[0]}, + }), "") + assert.Equal(t, hashA, hashB, "PodSpec.EphemeralContainers order must not affect the per-PCLQ hash") + }) + + t.Run("init_container_inner_volume_mounts_canonicalized", func(t *testing.T) { + // InitContainers themselves stay ordered (sequential execution), + // but VolumeMounts inside an InitContainer are order-independent. + mounts := []corev1.VolumeMount{ + {Name: "a", MountPath: "/a"}, + {Name: "b", MountPath: "/b"}, + } + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + InitContainers: []corev1.Container{{Name: "init", Image: "init:v1", VolumeMounts: mounts}}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + InitContainers: []corev1.Container{{Name: "init", Image: "init:v1", VolumeMounts: []corev1.VolumeMount{mounts[1], mounts[0]}}}, + }), "") + assert.Equal(t, hashA, hashB, + "InitContainer.VolumeMounts order must not affect the per-PCLQ hash even though InitContainers slice order does") + }) + + t.Run("init_container_inner_resource_claims_canonicalized", func(t *testing.T) { + claims := []corev1.ResourceClaim{{Name: "claim-a"}, {Name: "claim-b"}} + hashA := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + InitContainers: []corev1.Container{{Name: "init", Image: "init:v1", Resources: corev1.ResourceRequirements{Claims: claims}}}, + }), "") + hashB := ComputePCLQPodTemplateHash(makeTemplate(corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main"}}, + InitContainers: []corev1.Container{{Name: "init", Image: "init:v1", Resources: corev1.ResourceRequirements{Claims: []corev1.ResourceClaim{claims[1], claims[0]}}}}, + }), "") + assert.Equal(t, hashA, hashB, + "InitContainer.Resources.Claims order must not affect the per-PCLQ hash even though InitContainers slice order does") + }) +} + +// TestComputePCLQPodTemplateHash_RealSpecChangesStillFlipHash is the +// regression test in the other direction: the per-PCLQ hash must continue +// to change when the desired state actually changes. If a future +// over-canonicalization (e.g. accidentally sorting Env, or zeroing out a +// field) silently masked real changes, rolling updates would simply stop +// working and pass this test suite. Pin it. +func TestComputePCLQPodTemplateHash_RealSpecChangesStillFlipHash(t *testing.T) { + base := &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "worker", + Spec: grovecorev1alpha1.PodCliqueSpec{PodSpec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "main", Image: "main:v1", Env: []corev1.EnvVar{{Name: "FOO", Value: "1"}}, VolumeMounts: []corev1.VolumeMount{{Name: "cache", MountPath: "/cache"}}}, + }, + Volumes: []corev1.Volume{{Name: "cache"}}, + }}, + } + baseHash := ComputePCLQPodTemplateHash(base, "") + + t.Run("image_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.PodSpec.Containers[0].Image = "main:v2" + assert.NotEqual(t, baseHash, ComputePCLQPodTemplateHash(mut, ""), "image change must flip the per-PCLQ hash") + }) + t.Run("env_value_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.PodSpec.Containers[0].Env[0].Value = "2" + assert.NotEqual(t, baseHash, ComputePCLQPodTemplateHash(mut, ""), "env var value change must flip the per-PCLQ hash") + }) + t.Run("volume_mount_path_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.PodSpec.Containers[0].VolumeMounts[0].MountPath = "/cache-new" + assert.NotEqual(t, baseHash, ComputePCLQPodTemplateHash(mut, ""), "changing a volumeMount's mountPath must flip the per-PCLQ hash") + }) + t.Run("new_volume_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.PodSpec.Volumes = append(mut.Spec.PodSpec.Volumes, corev1.Volume{Name: "scratch"}) + assert.NotEqual(t, baseHash, ComputePCLQPodTemplateHash(mut, ""), "adding a volume must flip the per-PCLQ hash") + }) + t.Run("pclq_template_label_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Labels = map[string]string{"role": "worker-v2"} + assert.NotEqual(t, baseHash, ComputePCLQPodTemplateHash(mut, ""), "PodCliqueTemplateSpec.Labels change must flip the per-PCLQ hash") + }) +} diff --git a/operator/internal/controller/common/component/utils/podcliquescalinggroup.go b/operator/internal/controller/common/component/utils/podcliquescalinggroup.go index 007f8385b..bb904a728 100644 --- a/operator/internal/controller/common/component/utils/podcliquescalinggroup.go +++ b/operator/internal/controller/common/component/utils/podcliquescalinggroup.go @@ -124,6 +124,14 @@ func GetPCSGsByPCSReplicaIndex(ctx context.Context, cl client.Client, pcsObjKey // GetPCLQTemplateHashes generates the Pod template hash for all PCLQs in a PCSG. Returns a map of [PCLQ Name : PodTemplateHas] func GetPCLQTemplateHashes(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup) map[string]string { + hashCandidates := GetPCLQTemplateHashCandidates(pcs, pcsg) + return lo.MapValues(hashCandidates, func(candidates HashCandidates, _ string) string { + return candidates.Canonical + }) +} + +// GetPCLQTemplateHashCandidates generates the canonical and legacy Pod template hash candidates for all PCLQs in a PCSG. +func GetPCLQTemplateHashCandidates(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup) map[string]HashCandidates { pclqTemplateSpecs := make([]*grovecorev1alpha1.PodCliqueTemplateSpec, 0, len(pcsg.Spec.CliqueNames)) for _, cliqueName := range pcsg.Spec.CliqueNames { pclqTemplateSpec := FindPodCliqueTemplateSpecByName(pcs, cliqueName) @@ -132,11 +140,11 @@ func GetPCLQTemplateHashes(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev } pclqTemplateSpecs = append(pclqTemplateSpecs, pclqTemplateSpec) } - cliqueTemplateSpecHashes := make(map[string]string, len(pclqTemplateSpecs)) + cliqueTemplateSpecHashes := make(map[string]HashCandidates, len(pclqTemplateSpecs)) for pcsgReplicaIndex := range int(pcsg.Spec.Replicas) { for _, pclqTemplateSpec := range pclqTemplateSpecs { pclqFQN := apicommon.GeneratePodCliqueName(apicommon.ResourceNameReplica{Name: pcsg.Name, Replica: pcsgReplicaIndex}, pclqTemplateSpec.Name) - cliqueTemplateSpecHashes[pclqFQN] = ComputePCLQPodTemplateHash(pclqTemplateSpec, pcs.Spec.Template.PriorityClassName) + cliqueTemplateSpecHashes[pclqFQN] = ComputePCLQPodTemplateHashCandidates(pclqTemplateSpec, pcs.Spec.Template.PriorityClassName) } } return cliqueTemplateSpecHashes @@ -147,12 +155,12 @@ func GetPCLQTemplateHashes(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev // a computed PodTemplateHash from the latest PodCliqueSet resource. func GetPCLQsInPCSGPendingUpdate(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup, existingPCLQs []grovecorev1alpha1.PodClique) []string { pclqFQNsPendingUpdate := make([]string, 0, len(existingPCLQs)) - expectedPCLQPodTemplateHashes := GetPCLQTemplateHashes(pcs, pcsg) + expectedPCLQPodTemplateHashes := GetPCLQTemplateHashCandidates(pcs, pcsg) for _, existingPCLQ := range existingPCLQs { existingPodTemplateHash := existingPCLQ.Labels[apicommon.LabelPodTemplateHash] expectedPodTemplateHash := expectedPCLQPodTemplateHashes[existingPCLQ.Name] - if existingPodTemplateHash != expectedPodTemplateHash { - pclqFQNsPendingUpdate = append(pclqFQNsPendingUpdate, expectedPodTemplateHash) + if !expectedPodTemplateHash.Matches(existingPodTemplateHash) { + pclqFQNsPendingUpdate = append(pclqFQNsPendingUpdate, existingPCLQ.Name) } } return pclqFQNsPendingUpdate @@ -164,8 +172,14 @@ func IsPCSGUpdateInProgress(pcsg *grovecorev1alpha1.PodCliqueScalingGroup) bool } // IsPCSGUpdateComplete returns whether the rolling update of the PodCliqueScalingGroup is complete. -func IsPCSGUpdateComplete(pcsg *grovecorev1alpha1.PodCliqueScalingGroup, pcsGenerationHash string) bool { - return pcsg.Status.CurrentPodCliqueSetGenerationHash != nil && *pcsg.Status.CurrentPodCliqueSetGenerationHash == pcsGenerationHash +func IsPCSGUpdateComplete(pcsg *grovecorev1alpha1.PodCliqueScalingGroup, pcsGenerationHash string, compatiblePCSGenerationHashes ...string) bool { + if pcsg.Status.CurrentPodCliqueSetGenerationHash == nil { + return false + } + if *pcsg.Status.CurrentPodCliqueSetGenerationHash == pcsGenerationHash { + return true + } + return slices.Contains(compatiblePCSGenerationHashes, *pcsg.Status.CurrentPodCliqueSetGenerationHash) } // GetPodCliqueFQNsForPCSG generates the PodClique FQNs for all PodCliques that are owned by a PodCliqueScalingGroup. diff --git a/operator/internal/controller/common/component/utils/podcliquescalinggroup_test.go b/operator/internal/controller/common/component/utils/podcliquescalinggroup_test.go index c57487aa3..a4fa67921 100644 --- a/operator/internal/controller/common/component/utils/podcliquescalinggroup_test.go +++ b/operator/internal/controller/common/component/utils/podcliquescalinggroup_test.go @@ -164,6 +164,7 @@ func TestIsPCSGUpdateComplete(t *testing.T) { name string pcsg *grovecorev1alpha1.PodCliqueScalingGroup pcsGenerationHash string + compatibleHashes []string expected bool }{ { @@ -196,11 +197,22 @@ func TestIsPCSGUpdateComplete(t *testing.T) { pcsGenerationHash: "new-hash", expected: false, }, + { + name: "returns_true_when_hash_matches_compatible_legacy_hash", + pcsg: &grovecorev1alpha1.PodCliqueScalingGroup{ + Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + CurrentPodCliqueSetGenerationHash: ptr.To("legacy-hash"), + }, + }, + pcsGenerationHash: "canonical-hash", + compatibleHashes: []string{"legacy-hash"}, + expected: true, + }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - result := IsPCSGUpdateComplete(tc.pcsg, tc.pcsGenerationHash) + result := IsPCSGUpdateComplete(tc.pcsg, tc.pcsGenerationHash, tc.compatibleHashes...) assert.Equal(t, tc.expected, result) }) } diff --git a/operator/internal/controller/podclique/components/pod/deletionsort.go b/operator/internal/controller/podclique/components/pod/deletionsort.go index 5373b95ab..902f4c168 100644 --- a/operator/internal/controller/podclique/components/pod/deletionsort.go +++ b/operator/internal/controller/podclique/components/pod/deletionsort.go @@ -18,6 +18,7 @@ package pod import ( apicommon "github.com/ai-dynamo/grove/operator/api/common" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" corev1 "k8s.io/api/core/v1" ) @@ -27,6 +28,8 @@ type DeletionSorter struct { Pods []*corev1.Pod // ExpectedPodTemplateHash is the hash that is expected as a label on the updated pods ExpectedPodTemplateHash string + // ExpectedPodTemplateHashes are the canonical and legacy hashes accepted during hash migration. + ExpectedPodTemplateHashes componentutils.HashCandidates } // Len returns the length of the DeletionSorter @@ -65,7 +68,7 @@ func (s DeletionSorter) Less(i, j int) bool { // 4. Pods with older hashes < Pods with newer hashes if s.Pods[i].Labels[apicommon.LabelPodTemplateHash] != s.Pods[j].Labels[apicommon.LabelPodTemplateHash] { - return s.Pods[i].Labels[apicommon.LabelPodTemplateHash] != s.ExpectedPodTemplateHash + return !s.expectedHashMatches(s.Pods[i].Labels[apicommon.LabelPodTemplateHash]) } // 5. Empty creation time pods < newer pods < older pods @@ -75,6 +78,13 @@ func (s DeletionSorter) Less(i, j int) bool { return s.Pods[i].CreationTimestamp.After(s.Pods[j].CreationTimestamp.Time) } +func (s DeletionSorter) expectedHashMatches(hash string) bool { + if s.ExpectedPodTemplateHashes.Canonical != "" { + return s.ExpectedPodTemplateHashes.Matches(hash) + } + return hash == s.ExpectedPodTemplateHash +} + // isPodReady checks if a pod is ready by looking for the PodReady condition with status True func isPodReady(pod *corev1.Pod) bool { for _, condition := range pod.Status.Conditions { diff --git a/operator/internal/controller/podclique/components/pod/rollingupdate.go b/operator/internal/controller/podclique/components/pod/rollingupdate.go index 77f71309a..304d9a11f 100644 --- a/operator/internal/controller/podclique/components/pod/rollingupdate.go +++ b/operator/internal/controller/podclique/components/pod/rollingupdate.go @@ -141,7 +141,7 @@ func (r _resource) processPendingUpdates(logger logr.Logger, sc *syncContext) er func (r _resource) computeUpdateWork(logger logr.Logger, sc *syncContext) *updateWork { work := &updateWork{} for _, pod := range sc.existingPCLQPods { - if pod.Labels[common.LabelPodTemplateHash] != sc.expectedPodTemplateHash { + if !sc.podTemplateHashMatchesExpected(pod.Labels[common.LabelPodTemplateHash]) { // Old-hash pod — skip if deletion already in flight. if r.hasPodDeletionBeenTriggered(sc, pod) { logger.Info("skipping old Pod since its deletion has already been triggered", "pod", client.ObjectKeyFromObject(pod)) @@ -172,6 +172,13 @@ func (r _resource) computeUpdateWork(logger logr.Logger, sc *syncContext) *updat return work } +func (sc *syncContext) podTemplateHashMatchesExpected(hash string) bool { + if sc.expectedPodTemplateHashes.Canonical != "" { + return sc.expectedPodTemplateHashes.Matches(hash) + } + return hash == sc.expectedPodTemplateHash +} + // hasPodDeletionBeenTriggered checks if a pod is already terminating or has a delete expectation recorded func (r _resource) hasPodDeletionBeenTriggered(sc *syncContext, pod *corev1.Pod) bool { return k8sutils.IsResourceTerminating(pod.ObjectMeta) || r.expectationsStore.HasDeleteExpectation(sc.pclqExpectationsStoreKey, pod.GetUID()) diff --git a/operator/internal/controller/podclique/components/pod/rollingupdate_test.go b/operator/internal/controller/podclique/components/pod/rollingupdate_test.go index 9b5c72fb5..82d6613c2 100644 --- a/operator/internal/controller/podclique/components/pod/rollingupdate_test.go +++ b/operator/internal/controller/podclique/components/pod/rollingupdate_test.go @@ -17,17 +17,26 @@ limitations under the License. package pod import ( + "context" "fmt" "testing" "github.com/ai-dynamo/grove/operator/api/common" + grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" "github.com/ai-dynamo/grove/operator/internal/expect" "github.com/go-logr/logr" + "github.com/samber/lo" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" ) const ( @@ -93,6 +102,44 @@ func TestComputeUpdateWork(t *testing.T) { } } +// TestComputeUpdateWorkTreatsLegacyCurrentHashAsNew verifies that a ready pod +// stamped with the legacy form of the current template hash is not considered +// old work during the hash migration. Only pods whose hash matches neither the +// canonical nor legacy candidate should be queued for rolling replacement. +func TestComputeUpdateWorkTreatsLegacyCurrentHashAsNew(t *testing.T) { + r := _resource{expectationsStore: expect.NewExpectationsStore()} + sc := &syncContext{ + // The canonical and legacy pods both represent the desired template. + // The stale pod simulates an actually outdated template hash. + existingPCLQPods: []*corev1.Pod{ + newTestPod("canonical-ready", "canonical-hash", withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + newTestPod("legacy-ready", "legacy-hash", withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + newTestPod("stale-ready", "stale-hash", withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + }, + expectedPodTemplateHash: "canonical-hash", + expectedPodTemplateHashes: componentutils.HashCandidates{ + Canonical: "canonical-hash", + Legacy: "legacy-hash", + }, + pclqExpectationsStoreKey: "test-key", + } + + work := r.computeUpdateWork(logr.Discard(), sc) + + assert.ElementsMatch(t, []string{"canonical-ready", "legacy-ready"}, podNames(work.newTemplateHashReadyPods)) + assert.ElementsMatch(t, []string{"stale-ready"}, podNames(work.oldTemplateHashReadyPods)) + assert.Empty(t, work.oldTemplateHashPendingPods) + assert.Empty(t, work.oldTemplateHashUnhealthyPods) + assert.Empty(t, work.oldTemplateHashStartingPods) + assert.Empty(t, work.oldTemplateHashUncategorizedPods) +} + +func podNames(pods []*corev1.Pod) []string { + return lo.Map(pods, func(pod *corev1.Pod, _ int) string { + return pod.Name + }) +} + // newTestPod creates a pod with the given name, template hash label, and options applied. func newTestPod(name, templateHash string, opts ...func(*corev1.Pod)) *corev1.Pod { pod := &corev1.Pod{ @@ -149,6 +196,190 @@ func withDeletionTimestamp() func(*corev1.Pod) { } } +// TestComputeUpdateWorkPCSHashFlipDoesNotProduceOldHashWork verifies that a +// PCS-level hash change does not schedule pod deletion when each pod's per-PCLQ +// template hash is unchanged. +// +// This is the narrow classification test: it calls computeUpdateWork directly +// and asserts that the stable-hash pods land in the new-template bucket, with +// every old-template bucket empty. +func TestComputeUpdateWorkPCSHashFlipDoesNotProduceOldHashWork(t *testing.T) { + const sharedHash = "per-pclq-hash-stable" + + // All four pods carry the same per-PCLQ pod-template hash on the + // LabelPodTemplateHash label — the same hash the PCS template would + // produce for this clique even after a sibling clique was reordered in + // the cliques slice. + existingPods := []*corev1.Pod{ + newTestPod("frontend-r0-pod-0", sharedHash, withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + newTestPod("planner-r0-pod-0", sharedHash, withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + newTestPod("decode-r0-pod-0", sharedHash, withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + newTestPod("prefill-r0-pod-0", sharedHash, withPhase(corev1.PodRunning), withReadyCondition(), withContainerStatus(ptr.To(true), true)), + } + + r := _resource{expectationsStore: expect.NewExpectationsStore()} + sc := &syncContext{ + existingPCLQPods: existingPods, + // expectedPodTemplateHash matches every pod's LabelPodTemplateHash + // because ComputePCLQPodTemplateHash for an unchanged per-clique + // template returns an unchanged value — irrespective of whether the + // PCS-level computeGenerationHash flipped due to clique reorder. + expectedPodTemplateHash: sharedHash, + pclqExpectationsStoreKey: "test-key", + } + + work := r.computeUpdateWork(logr.Discard(), sc) + + assert.Empty(t, work.oldTemplateHashPendingPods, "no pod should be classified as old-pending when per-PCLQ hash is unchanged") + assert.Empty(t, work.oldTemplateHashUnhealthyPods, "no pod should be classified as old-unhealthy when per-PCLQ hash is unchanged") + assert.Empty(t, work.oldTemplateHashStartingPods, "no pod should be classified as old-starting when per-PCLQ hash is unchanged") + assert.Empty(t, work.oldTemplateHashUncategorizedPods, "no pod should be classified as old-uncategorized when per-PCLQ hash is unchanged") + assert.Empty(t, work.oldTemplateHashReadyPods, "no pod should be classified as old-ready when per-PCLQ hash is unchanged") + assert.Len(t, work.newTemplateHashReadyPods, len(existingPods), + "every existing ready pod must land in the new-template-hash bucket when per-PCLQ hash is unchanged; otherwise processPendingUpdates would delete them") + + // processPendingUpdates uses these same buckets to decide what to delete. + // With every "old" bucket empty: + // - deleteOldNonReadyPods is a no-op (lo.Union of empty slices) + // - getPodNamesPendingUpdate returns no pods + // - nextPodToUpdate is nil + // - control falls through to markRollingUpdateEnd + // i.e. NO pods are deleted from a pure clique-slice reorder. + allOldBuckets := append([]*corev1.Pod{}, work.oldTemplateHashPendingPods...) + allOldBuckets = append(allOldBuckets, work.oldTemplateHashUnhealthyPods...) + allOldBuckets = append(allOldBuckets, work.oldTemplateHashStartingPods...) + allOldBuckets = append(allOldBuckets, work.oldTemplateHashUncategorizedPods...) + allOldBuckets = append(allOldBuckets, work.oldTemplateHashReadyPods...) + assert.Empty(t, allOldBuckets, + "sanity: union of all old-hash buckets must be empty — this is the precondition that prevents pod deletion in the rolling-update path") +} + +// TestProcessPendingUpdatesPCSHashFlipDoesNotDeletePods verifies that +// processPendingUpdates finishes a rolling update without deleting pods when +// the PCS-level hash changed but the per-PCLQ pod-template hash did not. +// +// This is the full-flow regression test: it runs processPendingUpdates with a +// fake client and asserts both externally visible effects - no pods are deleted +// and the rolling update is marked complete. +func TestProcessPendingUpdatesPCSHashFlipDoesNotDeletePods(t *testing.T) { + const sharedHash = "per-pclq-hash-stable" + const pcsHashAfterReorder = "pcs-generation-hash-after-clique-reorder" + const namespace = testNS + const pclqName = "test-pclq" + + scheme := runtime.NewScheme() + require.NoError(t, grovecorev1alpha1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + pclq := &grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{ + Name: pclqName, + Namespace: namespace, + Labels: map[string]string{ + common.LabelPodTemplateHash: sharedHash, + }, + }, + Spec: grovecorev1alpha1.PodCliqueSpec{ + Replicas: 4, + MinAvailable: ptr.To(int32(1)), + }, + Status: grovecorev1alpha1.PodCliqueStatus{ + Replicas: 4, + ReadyReplicas: 4, + UpdatedReplicas: 4, + // Update was just reset by the PCLQ reconciler in response to a + // PCS generation hash flip. The per-PCLQ template hash recorded + // here is identical to what is already on the pod labels because + // only the cliques map-list was reordered. + UpdateProgress: &grovecorev1alpha1.PodCliqueUpdateProgress{ + UpdateStartedAt: metav1.Now(), + PodCliqueSetGenerationHash: pcsHashAfterReorder, + PodTemplateHash: sharedHash, + }, + }, + } + + makePod := func(name string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + common.LabelPodTemplateHash: sharedHash, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionTrue}, + }, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Started: ptr.To(true), Ready: true}, + }, + }, + } + } + + pods := []*corev1.Pod{ + makePod("pclq-pod-0"), + makePod("pclq-pod-1"), + makePod("pclq-pod-2"), + makePod("pclq-pod-3"), + } + originalUIDs := make(map[string]string, len(pods)) + for _, p := range pods { + originalUIDs[p.Name] = string(p.UID) + } + + objs := []client.Object{pclq} + for i := range pods { + objs = append(objs, pods[i]) + } + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objs...). + WithStatusSubresource(&grovecorev1alpha1.PodClique{}). + Build() + + r := _resource{ + client: fakeClient, + scheme: scheme, + eventRecorder: record.NewFakeRecorder(32), + expectationsStore: expect.NewExpectationsStore(), + } + + sc := &syncContext{ + ctx: context.Background(), + pclq: pclq, + existingPCLQPods: pods, + expectedPodTemplateHash: sharedHash, + pclqExpectationsStoreKey: namespace + "/" + pclqName, + } + + require.NoError(t, r.processPendingUpdates(logr.Discard(), sc), + "processPendingUpdates must not error when there is no work to do") + + // 1. No pods deleted. + remainingPods := &corev1.PodList{} + require.NoError(t, fakeClient.List(context.Background(), remainingPods, client.InNamespace(namespace))) + assert.Len(t, remainingPods.Items, len(pods), + "no pods should be deleted when per-PCLQ pod-template hash is unchanged across the PCS hash flip") + for _, p := range remainingPods.Items { + assert.Nil(t, p.DeletionTimestamp, + "pod %q must not be marked for deletion", p.Name) + } + + // 2. Rolling update was marked complete (markRollingUpdateEnd ran). + updatedPCLQ := &grovecorev1alpha1.PodClique{} + require.NoError(t, fakeClient.Get(context.Background(), + client.ObjectKey{Namespace: namespace, Name: pclqName}, updatedPCLQ)) + require.NotNil(t, updatedPCLQ.Status.UpdateProgress, "UpdateProgress must remain set") + require.NotNil(t, updatedPCLQ.Status.UpdateProgress.UpdateEndedAt, + "UpdateEndedAt must be set — markRollingUpdateEnd should have run because there was no real per-PCLQ work") + assert.Nil(t, updatedPCLQ.Status.UpdateProgress.ReadyPodsSelectedToUpdate, + "ReadyPodsSelectedToUpdate must be cleared by markRollingUpdateEnd") +} + // bucket identifies which updateWork bucket a pod should land in. type bucket int diff --git a/operator/internal/controller/podclique/components/pod/syncflow.go b/operator/internal/controller/podclique/components/pod/syncflow.go index 1e3d63f6d..621f91275 100644 --- a/operator/internal/controller/podclique/components/pod/syncflow.go +++ b/operator/internal/controller/podclique/components/pod/syncflow.go @@ -60,7 +60,7 @@ func (r _resource) prepareSyncFlow(ctx context.Context, logger logr.Logger, pclq ) } - sc.expectedPodTemplateHash, err = componentutils.GetExpectedPCLQPodTemplateHash(sc.pcs, pclq.ObjectMeta) + sc.expectedPodTemplateHashes, err = componentutils.GetExpectedPCLQPodTemplateHashCandidates(sc.pcs, pclq.ObjectMeta) if err != nil { return nil, groveerr.WrapError(err, errCodeGetPodCliqueTemplate, @@ -68,6 +68,7 @@ func (r _resource) prepareSyncFlow(ctx context.Context, logger logr.Logger, pclq fmt.Sprintf("failed to compute pod clique template hash for PodClique: %v in PodCliqueSet", client.ObjectKeyFromObject(pclq)), ) } + sc.expectedPodTemplateHash = sc.expectedPodTemplateHashes.Canonical // get the PCLQ expectations key sc.pclqExpectationsStoreKey, err = getPodCliqueExpectationsStoreKey(logger, component.OperationSync, pclq.ObjectMeta) @@ -101,6 +102,9 @@ func (r _resource) prepareSyncFlow(ctx context.Context, logger logr.Logger, pclq fmt.Sprintf("failed to list pods that belong to the PodClique %v", client.ObjectKeyFromObject(pclq)), ) } + if err = r.migrateLegacyCurrentPodLabels(ctx, logger, sc); err != nil { + return nil, err + } return sc, nil } @@ -234,7 +238,8 @@ func selectExcessPodsToDelete(sc *syncContext, logger logr.Logger) []*corev1.Pod if diff := len(sc.existingPCLQPods) - int(sc.pclq.Spec.Replicas); diff > 0 { logger.Info("found excess pods for PodClique", "numExcessPods", diff) sorter := DeletionSorter{ - Pods: sc.existingPCLQPods, + Pods: sc.existingPCLQPods, + ExpectedPodTemplateHashes: sc.expectedPodTemplateHashes, } sorter.ExpectedPodTemplateHash = sc.getExpectedPodTemplateHash() sort.Sort(sorter) @@ -252,6 +257,32 @@ func (sc *syncContext) getExpectedPodTemplateHash() string { return sc.pclq.Labels[apicommon.LabelPodTemplateHash] } +// Pods carry the PodClique pod-template hash in a label so the controller can +// tell which desired template created each live pod. Older controllers hashed +// the same template before canonicalizing Kubernetes map-list fields, so those +// pods may have a legacy hash that should converge without a rollout. +func (r _resource) migrateLegacyCurrentPodLabels(ctx context.Context, logger logr.Logger, sc *syncContext) error { + for _, pod := range sc.existingPCLQPods { + if !sc.expectedPodTemplateHashes.IsLegacy(pod.Labels[apicommon.LabelPodTemplateHash]) { + continue + } + patch := client.MergeFrom(pod.DeepCopy()) + if pod.Labels == nil { + pod.Labels = make(map[string]string, 1) + } + pod.Labels[apicommon.LabelPodTemplateHash] = sc.expectedPodTemplateHashes.Canonical + if err := client.IgnoreNotFound(r.client.Patch(ctx, pod, patch)); err != nil { + return groveerr.WrapError(err, + errCodeBuildPodResource, + component.OperationSync, + fmt.Sprintf("failed to migrate legacy PodTemplateHash label for Pod %v", client.ObjectKeyFromObject(pod)), + ) + } + logger.Info("migrated legacy PodTemplateHash label on Pod", "pod", client.ObjectKeyFromObject(pod), "podTemplateHash", sc.expectedPodTemplateHashes.Canonical) + } + return nil +} + // checkAndRemovePodSchedulingGates removes scheduling gates from pods when their dependencies are satisfied func (r _resource) checkAndRemovePodSchedulingGates(sc *syncContext, logger logr.Logger) ([]string, error) { tasks := make([]utils.Task, 0, len(sc.existingPCLQPods)) @@ -451,6 +482,7 @@ type syncContext struct { podNamesUpdatedInPCLQPodGangSet componentutils.Set[string] pclqExpectationsStoreKey string expectedPodTemplateHash string + expectedPodTemplateHashes componentutils.HashCandidates } // syncFlowResult captures the result of a sync flow run. diff --git a/operator/internal/controller/podclique/reconcilespec.go b/operator/internal/controller/podclique/reconcilespec.go index 9b88e9124..3c4b6ebaa 100644 --- a/operator/internal/controller/podclique/reconcilespec.go +++ b/operator/internal/controller/podclique/reconcilespec.go @@ -76,6 +76,13 @@ func (r *Reconciler) processUpdate(ctx context.Context, logger logr.Logger, pclq return ctrlcommon.ReconcileWithErrors(fmt.Sprintf("could not get owner PodCliqueSet for PodClique: %v", pclqObjectKey), err) } + // Older controllers may have labeled this same desired PodClique template + // with the pre-canonical pod-template hash. Rewrite it before comparing + // update state. + if err = r.migrateLegacyCurrentPodCliqueLabel(ctx, pcs, pclq); err != nil { + return ctrlcommon.ReconcileWithErrors("could not migrate legacy PodClique template hash label", err) + } + // Handle OnDelete strategy first if !componentutils.IsAutoUpdateStrategy(pcs) { if shouldResetOrTriggerUpdate(pcs, pclq) { @@ -135,25 +142,55 @@ func shouldCheckPendingUpdatesForPCLQ(logger logr.Logger, pcs *grovecorev1alpha1 return true, nil } +// migrateLegacyCurrentPodCliqueLabel rewrites the pod-template hash label to +// its canonical value when the PodClique still carries the legacy hash from +// v0.1.0-alpha.8. Without this, an unchanged template would look drifted and +// trigger a spurious rolling update. No-op if the label is already canonical. +func (r *Reconciler) migrateLegacyCurrentPodCliqueLabel(ctx context.Context, pcs *grovecorev1alpha1.PodCliqueSet, pclq *grovecorev1alpha1.PodClique) error { + templateHashCandidates, err := componentutils.GetExpectedPCLQPodTemplateHashCandidates(pcs, pclq.ObjectMeta) + if err != nil { + return err + } + if !templateHashCandidates.IsLegacy(pclq.Labels[apicommon.LabelPodTemplateHash]) { + return nil + } + patch := client.MergeFrom(pclq.DeepCopy()) + if pclq.Labels == nil { + pclq.Labels = make(map[string]string, 1) + } + pclq.Labels[apicommon.LabelPodTemplateHash] = templateHashCandidates.Canonical + if err := r.client.Patch(ctx, pclq, patch); err != nil { + return fmt.Errorf("failed to migrate PodClique %s pod template hash label: %w", client.ObjectKeyFromObject(pclq), err) + } + return nil +} + // shouldResetOrTriggerUpdate determines if an update should be started or reset based on generation hash comparison func shouldResetOrTriggerUpdate(pcs *grovecorev1alpha1.PodCliqueSet, pclq *grovecorev1alpha1.PodClique) bool { + if pcs.Status.CurrentGenerationHash == nil { + return false + } + pcsGenerationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) + matchesCurrentPCSGeneration := func(hash string) bool { + return hash == *pcs.Status.CurrentGenerationHash || pcsGenerationHashCandidates.Matches(hash) + } + // PCLQ has never been updated yet and PCS has a new generation hash. - firstEverUpdateRequired := pclq.Status.UpdateProgress == nil && pclq.Status.CurrentPodCliqueSetGenerationHash != nil && *pcs.Status.CurrentGenerationHash != *pclq.Status.CurrentPodCliqueSetGenerationHash + firstEverUpdateRequired := pclq.Status.UpdateProgress == nil && + pclq.Status.CurrentPodCliqueSetGenerationHash != nil && + !matchesCurrentPCSGeneration(*pclq.Status.CurrentPodCliqueSetGenerationHash) if firstEverUpdateRequired { return true } - // Wait for the first reconciliation of the PodCliqueSet - if pcs.Status.CurrentGenerationHash == nil { - return false - } - // PCLQ is undergoing an update for a different PCS generation hash // Irrespective of whether the pod template hash has changed or not, the in-progress update is stale and needs to be // reset in order to set the correct updateProgress.PodCliqueSetGenerationHash - inProgressPCLQUpdateNotStale := componentutils.IsPCLQAutoUpdateInProgress(pclq) && pclq.Status.UpdateProgress.PodCliqueSetGenerationHash == *pcs.Status.CurrentGenerationHash + inProgressPCLQUpdateNotStale := componentutils.IsPCLQAutoUpdateInProgress(pclq) && + matchesCurrentPCSGeneration(pclq.Status.UpdateProgress.PodCliqueSetGenerationHash) // PCLQ had an update in the past but that was for an older PCS generation hash. - lastCompletedUpdateIsNotStale := componentutils.IsLastPCLQUpdateCompleted(pclq) && pclq.Status.UpdateProgress.PodCliqueSetGenerationHash == *pcs.Status.CurrentGenerationHash + lastCompletedUpdateIsNotStale := componentutils.IsLastPCLQUpdateCompleted(pclq) && + matchesCurrentPCSGeneration(pclq.Status.UpdateProgress.PodCliqueSetGenerationHash) if inProgressPCLQUpdateNotStale || lastCompletedUpdateIsNotStale { return false } diff --git a/operator/internal/controller/podclique/reconcilestatus.go b/operator/internal/controller/podclique/reconcilestatus.go index 74ded2305..e3125e72c 100644 --- a/operator/internal/controller/podclique/reconcilestatus.go +++ b/operator/internal/controller/podclique/reconcilestatus.go @@ -69,7 +69,7 @@ func (r *Reconciler) reconcileStatus(ctx context.Context, logger logr.Logger, pc } // mutate PodClique Status Replicas, ReadyReplicas, ScheduleGatedReplicas and UpdatedReplicas. mutateReplicas(pclq, podCategories, len(existingPods)) - mutateUpdatedReplica(pclq, existingPods) + mutateUpdatedReplica(pcs, pclq, existingPods) // mutate the conditions only if the PodClique has been successfully reconciled at least once. // This prevents prematurely setting incorrect conditions. @@ -112,23 +112,42 @@ func mutateCurrentHashes(logger logr.Logger, pcs *grovecorev1alpha1.PodCliqueSet logger.Info("PodClique is currently updating, cannot set PodCliqueSet CurrentGenerationHash yet") return nil } + if pcs.Status.CurrentGenerationHash == nil { + return nil + } + expectedPodTemplateHashes, err := componentutils.GetExpectedPCLQPodTemplateHashCandidates(pcs, pclq.ObjectMeta) + if err != nil { + return err + } + pcsGenerationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) + if pclq.Status.UpdateProgress == nil { - expectedPodTemplateHash, err := componentutils.GetExpectedPCLQPodTemplateHash(pcs, pclq.ObjectMeta) - if err != nil { - return err - } - if pclq.Status.CurrentPodTemplateHash == nil || *pclq.Status.CurrentPodTemplateHash == expectedPodTemplateHash { - pclq.Status.CurrentPodTemplateHash = ptr.To(expectedPodTemplateHash) - pclq.Status.CurrentPodCliqueSetGenerationHash = pcs.Status.CurrentGenerationHash + if isPodCliqueTemplateHashCurrent(pclq, expectedPodTemplateHashes) { + pclq.Status.CurrentPodTemplateHash = ptr.To(expectedPodTemplateHashes.Canonical) + pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To(pcsGenerationHashes.Canonical) } } else if componentutils.IsLastPCLQUpdateCompleted(pclq) { logger.Info("PodClique update has completed, setting CurrentPodCliqueSetGenerationHash") + if expectedPodTemplateHashes.Matches(pclq.Status.UpdateProgress.PodTemplateHash) && + pcsGenerationHashes.Matches(pclq.Status.UpdateProgress.PodCliqueSetGenerationHash) { + pclq.Status.UpdateProgress.PodTemplateHash = expectedPodTemplateHashes.Canonical + pclq.Status.UpdateProgress.PodCliqueSetGenerationHash = pcsGenerationHashes.Canonical + } pclq.Status.CurrentPodTemplateHash = ptr.To(pclq.Status.UpdateProgress.PodTemplateHash) pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To(pclq.Status.UpdateProgress.PodCliqueSetGenerationHash) } return nil } +func isPodCliqueTemplateHashCurrent(pclq *grovecorev1alpha1.PodClique, expectedPodTemplateHashes componentutils.HashCandidates) bool { + labelPodTemplateHash, ok := pclq.Labels[apicommon.LabelPodTemplateHash] + if !ok || !expectedPodTemplateHashes.Matches(labelPodTemplateHash) { + return false + } + return pclq.Status.CurrentPodTemplateHash == nil || + expectedPodTemplateHashes.Matches(*pclq.Status.CurrentPodTemplateHash) +} + // mutateReplicas updates the PodClique status with current replica counts based on pod categorization func mutateReplicas(pclq *grovecorev1alpha1.PodClique, podCategories map[corev1.PodConditionType][]*corev1.Pod, numExistingPods int) { // mutate the PCLQ status with current number of schedule gated, ready pods and updated pods. @@ -140,23 +159,35 @@ func mutateReplicas(pclq *grovecorev1alpha1.PodClique, podCategories map[corev1. } // mutateUpdatedReplica calculates and sets the number of pods with the expected template hash -func mutateUpdatedReplica(pclq *grovecorev1alpha1.PodClique, existingPods []*corev1.Pod) { - var expectedPodTemplateHash string +func mutateUpdatedReplica(pcs *grovecorev1alpha1.PodCliqueSet, pclq *grovecorev1alpha1.PodClique, existingPods []*corev1.Pod) { + var expectedPodTemplateHashes componentutils.HashCandidates // If UpdateProgress exists (update in progress or recently completed), use the target hash from it. // This covers both the active update phase and the window after completion before CurrentPodTemplateHash is synced. if pclq.Status.UpdateProgress != nil { - expectedPodTemplateHash = pclq.Status.UpdateProgress.PodTemplateHash + expectedPodTemplateHashes = componentutils.HashCandidates{ + Canonical: pclq.Status.UpdateProgress.PodTemplateHash, + Legacy: pclq.Status.UpdateProgress.PodTemplateHash, + } } else if pclq.Status.CurrentPodTemplateHash != nil { // Steady state: no rolling update tracking exists. // Use the stable current hash for pods that have been reconciled. - expectedPodTemplateHash = *pclq.Status.CurrentPodTemplateHash + expectedPodTemplateHashes = componentutils.HashCandidates{ + Canonical: *pclq.Status.CurrentPodTemplateHash, + Legacy: *pclq.Status.CurrentPodTemplateHash, + } + } + if expectedPodTemplateHashes.Canonical != "" && pcs != nil { + currentDesiredHashes, err := componentutils.GetExpectedPCLQPodTemplateHashCandidates(pcs, pclq.ObjectMeta) + if err == nil && currentDesiredHashes.Matches(expectedPodTemplateHashes.Canonical) { + expectedPodTemplateHashes = currentDesiredHashes + } } // If expectedPodTemplateHash is empty, it means that the PCLQ has never been successfully reconciled and therefore no pods should be considered as updated. // This prevents incorrectly marking all existing pods as updated when the PCLQ is first created. // Once the PCLQ is successfully reconciled, the expectedPodTemplateHash will be set and the updated replicas can be calculated correctly. - if expectedPodTemplateHash != "" { + if expectedPodTemplateHashes.Canonical != "" { updatedReplicas := lo.Reduce(existingPods, func(agg int, pod *corev1.Pod, _ int) int { - if pod.Labels[apicommon.LabelPodTemplateHash] == expectedPodTemplateHash { + if expectedPodTemplateHashes.Matches(pod.Labels[apicommon.LabelPodTemplateHash]) { return agg + 1 } return agg diff --git a/operator/internal/controller/podclique/reconcilestatus_test.go b/operator/internal/controller/podclique/reconcilestatus_test.go index 132ef8c57..67fccb87b 100644 --- a/operator/internal/controller/podclique/reconcilestatus_test.go +++ b/operator/internal/controller/podclique/reconcilestatus_test.go @@ -24,8 +24,11 @@ import ( "github.com/ai-dynamo/grove/operator/api/common/constants" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" internalconstants "github.com/ai-dynamo/grove/operator/internal/constants" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" + "github.com/go-logr/logr" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/record" @@ -176,7 +179,7 @@ func TestMutateUpdatedReplica(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Call the function - mutateUpdatedReplica(tt.pclq, tt.existingPods) + mutateUpdatedReplica(nil, tt.pclq, tt.existingPods) // Assert the result assert.Equal(t, tt.expectedUpdatedReplicas, tt.pclq.Status.UpdatedReplicas, @@ -185,6 +188,180 @@ func TestMutateUpdatedReplica(t *testing.T) { } } +// TestMutateUpdatedReplicaCountsCanonicalAndLegacyCurrentPodLabels verifies that +// mutateUpdatedReplica treats both the canonical and legacy pod-template-hash +// values as "current" when counting UpdatedReplicas. This protects in-flight +// rollouts during a hash-format migration: pods labeled with either hash variant +// of the current template are counted as updated, while pods with an unrelated +// (stale) hash are not. +func TestMutateUpdatedReplicaCountsCanonicalAndLegacyCurrentPodLabels(t *testing.T) { + template := &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "worker", + Spec: grovecorev1alpha1.PodCliqueSpec{ + PodSpec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "sidecar", Image: "sidecar:v1"}, + {Name: "main", Image: "main:v1"}, + }, + }, + }, + } + hashes := componentutils.ComputePCLQPodTemplateHashCandidates(template, "") + require.NotEqual(t, hashes.Canonical, hashes.Legacy, "test must exercise canonical/legacy divergence") + + pcs := &grovecorev1alpha1.PodCliqueSet{ + ObjectMeta: metav1.ObjectMeta{Name: "pcs", Namespace: "default"}, + Spec: grovecorev1alpha1.PodCliqueSetSpec{ + Template: grovecorev1alpha1.PodCliqueSetTemplateSpec{ + Cliques: []*grovecorev1alpha1.PodCliqueTemplateSpec{template}, + }, + }, + } + pclq := &grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pcs-0-worker", + Namespace: "default", + Labels: map[string]string{ + apicommon.LabelPartOfKey: "pcs", + apicommon.LabelPodCliqueSetReplicaIndex: "0", + }, + }, + Status: grovecorev1alpha1.PodCliqueStatus{ + CurrentPodTemplateHash: ptr.To(hashes.Canonical), + }, + } + + mutateUpdatedReplica(pcs, pclq, []*corev1.Pod{ + createPodWithHash("pod-canonical", hashes.Canonical), + createPodWithHash("pod-legacy", hashes.Legacy), + createPodWithHash("pod-stale", "stale-hash"), + }) + assert.Equal(t, int32(2), pclq.Status.UpdatedReplicas) +} + +// TestMutateCurrentHashesAdvancesGenerationWhenTemplateHashIsCurrent verifies that +// once a PodClique has fully converged on the current template (Replicas == +// UpdatedReplicas) and its CurrentPodTemplateHash matches the template — even +// via the legacy hash — mutateCurrentHashes advances both the +// CurrentPodTemplateHash and the CurrentPodCliqueSetGenerationHash to their +// canonical values, completing the migration off of legacy hashes. +func TestMutateCurrentHashesAdvancesGenerationWhenTemplateHashIsCurrent(t *testing.T) { + pcs, pclq, templateHashes, generationHashes := newPodCliqueHashConvergenceFixture() + oldGenerationHash := "old-generation-hash" + pclq.Status.CurrentPodTemplateHash = ptr.To(templateHashes.Legacy) + pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To(oldGenerationHash) + pclq.Status.Replicas = 2 + pclq.Status.UpdatedReplicas = 2 + + err := mutateCurrentHashes(logr.Discard(), pcs, pclq) + + require.NoError(t, err) + assert.Equal(t, templateHashes.Canonical, *pclq.Status.CurrentPodTemplateHash) + assert.Equal(t, generationHashes.Canonical, *pclq.Status.CurrentPodCliqueSetGenerationHash) +} + +// TestMutateCurrentHashesDoesNotAdvanceWhenTemplateHashIsStale verifies that +// mutateCurrentHashes refuses to advance CurrentPodTemplateHash or +// CurrentPodCliqueSetGenerationHash when the PodClique has not actually +// converged on the current PodCliqueSet template, even though the replica +// counts (Replicas == UpdatedReplicas) would superficially suggest it has. +// +// "Stale" here means a hash value that matches neither the canonical nor the +// legacy form of the expected pod-template hash for the current PCS template +// (i.e. it is left over from some prior, no-longer-current template). Because +// convergence is established by checking both the pod-template-hash label on +// the PodClique and Status.CurrentPodTemplateHash against those expected +// canonical/legacy hashes, a stale value in either field must block the +// advance. The two table cases exercise exactly those inputs: +// +// - "stale label": the PodClique's pod-template-hash label is a stale value +// while Status.CurrentPodTemplateHash is unset. Convergence must fail on +// the label check, so neither the template hash nor the generation hash +// may move to canonical. +// - "stale current template hash": the label is the current canonical hash +// (so the label check passes), but Status.CurrentPodTemplateHash already +// holds a stale value from a prior template. Convergence must fail on the +// status check, the stale Status.CurrentPodTemplateHash must be preserved +// as-is (not overwritten with the canonical hash), and the generation +// hash must not advance. +func TestMutateCurrentHashesDoesNotAdvanceWhenTemplateHashIsStale(t *testing.T) { + tests := []struct { + name string + labelPodTemplateHash string + currentPodTemplateHash string + wantPCSGenerationHash string + wantCurrentTemplateHash string + }{ + { + name: "stale label", + labelPodTemplateHash: "stale-template-hash", + currentPodTemplateHash: "", + wantPCSGenerationHash: "old-generation-hash", + wantCurrentTemplateHash: "", + }, + { + name: "stale current template hash", + labelPodTemplateHash: "", + currentPodTemplateHash: "stale-template-hash", + wantPCSGenerationHash: "old-generation-hash", + wantCurrentTemplateHash: "stale-template-hash", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pcs, pclq, templateHashes, _ := newPodCliqueHashConvergenceFixture() + if tt.labelPodTemplateHash == "" { + tt.labelPodTemplateHash = templateHashes.Canonical + } + pclq.Labels[apicommon.LabelPodTemplateHash] = tt.labelPodTemplateHash + pclq.Status.CurrentPodTemplateHash = ptr.To(tt.currentPodTemplateHash) + pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To(tt.wantPCSGenerationHash) + pclq.Status.Replicas = 2 + pclq.Status.UpdatedReplicas = 2 + + err := mutateCurrentHashes(logr.Discard(), pcs, pclq) + + require.NoError(t, err) + assert.Equal(t, tt.wantCurrentTemplateHash, *pclq.Status.CurrentPodTemplateHash) + assert.Equal(t, tt.wantPCSGenerationHash, *pclq.Status.CurrentPodCliqueSetGenerationHash) + }) + } +} + +func newPodCliqueHashConvergenceFixture() (*grovecorev1alpha1.PodCliqueSet, *grovecorev1alpha1.PodClique, componentutils.HashCandidates, componentutils.HashCandidates) { + template := &grovecorev1alpha1.PodCliqueTemplateSpec{ + Name: "worker", + Spec: grovecorev1alpha1.PodCliqueSpec{ + PodSpec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1"}}, + }, + }, + } + pcs := &grovecorev1alpha1.PodCliqueSet{ + ObjectMeta: metav1.ObjectMeta{Name: "pcs", Namespace: "default"}, + Spec: grovecorev1alpha1.PodCliqueSetSpec{ + Template: grovecorev1alpha1.PodCliqueSetTemplateSpec{ + Cliques: []*grovecorev1alpha1.PodCliqueTemplateSpec{template}, + }, + }, + } + templateHashes := componentutils.ComputePCLQPodTemplateHashCandidates(template, "") + generationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) + pcs.Status.CurrentGenerationHash = ptr.To(generationHashes.Canonical) + pclq := &grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pcs-0-worker", + Namespace: "default", + Labels: map[string]string{ + apicommon.LabelPartOfKey: "pcs", + apicommon.LabelPodCliqueSetReplicaIndex: "0", + apicommon.LabelPodTemplateHash: templateHashes.Canonical, + }, + }, + } + return pcs, pclq, templateHashes, generationHashes +} + // createPodWithHash creates a test pod with the specified template hash label func createPodWithHash(name string, templateHash string) *corev1.Pod { return &corev1.Pod{ diff --git a/operator/internal/controller/podclique/register.go b/operator/internal/controller/podclique/register.go index 12d791ba6..05073841e 100644 --- a/operator/internal/controller/podclique/register.go +++ b/operator/internal/controller/podclique/register.go @@ -220,12 +220,31 @@ func podCliqueSetPredicate() predicate.Predicate { if !okOld || !okNew { return false } - return oldPCS.Status.CurrentGenerationHash != newPCS.Status.CurrentGenerationHash + return !stringPointersEqual(oldPCS.Status.CurrentGenerationHash, newPCS.Status.CurrentGenerationHash) || + pcsCurrentlyUpdatingReplicaChanged(oldPCS.Status.UpdateProgress, newPCS.Status.UpdateProgress) }, GenericFunc: func(_ event.GenericEvent) bool { return false }, } } +// pcsCurrentlyUpdatingReplicaChanged reports whether the replica currently being updated has changed between the old and new PodCliqueSet update progress. +func pcsCurrentlyUpdatingReplicaChanged(oldProgress, newProgress *grovecorev1alpha1.PodCliqueSetUpdateProgress) bool { + oldReplicaIndex, oldOK := currentPCSReplicaInUpdate(oldProgress) + newReplicaIndex, newOK := currentPCSReplicaInUpdate(newProgress) + if oldOK != newOK { + return true + } + return oldOK && oldReplicaIndex != newReplicaIndex +} + +// currentPCSReplicaInUpdate returns the replica index of the PodCliqueSet replica currently being updated, if any. +func currentPCSReplicaInUpdate(progress *grovecorev1alpha1.PodCliqueSetUpdateProgress) (int32, bool) { + if progress == nil || len(progress.CurrentlyUpdating) == 0 { + return 0, false + } + return progress.CurrentlyUpdating[0].ReplicaIndex, true +} + // mapPodCliqueScalingGroupToPCLQs maps a PodCliqueScalingGroup to one or more reconcile.Request(s) to its constituent PodCliques. // These events are needed to keep the PodClique.Status.CurrentPodCliqueSetGenerationHash in sync with the PodCliqueSet. func mapPodCliqueScalingGroupToPCLQs() handler.MapFunc { @@ -254,13 +273,38 @@ func podCliqueScalingGroupPredicate() predicate.Predicate { if !okOld || !okNew { return false } - return oldPCSG.Status.CurrentPodCliqueSetGenerationHash != nil && newPCSG.Status.UpdateProgress != nil && - *oldPCSG.Status.CurrentPodCliqueSetGenerationHash != newPCSG.Status.UpdateProgress.PodCliqueSetGenerationHash + return !stringPointersEqual(oldPCSG.Status.CurrentPodCliqueSetGenerationHash, newPCSG.Status.CurrentPodCliqueSetGenerationHash) || + pcsgUpdateTargetGenerationChanged(oldPCSG.Status.UpdateProgress, newPCSG.Status.UpdateProgress) }, GenericFunc: func(_ event.GenericEvent) bool { return false }, } } +// pcsgUpdateTargetGenerationChanged reports whether the PodCliqueScalingGroup update's target PodCliqueSet generation hash has changed between the old and new update progress. +func pcsgUpdateTargetGenerationChanged(oldProgress, newProgress *grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress) bool { + oldTarget, oldOK := pcsgUpdateTargetGeneration(oldProgress) + newTarget, newOK := pcsgUpdateTargetGeneration(newProgress) + if oldOK != newOK { + return true + } + return oldOK && oldTarget != newTarget +} + +// pcsgUpdateTargetGeneration returns the target PodCliqueSet generation hash for an in-progress PodCliqueScalingGroup update, if any. +func pcsgUpdateTargetGeneration(progress *grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress) (string, bool) { + if progress == nil { + return "", false + } + return progress.PodCliqueSetGenerationHash, true +} + +func stringPointersEqual(oldValue, newValue *string) bool { + if oldValue == nil || newValue == nil { + return oldValue == newValue + } + return *oldValue == *newValue +} + // mapPodGangToPCLQs maps a PodGang to one or more reconcile.Request(s) for its constituent PodClique's. func mapPodGangToPCLQs() handler.MapFunc { return func(_ context.Context, obj client.Object) []reconcile.Request { diff --git a/operator/internal/controller/podclique/register_test.go b/operator/internal/controller/podclique/register_test.go index 333c79bd3..f04dde5fc 100644 --- a/operator/internal/controller/podclique/register_test.go +++ b/operator/internal/controller/podclique/register_test.go @@ -77,6 +77,127 @@ func TestPodPredicate_Delete(t *testing.T) { }) } +// TestPodCliqueSetPredicateCurrentlyUpdatingReplicaChanges verifies that the PodCliqueSet +// watch predicate enqueues PodClique reconciles when the replica currently being rolled out +// changes. The predicate intentionally ignores most PodCliqueSet updates to avoid reconcile +// storms, so it must still fire when CurrentlyUpdating starts, stops, or shifts to a different +// replica index, and must stay quiet when the in-progress replica is unchanged. +func TestPodCliqueSetPredicateCurrentlyUpdatingReplicaChanges(t *testing.T) { + pred, ok := podCliqueSetPredicate().(predicate.Funcs) + require.True(t, ok, "predicate must be predicate.Funcs") + + tests := []struct { + name string + oldProgress *grovecorev1alpha1.PodCliqueSetUpdateProgress + newProgress *grovecorev1alpha1.PodCliqueSetUpdateProgress + want bool + }{ + { + name: "currently updating starts", + newProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{ + CurrentlyUpdating: []grovecorev1alpha1.PodCliqueSetReplicaUpdateProgress{{ReplicaIndex: 0}}, + }, + want: true, + }, + { + name: "currently updating clears", + oldProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{ + CurrentlyUpdating: []grovecorev1alpha1.PodCliqueSetReplicaUpdateProgress{{ReplicaIndex: 0}}, + }, + newProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{}, + want: true, + }, + { + name: "currently updating moves", + oldProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{ + CurrentlyUpdating: []grovecorev1alpha1.PodCliqueSetReplicaUpdateProgress{{ReplicaIndex: 0}}, + }, + newProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{ + CurrentlyUpdating: []grovecorev1alpha1.PodCliqueSetReplicaUpdateProgress{{ReplicaIndex: 1}}, + }, + want: true, + }, + { + name: "currently updating unchanged", + oldProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{ + CurrentlyUpdating: []grovecorev1alpha1.PodCliqueSetReplicaUpdateProgress{{ReplicaIndex: 0}}, + }, + newProgress: &grovecorev1alpha1.PodCliqueSetUpdateProgress{ + CurrentlyUpdating: []grovecorev1alpha1.PodCliqueSetReplicaUpdateProgress{{ReplicaIndex: 0}}, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := pred.UpdateFunc(event.UpdateEvent{ + ObjectOld: &grovecorev1alpha1.PodCliqueSet{Status: grovecorev1alpha1.PodCliqueSetStatus{CurrentGenerationHash: ptr.To("generation"), UpdateProgress: tt.oldProgress}}, + ObjectNew: &grovecorev1alpha1.PodCliqueSet{Status: grovecorev1alpha1.PodCliqueSetStatus{CurrentGenerationHash: ptr.To("generation"), UpdateProgress: tt.newProgress}}, + }) + assert.Equal(t, tt.want, got) + }) + } +} + +// TestPodCliqueScalingGroupPredicateGenerationStatusChanges verifies that the +// PodCliqueScalingGroup watch predicate triggers PodClique reconciles when the PCSG's view +// of the PodCliqueSet generation changes during a rolling update. PodCliques rely on this +// signal to keep Status.CurrentPodCliqueSetGenerationHash in sync, so the predicate must +// fire on changes to either the current generation hash or the in-progress update target, +// and must stay quiet when both are unchanged. +func TestPodCliqueScalingGroupPredicateGenerationStatusChanges(t *testing.T) { + pred, ok := podCliqueScalingGroupPredicate().(predicate.Funcs) + require.True(t, ok, "predicate must be predicate.Funcs") + + tests := []struct { + name string + oldPCSG *grovecorev1alpha1.PodCliqueScalingGroup + newPCSG *grovecorev1alpha1.PodCliqueScalingGroup + want bool + }{ + { + name: "current generation changes", + oldPCSG: &grovecorev1alpha1.PodCliqueScalingGroup{Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + CurrentPodCliqueSetGenerationHash: ptr.To("old-generation"), + }}, + newPCSG: &grovecorev1alpha1.PodCliqueScalingGroup{Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + CurrentPodCliqueSetGenerationHash: ptr.To("new-generation"), + }}, + want: true, + }, + { + name: "update target generation changes", + oldPCSG: &grovecorev1alpha1.PodCliqueScalingGroup{Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + UpdateProgress: &grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress{PodCliqueSetGenerationHash: "old-generation"}, + }}, + newPCSG: &grovecorev1alpha1.PodCliqueScalingGroup{Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + UpdateProgress: &grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress{PodCliqueSetGenerationHash: "new-generation"}, + }}, + want: true, + }, + { + name: "generation status unchanged", + oldPCSG: &grovecorev1alpha1.PodCliqueScalingGroup{Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + CurrentPodCliqueSetGenerationHash: ptr.To("generation"), + UpdateProgress: &grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress{PodCliqueSetGenerationHash: "generation"}, + }}, + newPCSG: &grovecorev1alpha1.PodCliqueScalingGroup{Status: grovecorev1alpha1.PodCliqueScalingGroupStatus{ + CurrentPodCliqueSetGenerationHash: ptr.To("generation"), + UpdateProgress: &grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress{PodCliqueSetGenerationHash: "generation"}, + }}, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := pred.UpdateFunc(event.UpdateEvent{ObjectOld: tt.oldPCSG, ObjectNew: tt.newPCSG}) + assert.Equal(t, tt.want, got) + }) + } +} + // Test_isMarkedForDeletion tests if a deletion timestamp is set on the pod func Test_isMarkedForDeletion(t *testing.T) { now := ptr.To(metav1.Now()) diff --git a/operator/internal/controller/podcliquescalinggroup/components/podclique/podclique_test.go b/operator/internal/controller/podcliquescalinggroup/components/podclique/podclique_test.go index 45d5d9b35..a18a59ed0 100644 --- a/operator/internal/controller/podcliquescalinggroup/components/podclique/podclique_test.go +++ b/operator/internal/controller/podcliquescalinggroup/components/podclique/podclique_test.go @@ -26,6 +26,7 @@ import ( grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" groveclientscheme "github.com/ai-dynamo/grove/operator/internal/client" "github.com/ai-dynamo/grove/operator/internal/constants" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" "github.com/ai-dynamo/grove/operator/internal/mnnvl" "github.com/go-logr/logr" @@ -236,6 +237,47 @@ func TestGetPCSReplicaFromPCSG(t *testing.T) { } } +// TestIsReplicaUpdatedAcceptsLegacyCurrentHashesAndRejectsStale verifies that isReplicaUpdated +// treats a PCSG replica as up-to-date when its PodCliques carry either the canonical or the legacy +// pod-template hash form for the current spec, and still flags the replica as stale when any +// PodClique's hash matches neither candidate. This guards the legacy hash migration path so that +// PCLQs labeled with the pre-migration hash are not needlessly rolled. +func TestIsReplicaUpdatedAcceptsLegacyCurrentHashesAndRejectsStale(t *testing.T) { + expectedHashes := map[string]componentutils.HashCandidates{ + "pcsg-0-frontend": {Canonical: "frontend-canonical", Legacy: "frontend-legacy"}, + "pcsg-0-worker": {Canonical: "worker-canonical", Legacy: "worker-legacy"}, + } + + current, err := isReplicaUpdated(expectedHashes, []grovecorev1alpha1.PodClique{ + podCliqueWithTemplateHash("pcsg-0-frontend", "frontend-legacy"), + podCliqueWithTemplateHash("pcsg-0-worker", "worker-canonical"), + }) + require.NoError(t, err) + assert.True(t, current, "legacy-current PCLQ labels should not make a PCSG replica look stale") + + stale, err := isReplicaUpdated(expectedHashes, []grovecorev1alpha1.PodClique{ + podCliqueWithTemplateHash("pcsg-0-frontend", "frontend-legacy"), + podCliqueWithTemplateHash("pcsg-0-worker", "worker-stale"), + }) + require.NoError(t, err) + assert.False(t, stale, "hashes matching neither current canonical nor current legacy must remain stale") +} + +// podCliqueWithTemplateHash builds a minimal PodClique fixture identified by name and stamped with +// the given value on the apicommon.LabelPodTemplateHash label. It is intended for tests of +// hash-comparison logic (e.g. isReplicaUpdated) where only the pclq name and template-hash label +// are relevant. +func podCliqueWithTemplateHash(name, hash string) grovecorev1alpha1.PodClique { + return grovecorev1alpha1.PodClique{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + apicommon.LabelPodTemplateHash: hash, + }, + }, + } +} + // TestGetPodCliqueSelectorLabels tests generating selector labels for PodCliques func TestGetPodCliqueSelectorLabels(t *testing.T) { tests := []struct { diff --git a/operator/internal/controller/podcliquescalinggroup/components/podclique/rollingupdate.go b/operator/internal/controller/podcliquescalinggroup/components/podclique/rollingupdate.go index fd71cc045..0dfdf8cae 100644 --- a/operator/internal/controller/podcliquescalinggroup/components/podclique/rollingupdate.go +++ b/operator/internal/controller/podcliquescalinggroup/components/podclique/rollingupdate.go @@ -150,8 +150,12 @@ func (r _resource) markRollingUpdateEnd(ctx context.Context, logger logr.Logger, fmt.Sprintf("failed to mark end of rolling update in status of PodCliqueScalingGroup: %v", client.ObjectKeyFromObject(pcsg)), ) } - logger.Info("Marked the end of rolling update for PodCliqueScalingGroup") - return nil + logger.Info("Marked the end of rolling update of PodCliqueScalingGroup") + return groveerr.New( + groveerr.ErrCodeContinueReconcileAndRequeue, + component.OperationSync, + fmt.Sprintf("rolling update of PodCliqueScalingGroup %v has ended, requeuing for status convergence", client.ObjectKeyFromObject(pcsg)), + ) } // computePendingUpdateWork analyzes existing replicas and categorizes them by update status and availability state @@ -169,7 +173,7 @@ func computePendingUpdateWork(sc *syncContext) (*updateWork, error) { sc.pcsg.Status.UpdateProgress.ReadyReplicaIndicesSelectedToUpdate.Current == int32(pcsgReplicaIndex) { continue } - isUpdated, err := isReplicaUpdated(sc.expectedPCLQPodTemplateHashMap, existingPCSGReplicaPCLQs) + isUpdated, err := isReplicaUpdated(sc.expectedPCLQPodTemplateHashes, existingPCSGReplicaPCLQs) if err != nil { return nil, err } @@ -215,20 +219,20 @@ func isCurrentReplicaUpdateComplete(sc *syncContext) bool { return false } return lo.EveryBy(existingPCSGReplicaPCLQs, func(pclq grovecorev1alpha1.PodClique) bool { - return pclq.Status.CurrentPodTemplateHash != nil && *pclq.Status.CurrentPodTemplateHash == sc.expectedPCLQPodTemplateHashMap[pclq.Name] && - pclq.Status.CurrentPodCliqueSetGenerationHash != nil && *pclq.Status.CurrentPodCliqueSetGenerationHash == *sc.pcs.Status.CurrentGenerationHash && + expectedPodTemplateHashes := sc.expectedPCLQPodTemplateHashes[pclq.Name] + return pclq.Status.CurrentPodTemplateHash != nil && expectedPodTemplateHashes.Matches(*pclq.Status.CurrentPodTemplateHash) && pclq.Status.ReadyReplicas >= *pclq.Spec.MinAvailable }) } // isReplicaUpdated checks if all PodCliques in a PCSG replica have the expected pod template hash -func isReplicaUpdated(expectedPCLQPodTemplateHashes map[string]string, pcsgReplicaPCLQs []grovecorev1alpha1.PodClique) (bool, error) { +func isReplicaUpdated(expectedPCLQPodTemplateHashes map[string]componentutils.HashCandidates, pcsgReplicaPCLQs []grovecorev1alpha1.PodClique) (bool, error) { for _, pclq := range pcsgReplicaPCLQs { podTemplateHash, ok := pclq.Labels[apicommon.LabelPodTemplateHash] if !ok { return false, groveerr.ErrMissingPodTemplateHashLabel } - if podTemplateHash != expectedPCLQPodTemplateHashes[pclq.Name] { + if !expectedPCLQPodTemplateHashes[pclq.Name].Matches(podTemplateHash) { return false, nil } } diff --git a/operator/internal/controller/podcliquescalinggroup/components/podclique/sync.go b/operator/internal/controller/podcliquescalinggroup/components/podclique/sync.go index 78d7d2d51..142bea3ed 100644 --- a/operator/internal/controller/podcliquescalinggroup/components/podclique/sync.go +++ b/operator/internal/controller/podcliquescalinggroup/components/podclique/sync.go @@ -51,6 +51,7 @@ type syncContext struct { pcsgIndicesToRequeue []string expectedPCLQFQNsPerPCSGReplica map[int][]string expectedPCLQPodTemplateHashMap map[string]string + expectedPCLQPodTemplateHashes map[string]componentutils.HashCandidates } // prepareSyncContext creates and initializes the synchronization context with all necessary data for PCSG reconciliation @@ -94,7 +95,13 @@ func (r _resource) prepareSyncContext(ctx context.Context, logger logr.Logger, p syncCtx.pcsgIndicesToTerminate, syncCtx.pcsgIndicesToRequeue = getMinAvailableBreachedPCSGIndices(logger, syncCtx.existingPCLQs, syncCtx.pcs.Spec.Template.TerminationDelay.Duration) // pre-compute expected PodTemplateHash for each PCLQ - syncCtx.expectedPCLQPodTemplateHashMap = getExpectedPCLQPodTemplateHashMap(syncCtx.pcs, pcsg) + syncCtx.expectedPCLQPodTemplateHashes = getExpectedPCLQPodTemplateHashCandidatesMap(syncCtx.pcs, pcsg) + syncCtx.expectedPCLQPodTemplateHashMap = lo.MapValues(syncCtx.expectedPCLQPodTemplateHashes, func(candidates componentutils.HashCandidates, _ string) string { + return candidates.Canonical + }) + if err = r.migrateLegacyCurrentPodCliqueLabels(ctx, syncCtx); err != nil { + return nil, err + } return syncCtx, nil } @@ -335,16 +342,40 @@ func (r _resource) getExistingPCLQs(ctx context.Context, pcsg *grovecorev1alpha1 return existingPCLQs, nil } -// getExpectedPCLQPodTemplateHashMap computes the expected pod template hash for each PodClique in the PCSG -func getExpectedPCLQPodTemplateHashMap(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup) map[string]string { - pclqFQNToHash := make(map[string]string) +// migrateLegacyCurrentPodCliqueLabels patches existing PodCliques whose pod-template-hash label matches a legacy hash, updating it in place to the current canonical hash. +func (r _resource) migrateLegacyCurrentPodCliqueLabels(ctx context.Context, sc *syncContext) error { + for i := range sc.existingPCLQs { + pclq := &sc.existingPCLQs[i] + hashCandidates, ok := sc.expectedPCLQPodTemplateHashes[pclq.Name] + if !ok || !hashCandidates.IsLegacy(pclq.Labels[apicommon.LabelPodTemplateHash]) { + continue + } + patch := client.MergeFrom(pclq.DeepCopy()) + if pclq.Labels == nil { + pclq.Labels = make(map[string]string, 1) + } + pclq.Labels[apicommon.LabelPodTemplateHash] = hashCandidates.Canonical + if err := r.client.Patch(ctx, pclq, patch); err != nil { + return groveerr.WrapError(err, + errCodeCreateOrUpdatePodCliques, + component.OperationSync, + fmt.Sprintf("failed to migrate legacy PodClique label for %v", client.ObjectKeyFromObject(pclq)), + ) + } + } + return nil +} + +// getExpectedPCLQPodTemplateHashCandidatesMap returns a map keyed by the fully-qualified PodClique name to the expected pod-template hash candidates (canonical and legacy) for every replica of the given PodCliqueScalingGroup. +func getExpectedPCLQPodTemplateHashCandidatesMap(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup) map[string]componentutils.HashCandidates { + pclqFQNToHash := make(map[string]componentutils.HashCandidates) pcsgPCLQNames := pcsg.Spec.CliqueNames for _, pcsgCliqueName := range pcsgPCLQNames { pclqTemplateSpec := componentutils.FindPodCliqueTemplateSpecByName(pcs, pcsgCliqueName) if pclqTemplateSpec == nil { continue } - podTemplateHash := componentutils.ComputePCLQPodTemplateHash(pclqTemplateSpec, pcs.Spec.Template.PriorityClassName) + podTemplateHash := componentutils.ComputePCLQPodTemplateHashCandidates(pclqTemplateSpec, pcs.Spec.Template.PriorityClassName) for pcsgReplicaIndex := range int(pcsg.Spec.Replicas) { cliqueFQN := apicommon.GeneratePodCliqueName(apicommon.ResourceNameReplica{ Name: pcsg.Name, diff --git a/operator/internal/controller/podcliquescalinggroup/reconcilespec.go b/operator/internal/controller/podcliquescalinggroup/reconcilespec.go index 659923c6c..c829ba5a2 100644 --- a/operator/internal/controller/podcliquescalinggroup/reconcilespec.go +++ b/operator/internal/controller/podcliquescalinggroup/reconcilespec.go @@ -113,10 +113,17 @@ func (r *Reconciler) processUpdate(ctx context.Context, logger logr.Logger, pcsg // shouldResetOrTriggerUpdate determines if a rolling update should be initiated based on generation hash changes func shouldResetOrTriggerUpdate(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup) bool { + if pcs.Status.CurrentGenerationHash == nil { + return false + } + pcsGenerationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) + matchesCurrentPCSGeneration := func(hash string) bool { + return hash == *pcs.Status.CurrentGenerationHash || pcsGenerationHashCandidates.Matches(hash) + } + // If processing of rolling update of PCSG for PCS CurrentGenerationHash is either completed or in-progress, // there is no need to reset or trigger another rolling update of this PCSG for the same PCS CurrentGenerationHash. - if pcsg.Status.UpdateProgress != nil && pcs.Status.CurrentGenerationHash != nil && - pcsg.Status.UpdateProgress.PodCliqueSetGenerationHash == *pcs.Status.CurrentGenerationHash { + if pcsg.Status.UpdateProgress != nil && matchesCurrentPCSGeneration(pcsg.Status.UpdateProgress.PodCliqueSetGenerationHash) { return false } return true diff --git a/operator/internal/controller/podcliquescalinggroup/reconcilestatus.go b/operator/internal/controller/podcliquescalinggroup/reconcilestatus.go index 4682478be..3e3f7b5a5 100644 --- a/operator/internal/controller/podcliquescalinggroup/reconcilestatus.go +++ b/operator/internal/controller/podcliquescalinggroup/reconcilestatus.go @@ -74,7 +74,7 @@ func (r *Reconciler) reconcileStatus(ctx context.Context, logger logr.Logger, pc // Replica-index strays (idx >= Spec.Replicas) are also dropped for hygiene, though // mutateReplicas already ignores them via its [0, Spec.Replicas) loop bounds. pclqsPerPCSGReplica = pruneStrayPCSGPCLQs(pcsg, pclqsPerPCSGReplica) - mutateReplicas(logger, pcs.Status.CurrentGenerationHash, pcsg, pclqsPerPCSGReplica) + mutateReplicas(logger, pcs, pcsg, pclqsPerPCSGReplica) mutateMinAvailableBreachedCondition(logger, pcsg, pclqsPerPCSGReplica) r.emitAllScheduledReplicasLostIfNeeded(pcsg, originalStatus.ScheduledReplicas) @@ -108,14 +108,15 @@ func (r *Reconciler) reconcileStatus(ctx context.Context, logger logr.Logger, pc // It also derives child-PCLQ update progress counts when an update is in flight. The iteration is bounded to // expected replica indexes [0, Spec.Replicas) — the caller has already pruned stray children — so counters stay // consistent with the spec-derived totals during scale-down. -func mutateReplicas(logger logr.Logger, currentPCSGenerationHash *string, pcsg *grovecorev1alpha1.PodCliqueScalingGroup, pclqsPerPCSGReplica map[string][]grovecorev1alpha1.PodClique) { +func mutateReplicas(logger logr.Logger, pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup, pclqsPerPCSGReplica map[string][]grovecorev1alpha1.PodClique) { pcsg.Status.Replicas = pcsg.Spec.Replicas var scheduledReplicas, availableReplicas, updatedReplicas, updatedPCLQs, totalPCLQs int32 cliqueNamesPerReplica := int32(len(pcsg.Spec.CliqueNames)) + currentPCSGenerationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) for replicaIndex := 0; replicaIndex < int(pcsg.Spec.Replicas); replicaIndex++ { pcsgReplicaIndex := strconv.Itoa(replicaIndex) pclqs := pclqsPerPCSGReplica[pcsgReplicaIndex] - isScheduled, isAvailable, isUpdated := computeReplicaStatus(logger, currentPCSGenerationHash, pcsgReplicaIndex, len(pcsg.Spec.CliqueNames), pclqs) + isScheduled, isAvailable, isUpdated := computeReplicaStatus(logger, currentPCSGenerationHashes, pcs.Status.CurrentGenerationHash, pcsgReplicaIndex, len(pcsg.Spec.CliqueNames), pclqs) if isScheduled { scheduledReplicas++ } @@ -125,7 +126,7 @@ func mutateReplicas(logger logr.Logger, currentPCSGenerationHash *string, pcsg * if isUpdated { updatedReplicas++ } - updatedPCLQs += countPCSGReplicaUpdatedPCLQs(currentPCSGenerationHash, pclqs) + updatedPCLQs += countPCSGReplicaUpdatedPCLQs(currentPCSGenerationHashes, pcs.Status.CurrentGenerationHash, pclqs) } totalPCLQs = pcsg.Spec.Replicas * cliqueNamesPerReplica logger.Info("Mutating PodCliqueScalingGroup replicas", @@ -143,8 +144,8 @@ func mutateReplicas(logger logr.Logger, currentPCSGenerationHash *string, pcsg * // countPCSGReplicaUpdatedPCLQs counts non-terminating PCLQs in a PCSG replica whose generation // hash matches the parent PCS hash. -func countPCSGReplicaUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1alpha1.PodClique) int32 { - if pcsGenerationHash == nil { +func countPCSGReplicaUpdatedPCLQs(pcsGenerationHashes componentutils.HashCandidates, currentPCSGenerationHash *string, pclqs []grovecorev1alpha1.PodClique) int32 { + if currentPCSGenerationHash == nil { return 0 } var n int32 @@ -154,7 +155,7 @@ func countPCSGReplicaUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1 continue } if pclq.Status.CurrentPodCliqueSetGenerationHash != nil && - *pclq.Status.CurrentPodCliqueSetGenerationHash == *pcsGenerationHash { + pclqGenerationHashMatchesCurrent(pcsGenerationHashes, currentPCSGenerationHash, *pclq.Status.CurrentPodCliqueSetGenerationHash) { n++ } } @@ -162,7 +163,7 @@ func countPCSGReplicaUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1 } // computeReplicaStatus processes a single PodCliqueScalingGroup replica and returns whether it is scheduled and available. -func computeReplicaStatus(logger logr.Logger, currentPCSGenerationHash *string, pcsgReplicaIndex string, numPCSGCliqueNames int, pclqs []grovecorev1alpha1.PodClique) (isScheduled, isAvailable, isUpdated bool) { +func computeReplicaStatus(logger logr.Logger, currentPCSGenerationHashes componentutils.HashCandidates, currentPCSGenerationHash *string, pcsgReplicaIndex string, numPCSGCliqueNames int, pclqs []grovecorev1alpha1.PodClique) (isScheduled, isAvailable, isUpdated bool) { nonTerminatedPCSGPodCliques := lo.Filter(pclqs, func(pclq grovecorev1alpha1.PodClique, _ int) bool { return !k8sutils.IsResourceTerminating(pclq.ObjectMeta) }) @@ -183,9 +184,9 @@ func computeReplicaStatus(logger logr.Logger, currentPCSGenerationHash *string, }) isAvailable = isAvailable && len(nonTerminatedPCSGPodCliques) == numPCSGCliqueNames isUpdated = isAvailable && - currentPCSGenerationHash != nil && lo.EveryBy(nonTerminatedPCSGPodCliques, func(pclq grovecorev1alpha1.PodClique) bool { - return pclq.Status.CurrentPodCliqueSetGenerationHash != nil && *pclq.Status.CurrentPodCliqueSetGenerationHash == *currentPCSGenerationHash + return pclq.Status.CurrentPodCliqueSetGenerationHash != nil && + pclqGenerationHashMatchesCurrent(currentPCSGenerationHashes, currentPCSGenerationHash, *pclq.Status.CurrentPodCliqueSetGenerationHash) }) } return @@ -203,6 +204,14 @@ func (r *Reconciler) emitAllScheduledReplicasLostIfNeeded(pcsg *grovecorev1alpha } } +func pclqGenerationHashMatchesCurrent(candidates componentutils.HashCandidates, currentGenerationHash *string, storedGenerationHash string) bool { + if currentGenerationHash == nil { + return false + } + return candidates.Matches(storedGenerationHash) || + storedGenerationHash == *currentGenerationHash +} + // mutateMinAvailableBreachedCondition updates the MinAvailableBreached condition based on replica availability func mutateMinAvailableBreachedCondition(logger logr.Logger, pcsg *grovecorev1alpha1.PodCliqueScalingGroup, pclqsPerPCSGReplica map[string][]grovecorev1alpha1.PodClique) { newCondition := computeMinAvailableBreachedCondition(logger, pcsg, pclqsPerPCSGReplica) @@ -346,16 +355,54 @@ func mutateSelector(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1 // mutateCurrentPodCliqueSetGenerationHash updates the current generation hash when all PodCliques are updated and no rolling update is in progress func mutateCurrentPodCliqueSetGenerationHash(logger logr.Logger, pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup, existingPCLQs []grovecorev1alpha1.PodClique) { + pcsGenerationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) pclqFQNsPendingUpdate := componentutils.GetPCLQsInPCSGPendingUpdate(pcs, pcsg, existingPCLQs) if len(pclqFQNsPendingUpdate) > 0 { - logger.Info("Found PodCliques associated to PodCliqueScalingGroup pending update", "pclqFQNsPendingUpdate", pclqFQNsPendingUpdate) + logger.V(1).Info("PodCliqueScalingGroup has PodCliques pending update", "pcsg", client.ObjectKeyFromObject(pcsg), "pclqFQNsPendingUpdate", pclqFQNsPendingUpdate) return } if componentutils.IsPCSGUpdateInProgress(pcsg) { - logger.Info("PodCliqueScalingGroup is currently updating, cannot set PodCliqueSet CurrentGenerationHash yet") return } - pcsg.Status.CurrentPodCliqueSetGenerationHash = pcs.Status.CurrentGenerationHash + if pcs.Status.CurrentGenerationHash == nil { + return + } + if !havePCSGPodCliquesConverged(pcs, pcsg, existingPCLQs) { + return + } + if pcsg.Status.UpdateProgress != nil && pcsGenerationHashes.Matches(pcsg.Status.UpdateProgress.PodCliqueSetGenerationHash) { + pcsg.Status.UpdateProgress.PodCliqueSetGenerationHash = pcsGenerationHashes.Canonical + } + pcsg.Status.CurrentPodCliqueSetGenerationHash = &pcsGenerationHashes.Canonical +} + +// havePCSGPodCliquesConverged reports whether every expected PodClique in the +// PodCliqueScalingGroup has fully reconciled to the current PodCliqueSet spec. +// It gates advancing the PCSG's CurrentPodCliqueSetGenerationHash so the group +// is only marked up-to-date once all child PodCliques have converged. +func havePCSGPodCliquesConverged(pcs *grovecorev1alpha1.PodCliqueSet, pcsg *grovecorev1alpha1.PodCliqueScalingGroup, existingPCLQs []grovecorev1alpha1.PodClique) bool { + expectedPCLQPodTemplateHashes := componentutils.GetPCLQTemplateHashCandidates(pcs, pcsg) + existingPCLQsByName := lo.SliceToMap(existingPCLQs, func(pclq grovecorev1alpha1.PodClique) (string, grovecorev1alpha1.PodClique) { + return pclq.Name, pclq + }) + pcsGenerationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) + for pclqName, expectedPodTemplateHashes := range expectedPCLQPodTemplateHashes { + pclq, ok := existingPCLQsByName[pclqName] + if !ok || k8sutils.IsResourceTerminating(pclq.ObjectMeta) { + return false + } + if !expectedPodTemplateHashes.Matches(pclq.Labels[apicommon.LabelPodTemplateHash]) { + return false + } + if pclq.Status.CurrentPodTemplateHash == nil || !expectedPodTemplateHashes.Matches(*pclq.Status.CurrentPodTemplateHash) { + return false + } + if pclq.Status.CurrentPodCliqueSetGenerationHash == nil || + !pclqGenerationHashMatchesCurrent(pcsGenerationHashes, pcs.Status.CurrentGenerationHash, *pclq.Status.CurrentPodCliqueSetGenerationHash) { + return false + } + } + return true } // pruneStrayPCSGPCLQs drops children whose replica index is outside [0, Spec.Replicas) or whose FQN diff --git a/operator/internal/controller/podcliquescalinggroup/reconcilestatus_test.go b/operator/internal/controller/podcliquescalinggroup/reconcilestatus_test.go index 7c6a01497..ca8183464 100644 --- a/operator/internal/controller/podcliquescalinggroup/reconcilestatus_test.go +++ b/operator/internal/controller/podcliquescalinggroup/reconcilestatus_test.go @@ -21,9 +21,11 @@ import ( "testing" "time" + apicommon "github.com/ai-dynamo/grove/operator/api/common" "github.com/ai-dynamo/grove/operator/api/common/constants" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" internalconstants "github.com/ai-dynamo/grove/operator/internal/constants" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" testutils "github.com/ai-dynamo/grove/operator/test/utils" "github.com/go-logr/logr" @@ -95,7 +97,7 @@ func TestComputeReplicaStatus(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - scheduled, available, updated := computeReplicaStatus(logr.Discard(), nil, "0", tt.expectedSize, tt.cliques) + scheduled, available, updated := computeReplicaStatus(logr.Discard(), componentutils.HashCandidates{}, tt.pcsGenerationHash, "0", tt.expectedSize, tt.cliques) assert.Equal(t, tt.wantScheduled, scheduled, "scheduled mismatch") assert.Equal(t, tt.wantAvailable, available, "available mismatch") @@ -590,6 +592,62 @@ func TestReconcileStatus_EdgeCases(t *testing.T) { } } +// TestMutateCurrentPodCliqueSetGenerationHashWaitsForPodCliqueGenerationConvergence verifies that +// the PCSG's CurrentPodCliqueSetGenerationHash is only advanced to the canonical PCS hash once all +// of its PodCliques report having converged to that hash. While any child PodClique still reports +// the old generation hash, the PCSG must continue to surface the old hash to avoid prematurely +// signaling that a rollout has completed. +func TestMutateCurrentPodCliqueSetGenerationHashWaitsForPodCliqueGenerationConvergence(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()). + WithScalingGroupConfig("compute", []string{"worker"}, 2, 1). + Build() + pcsGenerationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) + pcs.Status.CurrentGenerationHash = ptr.To(pcsGenerationHashes.Canonical) + + pcsg := testutils.NewPodCliqueScalingGroupBuilder("test-pcs-0-compute", "test-ns", "test-pcs", 0). + WithReplicas(2). + WithCliqueNames([]string{"worker"}). + WithOptions(testutils.WithPCSGCurrentPCSGenerationHash("old-generation-hash")). + Build() + expectedTemplateHashes := componentutils.GetPCLQTemplateHashCandidates(pcs, pcsg) + pclqs := []grovecorev1alpha1.PodClique{ + buildConvergedPCSGPodClique(t, pcsg, "worker", 0, expectedTemplateHashes, pcsGenerationHashes.Canonical), + buildConvergedPCSGPodClique(t, pcsg, "worker", 1, expectedTemplateHashes, "old-generation-hash"), + } + + mutateCurrentPodCliqueSetGenerationHash(logr.Discard(), pcs, pcsg, pclqs) + require.NotNil(t, pcsg.Status.CurrentPodCliqueSetGenerationHash) + assert.Equal(t, "old-generation-hash", *pcsg.Status.CurrentPodCliqueSetGenerationHash) + + pclqs[1].Status.CurrentPodCliqueSetGenerationHash = ptr.To(pcsGenerationHashes.Canonical) + mutateCurrentPodCliqueSetGenerationHash(logr.Discard(), pcs, pcsg, pclqs) + require.NotNil(t, pcsg.Status.CurrentPodCliqueSetGenerationHash) + assert.Equal(t, pcsGenerationHashes.Canonical, *pcsg.Status.CurrentPodCliqueSetGenerationHash) +} + +// buildConvergedPCSGPodClique constructs a PodClique that is fully converged on the given +// template hash, with its CurrentPodCliqueSetGenerationHash set to pcsGenerationHash so callers +// can simulate cliques at a specific PCS generation. +func buildConvergedPCSGPodClique( + t *testing.T, + pcsg *grovecorev1alpha1.PodCliqueScalingGroup, + cliqueName string, + pcsgReplicaIndex int, + expectedTemplateHashes map[string]componentutils.HashCandidates, + pcsGenerationHash string, +) grovecorev1alpha1.PodClique { + t.Helper() + pclqName := apicommon.GeneratePodCliqueName(apicommon.ResourceNameReplica{Name: pcsg.Name, Replica: pcsgReplicaIndex}, cliqueName) + templateHashes, ok := expectedTemplateHashes[pclqName] + require.True(t, ok, "expected template hash for %s", pclqName) + pclq := testutils.NewPCSGPodCliqueBuilder(pclqName, pcsg.Namespace, "test-pcs", pcsg.Name, 0, pcsgReplicaIndex). + WithLabels(map[string]string{apicommon.LabelPodTemplateHash: templateHashes.Canonical}). + Build() + pclq.Status.CurrentPodTemplateHash = ptr.To(templateHashes.Canonical) + pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To(pcsGenerationHash) + return *pclq +} + // Test helpers func buildHealthyClique(name string) grovecorev1alpha1.PodClique { return *testutils.NewPodCliqueBuilder("test-pcs", uuid.NewUUID(), name, "test-ns", 0). @@ -703,7 +761,12 @@ func TestPCSGMutateReplicasWritesUpdateProgressCounts(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - mutateReplicas(logr.Discard(), &pcsHash, tt.pcsg, tt.pclqsPerReplica) + pcs := &grovecorev1alpha1.PodCliqueSet{ + Status: grovecorev1alpha1.PodCliqueSetStatus{ + CurrentGenerationHash: &pcsHash, + }, + } + mutateReplicas(logr.Discard(), pcs, tt.pcsg, tt.pclqsPerReplica) if !tt.wantWritten { require.Nil(t, tt.pcsg.Status.UpdateProgress, "UpdateProgress must remain nil") @@ -744,7 +807,7 @@ func TestCountPCSGReplicaUpdatedPCLQs(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.want, countPCSGReplicaUpdatedPCLQs(tt.hash, tt.in)) + assert.Equal(t, tt.want, countPCSGReplicaUpdatedPCLQs(componentutils.HashCandidates{}, tt.hash, tt.in)) }) } } diff --git a/operator/internal/controller/podcliqueset/components/podcliquesetreplica/rollingupdate.go b/operator/internal/controller/podcliqueset/components/podcliquesetreplica/rollingupdate.go index 4a94c0d1b..6ded39ff8 100644 --- a/operator/internal/controller/podcliqueset/components/podcliquesetreplica/rollingupdate.go +++ b/operator/internal/controller/podcliqueset/components/podcliquesetreplica/rollingupdate.go @@ -237,16 +237,17 @@ func (w *pendingUpdateWork) getNextReplicaToUpdate(pcs *grovecorev1alpha1.PodCli // computeUpdateProgress calculates update completion for a PCS replica. func (pri *pcsReplicaInfo) computeUpdateProgress(pcs *grovecorev1alpha1.PodCliqueSet) { - currentHash := *pcs.Status.CurrentGenerationHash + pcsGenerationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) updatedPCLQs := 0 for _, pclq := range pri.pclqs { - if isPCLQUpdateComplete(&pclq, currentHash) { + expectedTemplateHashes, err := componentutils.GetExpectedPCLQPodTemplateHashCandidates(pcs, pclq.ObjectMeta) + if err == nil && isPCLQUpdateComplete(&pclq, expectedTemplateHashes, pcsGenerationHashCandidates) { updatedPCLQs++ } } updatedPCSGs := 0 for _, pcsg := range pri.pcsgs { - if componentutils.IsPCSGUpdateComplete(&pcsg, currentHash) { + if componentutils.IsPCSGUpdateComplete(&pcsg, pcsGenerationHashCandidates.Canonical, pcsGenerationHashCandidates.Legacy) { updatedPCSGs++ } } @@ -273,14 +274,14 @@ func (pri *pcsReplicaInfo) getNumScheduledPods(pcs *grovecorev1alpha1.PodCliqueS } // isPCLQUpdateComplete checks if a PodClique has completed its update to the target generation. -func isPCLQUpdateComplete(pclq *grovecorev1alpha1.PodClique, currentPCSGenerationHash string) bool { - if pclq.Status.CurrentPodCliqueSetGenerationHash != nil && - *pclq.Status.CurrentPodCliqueSetGenerationHash == currentPCSGenerationHash && +func isPCLQUpdateComplete(pclq *grovecorev1alpha1.PodClique, expectedPodTemplateHashes, currentPCSGenerationHashes componentutils.HashCandidates) bool { + return expectedPodTemplateHashes.Matches(pclq.Labels[apicommon.LabelPodTemplateHash]) && + pclq.Status.CurrentPodTemplateHash != nil && + expectedPodTemplateHashes.Matches(*pclq.Status.CurrentPodTemplateHash) && + pclq.Status.CurrentPodCliqueSetGenerationHash != nil && + currentPCSGenerationHashes.Matches(*pclq.Status.CurrentPodCliqueSetGenerationHash) && pclq.Status.UpdatedReplicas >= *pclq.Spec.MinAvailable && - pclq.Status.ReadyReplicas >= *pclq.Spec.MinAvailable { - return true - } - return false + pclq.Status.ReadyReplicas >= *pclq.Spec.MinAvailable } // isAutoUpdateInProgress checks if an update is currently in progress. diff --git a/operator/internal/controller/podcliqueset/reconcilespec.go b/operator/internal/controller/podcliqueset/reconcilespec.go index a0de15b29..29763d205 100644 --- a/operator/internal/controller/podcliqueset/reconcilespec.go +++ b/operator/internal/controller/podcliqueset/reconcilespec.go @@ -26,13 +26,11 @@ import ( "github.com/ai-dynamo/grove/operator/internal/constants" ctrlcommon "github.com/ai-dynamo/grove/operator/internal/controller/common" "github.com/ai-dynamo/grove/operator/internal/controller/common/component" + componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils" ctrlutils "github.com/ai-dynamo/grove/operator/internal/controller/utils" "github.com/ai-dynamo/grove/operator/internal/utils" - k8sutils "github.com/ai-dynamo/grove/operator/internal/utils/kubernetes" "github.com/go-logr/logr" - "github.com/samber/lo" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/cache" "k8s.io/utils/ptr" @@ -82,19 +80,27 @@ func (r *Reconciler) processGenerationHashChange(ctx context.Context, logger log } r.pcsGenerationHashExpectations.Delete(pcsObjectName) - newGenerationHash := computeGenerationHash(pcs) + generationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) if pcs.Status.CurrentGenerationHash == nil { // update the generation hash and continue reconciliation. No rolling update is required. - if err := r.setGenerationHashAndUpdateStatus(ctx, pcs, pcsObjectName, newGenerationHash); err != nil { - logger.Error(err, "failed to set generation hash on PCS", "newGenerationHash", newGenerationHash) + if err := r.setGenerationHashAndUpdateStatus(ctx, pcs, pcsObjectName, generationHashCandidates.Canonical); err != nil { + logger.Error(err, "failed to set generation hash on PCS", "newGenerationHash", generationHashCandidates.Canonical) return ctrlcommon.ReconcileWithErrors("error updating generation hash", err) } return ctrlcommon.ContinueReconcile() } - if newGenerationHash != *pcs.Status.CurrentGenerationHash { + if generationHashCandidates.IsLegacy(*pcs.Status.CurrentGenerationHash) { + if err := r.setGenerationHashAndUpdateStatus(ctx, pcs, pcsObjectName, generationHashCandidates.Canonical); err != nil { + logger.Error(err, "failed to migrate legacy generation hash on PCS", "legacyGenerationHash", generationHashCandidates.Legacy, "canonicalGenerationHash", generationHashCandidates.Canonical) + return ctrlcommon.ReconcileWithErrors("error migrating generation hash", err) + } + return ctrlcommon.ContinueReconcile() + } + + if !generationHashCandidates.Matches(*pcs.Status.CurrentGenerationHash) { // trigger rolling update by setting or overriding pcs.Status.UpdateProgress. - if err := r.initUpdateProgress(ctx, pcs, pcsObjectName, newGenerationHash); err != nil { + if err := r.initUpdateProgress(ctx, pcs, pcsObjectName, generationHashCandidates.Canonical); err != nil { return ctrlcommon.ReconcileWithErrors(fmt.Sprintf("could not triggering rolling update for PCS: %v", pcsObjectKey), err) } } @@ -108,20 +114,23 @@ func (r *Reconciler) isGenerationHashExpectationSatisfied(pcsObjectName string, return !ok || (pcsGenerationHash != nil && expectedGenerationHash.(string) == *pcsGenerationHash) } -// computeGenerationHash calculates a hash of the PodCliqueSet pod template specifications. +// computeGenerationHash calculates a hash of the PodCliqueSet pod template +// specifications. +// +// pcs.Spec.Template.Cliques is +listType=map +listMapKey=name, so it is an +// unordered set; the hash must be stable across reorderings to avoid falsely +// triggering rolling recreates when an upstream operator emits the same +// cliques in a different sequence. +// +// Order is semantically meaningful when StartupType is CliqueStartupTypeInOrder, +// so in that mode the original clique order is preserved and clique names are +// passed as order keys to the shared hash helper. func computeGenerationHash(pcs *grovecorev1alpha1.PodCliqueSet) string { - podTemplateSpecs := lo.Map(pcs.Spec.Template.Cliques, func(pclqTemplateSpec *grovecorev1alpha1.PodCliqueTemplateSpec, _ int) *corev1.PodTemplateSpec { - podTemplateSpec := &corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: pclqTemplateSpec.Labels, - Annotations: pclqTemplateSpec.Annotations, - }, - Spec: pclqTemplateSpec.Spec.PodSpec, - } - podTemplateSpec.Spec.PriorityClassName = pcs.Spec.Template.PriorityClassName - return podTemplateSpec - }) - return k8sutils.ComputeHash(podTemplateSpecs...) + return componentutils.ComputePCSGenerationHash(pcs) +} + +func computeGenerationHashLegacy(pcs *grovecorev1alpha1.PodCliqueSet) string { + return componentutils.ComputePCSGenerationHashLegacy(pcs) } // setGenerationHashAndUpdateStatus updates the PodCliqueSet status with the new generation hash, stores the expectation, and updates the status subresource. diff --git a/operator/internal/controller/podcliqueset/reconcilespec_test.go b/operator/internal/controller/podcliqueset/reconcilespec_test.go index 3501a1ad6..e5bc0406f 100644 --- a/operator/internal/controller/podcliqueset/reconcilespec_test.go +++ b/operator/internal/controller/podcliqueset/reconcilespec_test.go @@ -29,6 +29,7 @@ import ( "github.com/go-logr/logr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/utils/ptr" @@ -149,6 +150,338 @@ func TestGetKindSyncGroups(t *testing.T) { } } +// TestComputeGenerationHash_ReplicaOnlyAndCliqueReorderAreNoOps documents the +// invariants of computeGenerationHash that protect against false rolling +// updates: +// +// 1. replica-only changes do not change the hash (HPA-style scale events +// must not trigger a gang roll); and +// 2. reordering pcs.Spec.Template.Cliques does not change the hash, because +// Cliques is +listType=map +listMapKey=name and represents a name-keyed +// set with no inherent order. An upstream operator that emits the same +// cliques in a different sequence (e.g. from non-deterministic Go map +// iteration) must not cause Grove to roll the gang. +// +// This was previously TestComputeGenerationHashCurrentSensitivity and +// asserted the bugged behavior. After fixing computeGenerationHash to sort +// the cliques map-list before hashing, the assertion is flipped: the +// reordered hash must be IDENTICAL to the original. +func TestComputeGenerationHash_ReplicaOnlyAndCliqueReorderAreNoOps(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend"). + WithLabels(map[string]string{"role": "frontend"}). + WithReplicas(1). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner"). + WithLabels(map[string]string{"role": "planner"}). + WithReplicas(1). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("prefill"). + WithLabels(map[string]string{"role": "prefill"}). + WithReplicas(1). + Build()). + Build() + + originalHash := computeGenerationHash(pcs) + + replicaOnlyChange := pcs.DeepCopy() + replicaOnlyChange.Spec.Template.Cliques[2].Spec.Replicas = 3 + assert.Equal(t, originalHash, computeGenerationHash(replicaOnlyChange), + "replica-only changes are intentionally excluded from the generation hash") + + reordered := pcs.DeepCopy() + reordered.Spec.Template.Cliques = []*grovecorev1alpha1.PodCliqueTemplateSpec{ + pcs.Spec.Template.Cliques[2].DeepCopy(), + pcs.Spec.Template.Cliques[0].DeepCopy(), + pcs.Spec.Template.Cliques[1].DeepCopy(), + } + assert.Equal(t, originalHash, computeGenerationHash(reordered), + "clique slice order must not affect the generation hash — Cliques is +listType=map keyed by name") +} + +// TestProcessGenerationHashChange_CliqueReorderIsNoOp asserts the +// fixed-behavior end-to-end: when an existing PCS already has its current +// generation hash recorded and only the cliques map-list is reordered, +// processGenerationHashChange must NOT initialize a new UpdateProgress +// (which would cascade into a full gang rolling recreate via the +// PodCliqueSetReplica orchestrator and the per-PodClique controllers). +// +// This was previously TestProcessGenerationHashChangeTreatsCliqueReorderAsUpdate +// asserting the bugged behavior. After fixing computeGenerationHash, the +// assertion is flipped: UpdateProgress must remain nil. +func TestProcessGenerationHashChange_CliqueReorderIsNoOp(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend"). + WithLabels(map[string]string{"role": "frontend"}). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner"). + WithLabels(map[string]string{"role": "planner"}). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("prefill"). + WithLabels(map[string]string{"role": "prefill"}). + Build()). + Build() + originalHash := computeGenerationHash(pcs) + + reordered := pcs.DeepCopy() + reordered.Spec.Template.Cliques = []*grovecorev1alpha1.PodCliqueTemplateSpec{ + pcs.Spec.Template.Cliques[2].DeepCopy(), + pcs.Spec.Template.Cliques[0].DeepCopy(), + pcs.Spec.Template.Cliques[1].DeepCopy(), + } + reordered.Status.CurrentGenerationHash = ptr.To(originalHash) + + fakeClient := testutils.SetupFakeClient(reordered) + reconciler := &Reconciler{ + client: fakeClient, + pcsGenerationHashExpectations: sync.Map{}, + } + + result := reconciler.processGenerationHashChange(context.Background(), logr.Discard(), reordered) + require.False(t, result.HasErrors()) + + updatedPCS := &grovecorev1alpha1.PodCliqueSet{} + err := fakeClient.Get(context.Background(), client.ObjectKeyFromObject(reordered), updatedPCS) + require.NoError(t, err) + assert.Nil(t, updatedPCS.Status.UpdateProgress, + "clique reorder must not start a rolling update — Cliques is name-keyed and slice order is not part of the desired state") + assert.Equal(t, ptr.To(originalHash), updatedPCS.Status.CurrentGenerationHash, + "clique reorder must not change the recorded generation hash") +} + +func TestProcessGenerationHashChange_LegacyCurrentMigratesWithoutUpdateProgress(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner"). + WithLabels(map[string]string{"role": "planner"}). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend"). + WithLabels(map[string]string{"role": "frontend"}). + Build()). + Build() + + canonicalHash := computeGenerationHash(pcs) + legacyHash := computeGenerationHashLegacy(pcs) + require.NotEqual(t, canonicalHash, legacyHash, "test must exercise the transition hash window") + pcs.Status.CurrentGenerationHash = ptr.To(legacyHash) + + fakeClient := testutils.SetupFakeClient(pcs) + reconciler := &Reconciler{ + client: fakeClient, + pcsGenerationHashExpectations: sync.Map{}, + } + + result := reconciler.processGenerationHashChange(context.Background(), logr.Discard(), pcs) + require.False(t, result.HasErrors()) + + updatedPCS := &grovecorev1alpha1.PodCliqueSet{} + err := fakeClient.Get(context.Background(), client.ObjectKeyFromObject(pcs), updatedPCS) + require.NoError(t, err) + require.NotNil(t, updatedPCS.Status.CurrentGenerationHash) + assert.Equal(t, canonicalHash, *updatedPCS.Status.CurrentGenerationHash) + assert.Nil(t, updatedPCS.Status.UpdateProgress, "legacy-current hash migration must not start a rolling update") +} + +func TestProcessGenerationHashChange_StaleHashStillTriggersUpdateProgress(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend").Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner").Build()). + Build() + pcs.Status.CurrentGenerationHash = ptr.To("stale-generation-hash") + + fakeClient := testutils.SetupFakeClient(pcs) + reconciler := &Reconciler{ + client: fakeClient, + pcsGenerationHashExpectations: sync.Map{}, + } + + result := reconciler.processGenerationHashChange(context.Background(), logr.Discard(), pcs) + require.False(t, result.HasErrors()) + + updatedPCS := &grovecorev1alpha1.PodCliqueSet{} + err := fakeClient.Get(context.Background(), client.ObjectKeyFromObject(pcs), updatedPCS) + require.NoError(t, err) + require.NotNil(t, updatedPCS.Status.UpdateProgress) + require.NotNil(t, updatedPCS.Status.CurrentGenerationHash) + assert.Equal(t, computeGenerationHash(pcs), *updatedPCS.Status.CurrentGenerationHash) +} + +// TestComputeGenerationHash_InOrderStartupIsSensitiveToCliqueOrder pins the +// counterpart to the AnyOrder/Explicit invariant: when the PCS uses +// CliqueStartupTypeInOrder, the slice order DOES carry semantic meaning (it +// defines the startup chain), so the generation hash must change if the +// slice is reordered. Otherwise a real change to the startup ordering would +// silently be ignored. +func TestComputeGenerationHash_InOrderStartupIsSensitiveToCliqueOrder(t *testing.T) { + startupInOrder := grovecorev1alpha1.CliqueStartupTypeInOrder + + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithCliqueStartupType(&startupInOrder). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend"). + WithLabels(map[string]string{"role": "frontend"}). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner"). + WithLabels(map[string]string{"role": "planner"}). + Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("prefill"). + WithLabels(map[string]string{"role": "prefill"}). + Build()). + Build() + originalHash := computeGenerationHash(pcs) + + reordered := pcs.DeepCopy() + reordered.Spec.Template.Cliques = []*grovecorev1alpha1.PodCliqueTemplateSpec{ + pcs.Spec.Template.Cliques[2].DeepCopy(), + pcs.Spec.Template.Cliques[0].DeepCopy(), + pcs.Spec.Template.Cliques[1].DeepCopy(), + } + assert.NotEqual(t, originalHash, computeGenerationHash(reordered), + "under CliqueStartupTypeInOrder, slice order encodes the startup chain — a different order is a real spec change") +} + +// TestComputeGenerationHash_InOrderStartupOrderUsesCliqueNames verifies that +// the InOrder startup chain is represented by clique names, not merely by the +// sequence of pod template hashes. A "do not sort InOrder cliques" implementation +// would miss this case when the reordered cliques have identical pod templates. +func TestComputeGenerationHash_InOrderStartupOrderUsesCliqueNames(t *testing.T) { + startupInOrder := grovecorev1alpha1.CliqueStartupTypeInOrder + + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithCliqueStartupType(&startupInOrder). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("database").Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("app").Build()). + Build() + originalHash := computeGenerationHash(pcs) + + reordered := pcs.DeepCopy() + reordered.Spec.Template.Cliques = []*grovecorev1alpha1.PodCliqueTemplateSpec{ + pcs.Spec.Template.Cliques[1].DeepCopy(), + pcs.Spec.Template.Cliques[0].DeepCopy(), + } + assert.NotEqual(t, originalHash, computeGenerationHash(reordered), + "under CliqueStartupTypeInOrder, changing only the ordered clique names changes the startup chain") +} + +// TestComputeGenerationHash_AnyOrderEqualsExplicit_WhenPodSpecsMatch pins that +// AnyOrder and Explicit do not add synthetic startup metadata to the generation +// hash. Their slice order is not part of the template hash input, and +// StartupType itself is immutable after creation, so including it would only +// broaden upgrade-time hash churn. +// +// Also pins the corollary that under Explicit, reordering the clique slice is +// a no-op — the property most at risk of regressing if someone later extends +// the InOrder-only startup hash input to also engage on Explicit. +func TestComputeGenerationHash_AnyOrderEqualsExplicit_WhenPodSpecsMatch(t *testing.T) { + startupAnyOrder := grovecorev1alpha1.CliqueStartupTypeAnyOrder + startupExplicit := grovecorev1alpha1.CliqueStartupTypeExplicit + + // Distinct WithLabels per clique is load-bearing: the clique Name itself is + // not in the per-clique hash input (only Labels, Annotations, and PodSpec + // are), so without distinguishing labels the three per-clique pod templates + // would be byte-identical and the slice-order-invariance assertion below + // would pass trivially even against the bugged pre-canonicalization code. + build := func(st *grovecorev1alpha1.CliqueStartupType) *grovecorev1alpha1.PodCliqueSet { + return testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithCliqueStartupType(st). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend"). + WithLabels(map[string]string{"role": "frontend"}).Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner"). + WithLabels(map[string]string{"role": "planner"}).Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("prefill"). + WithLabels(map[string]string{"role": "prefill"}).Build()). + Build() + } + hashAnyOrder := computeGenerationHash(build(&startupAnyOrder)) + hashExplicit := computeGenerationHash(build(&startupExplicit)) + + assert.Equal(t, hashAnyOrder, hashExplicit, + "AnyOrder and Explicit must not add startup-order hash inputs for otherwise identical templates") + + explicit := build(&startupExplicit) + explicitReordered := explicit.DeepCopy() + explicitReordered.Spec.Template.Cliques = []*grovecorev1alpha1.PodCliqueTemplateSpec{ + explicit.Spec.Template.Cliques[2].DeepCopy(), + explicit.Spec.Template.Cliques[0].DeepCopy(), + explicit.Spec.Template.Cliques[1].DeepCopy(), + } + assert.Equal(t, computeGenerationHash(explicit), computeGenerationHash(explicitReordered), + "under CliqueStartupTypeExplicit, clique slice order must not affect the generation hash") +} + +// TestComputeGenerationHash_InOrderToAnyOrderFlipsHash pins that switching +// the StartupType from InOrder to AnyOrder is captured by the hash. Under +// InOrder the slice order encodes the startup chain; under AnyOrder it +// doesn't. This is the boundary case for the canonicalization split — a +// regression here would mean a real semantic change in startup behavior +// goes silently undetected by the rolling-update path. +func TestComputeGenerationHash_InOrderToAnyOrderFlipsHash(t *testing.T) { + startupInOrder := grovecorev1alpha1.CliqueStartupTypeInOrder + startupAnyOrder := grovecorev1alpha1.CliqueStartupTypeAnyOrder + + build := func(st *grovecorev1alpha1.CliqueStartupType) *grovecorev1alpha1.PodCliqueSet { + return testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithCliqueStartupType(st). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend").Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner").Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("prefill").Build()). + Build() + } + assert.NotEqual(t, computeGenerationHash(build(&startupInOrder)), computeGenerationHash(build(&startupAnyOrder)), + "InOrder ↔ AnyOrder changes startup semantics and must flip the hash") +} + +// TestComputeGenerationHash_CombinedReplicaChangeAndCliqueReorderIsNoOp +// covers the realistic Dynamo-operator scenario where a single PCS write +// both bumps replicas (because the Planner asked for a scale-up) AND happens +// to emit cliques in a different order (because the upstream operator +// constructed them from a Go map). Both axes are individually no-ops; they +// must remain a no-op when combined. +func TestComputeGenerationHash_CombinedReplicaChangeAndCliqueReorderIsNoOp(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend"). + WithLabels(map[string]string{"role": "frontend"}).WithReplicas(1).Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("planner"). + WithLabels(map[string]string{"role": "planner"}).WithReplicas(1).Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("prefill"). + WithLabels(map[string]string{"role": "prefill"}).WithReplicas(1).Build()). + Build() + originalHash := computeGenerationHash(pcs) + + combined := pcs.DeepCopy() + // Reorder cliques (Cliques is +listType=map keyed by name, no semantic + // order under default AnyOrder StartupType). + combined.Spec.Template.Cliques = []*grovecorev1alpha1.PodCliqueTemplateSpec{ + pcs.Spec.Template.Cliques[2].DeepCopy(), + pcs.Spec.Template.Cliques[0].DeepCopy(), + pcs.Spec.Template.Cliques[1].DeepCopy(), + } + // Bump replicas on one of the cliques (replica-only changes are + // intentionally excluded from the generation hash). + combined.Spec.Template.Cliques[0].Spec.Replicas = 5 + + assert.Equal(t, originalHash, computeGenerationHash(combined), + "replica bump + clique reorder is the realistic latency-mode scale-up patch shape — neither axis must flip the generation hash") +} + +// TestComputeGenerationHash_RealCliqueTemplateChangeFlipsHash is the +// regression test in the other direction: if a real per-clique PodSpec +// change happens (e.g. an image bump rolled out via the DGD) the +// generation hash must change. Otherwise the rolling update would never +// kick off. +func TestComputeGenerationHash_RealCliqueTemplateChangeFlipsHash(t *testing.T) { + build := func(image string) *grovecorev1alpha1.PodCliqueSet { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("frontend").Build()). + WithPodCliqueTemplateSpec(testutils.NewPodCliqueTemplateSpecBuilder("worker").Build()). + Build() + pcs.Spec.Template.Cliques[1].Spec.PodSpec = corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: image}}, + } + return pcs + } + assert.NotEqual(t, computeGenerationHash(build("worker:v1")), computeGenerationHash(build("worker:v2")), + "a real per-clique image change must flip the generation hash; otherwise rolling updates would never start") +} + // TestInitUpdateProgress tests the initUpdateProgress function for both OnDelete and RollingRecreate strategies func TestInitUpdateProgress(t *testing.T) { tests := []struct { diff --git a/operator/internal/controller/podcliqueset/reconcilestatus.go b/operator/internal/controller/podcliqueset/reconcilestatus.go index ca16fc02c..a47180325 100644 --- a/operator/internal/controller/podcliqueset/reconcilestatus.go +++ b/operator/internal/controller/podcliqueset/reconcilestatus.go @@ -23,6 +23,7 @@ import ( "slices" "strconv" + apicommon "github.com/ai-dynamo/grove/operator/api/common" apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" "github.com/ai-dynamo/grove/operator/internal/clustertopology" @@ -148,6 +149,7 @@ func (r *Reconciler) computeAvailableAndUpdatedReplicas(ctx context.Context, log // Group both resources by PCS replica index standalonePCLQsByReplica := componentutils.GroupPCLQsByPCSReplicaIndex(standalonePCLQs) pcsgsByReplica := componentutils.GroupPCSGsByPCSReplicaIndex(pcsgs) + pcsGenerationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) for replicaIndex := 0; replicaIndex < int(pcs.Spec.Replicas); replicaIndex++ { replicaIndexStr := strconv.Itoa(replicaIndex) @@ -158,10 +160,10 @@ func (r *Reconciler) computeAvailableAndUpdatedReplicas(ctx context.Context, log stats.totalPCLQs += int32(expectedPCLQCount) stats.totalPCSGs += int32(expectedPCSGCount) - stats.updatedPCLQs += countUpdatedPCLQs(pcs.Status.CurrentGenerationHash, replicaStandalonePCLQs) - stats.updatedPCSGs += countUpdatedPCSGs(pcs.Status.CurrentGenerationHash, replicaPCSGs) + stats.updatedPCLQs += countUpdatedPCLQs(pcs.Status.CurrentGenerationHash, replicaStandalonePCLQs, pcsGenerationHashCandidates.Canonical, pcsGenerationHashCandidates.Legacy) + stats.updatedPCSGs += countUpdatedPCSGs(pcs.Status.CurrentGenerationHash, replicaPCSGs, pcsGenerationHashCandidates.Canonical, pcsGenerationHashCandidates.Legacy) - isReplicaAvailable, isReplicaUpdated := r.computeReplicaStatus(pcs.Status.CurrentGenerationHash, replicaPCSGs, + isReplicaAvailable, isReplicaUpdated := r.computeReplicaStatus(pcs, replicaPCSGs, replicaStandalonePCLQs, expectedPCSGCount, expectedPCLQCount) if isReplicaAvailable { stats.availableReplicas++ @@ -179,7 +181,7 @@ func (r *Reconciler) computeAvailableAndUpdatedReplicas(ctx context.Context, log } // countUpdatedPCLQs counts non-terminating PCLQs whose generation hash matches the PCS hash. -func countUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1alpha1.PodClique) int32 { +func countUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1alpha1.PodClique, compatiblePCSGenerationHashes ...string) int32 { if pcsGenerationHash == nil { return 0 } @@ -190,7 +192,7 @@ func countUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1alpha1.PodC continue } if pclq.Status.CurrentPodCliqueSetGenerationHash != nil && - *pclq.Status.CurrentPodCliqueSetGenerationHash == *pcsGenerationHash { + pcsGenerationHashMatches(pcsGenerationHash, *pclq.Status.CurrentPodCliqueSetGenerationHash, compatiblePCSGenerationHashes...) { n++ } } @@ -198,7 +200,7 @@ func countUpdatedPCLQs(pcsGenerationHash *string, pclqs []grovecorev1alpha1.PodC } // countUpdatedPCSGs counts non-terminating PCSGs whose update completed at the PCS hash. -func countUpdatedPCSGs(pcsGenerationHash *string, pcsgs []grovecorev1alpha1.PodCliqueScalingGroup) int32 { +func countUpdatedPCSGs(pcsGenerationHash *string, pcsgs []grovecorev1alpha1.PodCliqueScalingGroup, compatiblePCSGenerationHashes ...string) int32 { if pcsGenerationHash == nil { return 0 } @@ -208,22 +210,27 @@ func countUpdatedPCSGs(pcsGenerationHash *string, pcsgs []grovecorev1alpha1.PodC if k8sutils.IsResourceTerminating(pcsg.ObjectMeta) { continue } - if componentutils.IsPCSGUpdateComplete(pcsg, *pcsGenerationHash) { + if componentutils.IsPCSGUpdateComplete(pcsg, *pcsGenerationHash, compatiblePCSGenerationHashes...) { n++ } } return n } +func pcsGenerationHashMatches(currentGenerationHash *string, storedGenerationHash string, compatiblePCSGenerationHashes ...string) bool { + return currentGenerationHash != nil && + (*currentGenerationHash == storedGenerationHash || slices.Contains(compatiblePCSGenerationHashes, storedGenerationHash)) +} + // computeReplicaStatus determines if a replica is available and updated based on its components. -func (r *Reconciler) computeReplicaStatus(pcsGenerationHash *string, replicaPCSGs []grovecorev1alpha1.PodCliqueScalingGroup, standalonePCLQs []grovecorev1alpha1.PodClique, expectedPCSGs int, expectedStandalonePCLQs int) (bool, bool) { - pclqsAvailable, pclqsUpdated := r.computePCLQsStatus(expectedStandalonePCLQs, standalonePCLQs) - pcsgsAvailable, pcsgsUpdated := r.computePCSGsStatus(pcsGenerationHash, expectedPCSGs, replicaPCSGs) +func (r *Reconciler) computeReplicaStatus(pcs *grovecorev1alpha1.PodCliqueSet, replicaPCSGs []grovecorev1alpha1.PodCliqueScalingGroup, standalonePCLQs []grovecorev1alpha1.PodClique, expectedPCSGs int, expectedStandalonePCLQs int) (bool, bool) { + pclqsAvailable, pclqsUpdated := r.computePCLQsStatus(pcs, expectedStandalonePCLQs, standalonePCLQs) + pcsgsAvailable, pcsgsUpdated := r.computePCSGsStatus(pcs, expectedPCSGs, replicaPCSGs) return pclqsAvailable && pcsgsAvailable, pclqsUpdated && pcsgsUpdated } // computePCLQsStatus checks if standalone PodCliques are available and updated. -func (r *Reconciler) computePCLQsStatus(expectedStandalonePCLQs int, existingPCLQs []grovecorev1alpha1.PodClique) (isAvailable, isUpdated bool) { +func (r *Reconciler) computePCLQsStatus(pcs *grovecorev1alpha1.PodCliqueSet, expectedStandalonePCLQs int, existingPCLQs []grovecorev1alpha1.PodClique) (isAvailable, isUpdated bool) { nonTerminatedPCLQs := lo.Filter(existingPCLQs, func(pclq grovecorev1alpha1.PodClique, _ int) bool { return !k8sutils.IsResourceTerminating(pclq.ObjectMeta) }) @@ -233,15 +240,31 @@ func (r *Reconciler) computePCLQsStatus(expectedStandalonePCLQs int, existingPCL return pclq.Status.ReadyReplicas >= *pclq.Spec.MinAvailable }) + pcsGenerationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) isUpdated = isAvailable && lo.EveryBy(nonTerminatedPCLQs, func(pclq grovecorev1alpha1.PodClique) bool { - return pclq.Status.UpdatedReplicas >= *pclq.Spec.MinAvailable + expectedTemplateHashes, err := componentutils.GetExpectedPCLQPodTemplateHashCandidates(pcs, pclq.ObjectMeta) + if err != nil { + return false + } + return isStandalonePCLQUpdated(&pclq, expectedTemplateHashes, pcsGenerationHashCandidates) }) return } +// isStandalonePCLQUpdated checks if a standalone PodClique is fully updated to the expected pod template and PodCliqueSet generation hashes. +func isStandalonePCLQUpdated(pclq *grovecorev1alpha1.PodClique, expectedPodTemplateHashes, pcsGenerationHashCandidates componentutils.HashCandidates) bool { + return expectedPodTemplateHashes.Matches(pclq.Labels[apicommon.LabelPodTemplateHash]) && + pclq.Status.CurrentPodTemplateHash != nil && + expectedPodTemplateHashes.Matches(*pclq.Status.CurrentPodTemplateHash) && + pclq.Status.CurrentPodCliqueSetGenerationHash != nil && + pcsGenerationHashCandidates.Matches(*pclq.Status.CurrentPodCliqueSetGenerationHash) && + pclq.Status.ReadyReplicas >= *pclq.Spec.MinAvailable && + pclq.Status.UpdatedReplicas >= *pclq.Spec.MinAvailable +} + // computePCSGsStatus checks if PodCliqueScalingGroups are available and updated. -func (r *Reconciler) computePCSGsStatus(pcsGenerationHash *string, expectedPCSGs int, pcsgs []grovecorev1alpha1.PodCliqueScalingGroup) (isAvailable, isUpdated bool) { +func (r *Reconciler) computePCSGsStatus(pcs *grovecorev1alpha1.PodCliqueSet, expectedPCSGs int, pcsgs []grovecorev1alpha1.PodCliqueScalingGroup) (isAvailable, isUpdated bool) { nonTerminatedPCSGs := lo.Filter(pcsgs, func(pcsg grovecorev1alpha1.PodCliqueScalingGroup, _ int) bool { return !k8sutils.IsResourceTerminating(pcsg.ObjectMeta) }) @@ -251,8 +274,10 @@ func (r *Reconciler) computePCSGsStatus(pcsGenerationHash *string, expectedPCSGs return pcsg.Status.AvailableReplicas >= *pcsg.Spec.MinAvailable }) + pcsGenerationHashCandidates := componentutils.ComputePCSGenerationHashCandidates(pcs) isUpdated = isAvailable && lo.EveryBy(nonTerminatedPCSGs, func(pcsg grovecorev1alpha1.PodCliqueScalingGroup) bool { - return pcsGenerationHash != nil && componentutils.IsPCSGUpdateComplete(&pcsg, *pcsGenerationHash) + return pcs.Status.CurrentGenerationHash != nil && + componentutils.IsPCSGUpdateComplete(&pcsg, pcsGenerationHashCandidates.Canonical, pcsGenerationHashCandidates.Legacy) }) return diff --git a/operator/internal/controller/podcliqueset/reconcilestatus_test.go b/operator/internal/controller/podcliqueset/reconcilestatus_test.go index 00a56b9cc..d6d185498 100644 --- a/operator/internal/controller/podcliqueset/reconcilestatus_test.go +++ b/operator/internal/controller/podcliqueset/reconcilestatus_test.go @@ -21,6 +21,7 @@ import ( "testing" "time" + apicommon "github.com/ai-dynamo/grove/operator/api/common" apicommonconstants "github.com/ai-dynamo/grove/operator/api/common/constants" configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" @@ -280,6 +281,81 @@ func TestComputePCSAvailableReplicas(t *testing.T) { } } +// TestComputePCSUpdatedReplicasRequiresStandaloneHashConvergence verifies that +// a standalone-PodClique-backed replica only counts toward updatedReplicas once +// all three rollout hashes tracked by isStandalonePCLQUpdated have converged on +// the values derived from the current PCS spec: the PCLQ's pod-template-hash +// label, status.CurrentPodTemplateHash, and status.CurrentPodCliqueSetGenerationHash. +// Each can lag the others mid-reconcile, so the cases pin one stale at a time +// plus a happy-path case where everything converges. +func TestComputePCSUpdatedReplicasRequiresStandaloneHashConvergence(t *testing.T) { + pcs := testutils.NewPodCliqueSetBuilder(testPCSName, testNamespace, uuid.NewUUID()). + WithReplicas(1). + WithStandaloneClique("worker"). + Build() + templateHashes := componentutils.ComputePCLQPodTemplateHashCandidates(pcs.Spec.Template.Cliques[0], pcs.Spec.Template.PriorityClassName) + generationHashes := componentutils.ComputePCSGenerationHashCandidates(pcs) + pcs.Status.CurrentGenerationHash = ptr.To(generationHashes.Canonical) + + tests := []struct { + name string + labelPodTemplateHash string + currentPodTemplateHash string + currentPodCliqueSetGenerationHash string + wantUpdatedReplicas int32 + }{ + { + name: "stale generation hash", + labelPodTemplateHash: templateHashes.Canonical, + currentPodTemplateHash: templateHashes.Canonical, + currentPodCliqueSetGenerationHash: "old-generation-hash", + wantUpdatedReplicas: 0, + }, + { + name: "stale current template hash", + labelPodTemplateHash: templateHashes.Canonical, + currentPodTemplateHash: "old-template-hash", + currentPodCliqueSetGenerationHash: generationHashes.Canonical, + wantUpdatedReplicas: 0, + }, + { + name: "stale label template hash", + labelPodTemplateHash: "old-template-hash", + currentPodTemplateHash: templateHashes.Canonical, + currentPodCliqueSetGenerationHash: generationHashes.Canonical, + wantUpdatedReplicas: 0, + }, + { + name: "hashes converged", + labelPodTemplateHash: templateHashes.Canonical, + currentPodTemplateHash: templateHashes.Canonical, + currentPodCliqueSetGenerationHash: generationHashes.Canonical, + wantUpdatedReplicas: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pclq := testutils.NewPodCliqueBuilder(testPCSName, uuid.NewUUID(), "worker", testNamespace, 0). + WithOptions(testutils.WithPCLQReplicaReadyStatus(1)). + Build() + pclq.Labels[apicommon.LabelPodTemplateHash] = tt.labelPodTemplateHash + pclq.Status.UpdatedReplicas = 1 + pclq.Status.CurrentPodTemplateHash = ptr.To(tt.currentPodTemplateHash) + pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To(tt.currentPodCliqueSetGenerationHash) + + cl := testutils.CreateDefaultFakeClient([]client.Object{pcs, pclq}) + reconciler := &Reconciler{client: cl} + + stats, err := reconciler.computeAvailableAndUpdatedReplicas(context.Background(), logr.Discard(), pcs) + + assert.NoError(t, err) + assert.Equal(t, int32(1), stats.availableReplicas) + assert.Equal(t, tt.wantUpdatedReplicas, stats.updatedReplicas) + }) + } +} + // TestMutateTopologyLevelUnavailableConditions tests the mutateTopologyLevelUnavailableConditions function. // It covers TAS-disabled paths, backward-compat paths (missing topologyName), and fully-specified // ClusterTopology paths (not found, unavailable domains, all available). diff --git a/operator/internal/controller/podcliqueset/register.go b/operator/internal/controller/podcliqueset/register.go index ce291db8d..8aafb871b 100644 --- a/operator/internal/controller/podcliqueset/register.go +++ b/operator/internal/controller/podcliqueset/register.go @@ -50,7 +50,7 @@ func (r *Reconciler) RegisterWithManager(mgr manager.Manager) error { WithOptions(controller.Options{ MaxConcurrentReconciles: *r.config.ConcurrentSyncs, }). - For(&grovecorev1alpha1.PodCliqueSet{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). + For(&grovecorev1alpha1.PodCliqueSet{}, builder.WithPredicates(podCliqueSetPredicate())). Watches( &grovecorev1alpha1.ClusterTopology{}, handler.EnqueueRequestsFromMapFunc(mapClusterTopologyToPodCliqueSets(r.client)), @@ -68,6 +68,22 @@ func (r *Reconciler) RegisterWithManager(mgr manager.Manager) error { Complete(r) } +// podCliqueSetPredicate returns a predicate that allows spec changes and explicit no-op reconcile triggers. +func podCliqueSetPredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + UpdateFunc: func(updateEvent event.UpdateEvent) bool { + if updateEvent.ObjectOld == nil || updateEvent.ObjectNew == nil { + return false + } + return hasSpecChanged(updateEvent) || + hasAnnotationChanged(updateEvent.ObjectOld.GetAnnotations(), updateEvent.ObjectNew.GetAnnotations(), constants.AnnotationReconcileTrigger) + }, + GenericFunc: func(_ event.GenericEvent) bool { return true }, + } +} + // mapPodCliqueToPodCliqueSet returns a function that maps PodClique events to their parent PodCliqueSet. func mapPodCliqueToPodCliqueSet() handler.MapFunc { return func(_ context.Context, obj client.Object) []reconcile.Request { @@ -147,7 +163,8 @@ func podCliqueScalingGroupPredicate() predicate.Predicate { if !okOld || !okNew { return false } - return hasMinAvailableBreachedConditionChanged(oldPCSG.Status.Conditions, newPCSG.Status.Conditions) || hasUpdateStatusChanged(&oldPCSG.Status, &newPCSG.Status) + return hasMinAvailableBreachedConditionChanged(oldPCSG.Status.Conditions, newPCSG.Status.Conditions) || + hasPodCliqueScalingGroupStatusChanged(&oldPCSG.Status, &newPCSG.Status) }, GenericFunc: func(_ event.TypedGenericEvent[client.Object]) bool { return false }, } @@ -158,6 +175,12 @@ func hasSpecChanged(updateEvent event.UpdateEvent) bool { return updateEvent.ObjectOld.GetGeneration() != updateEvent.ObjectNew.GetGeneration() } +func hasAnnotationChanged(oldAnnotations, newAnnotations map[string]string, key string) bool { + oldValue, oldOK := oldAnnotations[key] + newValue, newOK := newAnnotations[key] + return oldOK != newOK || oldValue != newValue +} + // hasStatusChanged checks if PodClique status fields have changed. func hasStatusChanged(updateEvent event.UpdateEvent) bool { oldPCLQ, okOld := updateEvent.ObjectOld.(*grovecorev1alpha1.PodClique) @@ -166,6 +189,8 @@ func hasStatusChanged(updateEvent event.UpdateEvent) bool { return false } return hasAnyStatusReplicasChanged(oldPCLQ.Status, newPCLQ.Status) || + hasPodCliqueHashStatusChanged(oldPCLQ.Status, newPCLQ.Status) || + hasUpdateStatusChanged(oldPCLQ.Status.UpdateProgress, newPCLQ.Status.UpdateProgress) || hasMinAvailableBreachedConditionChanged(oldPCLQ.Status.Conditions, newPCLQ.Status.Conditions) } @@ -173,7 +198,13 @@ func hasStatusChanged(updateEvent event.UpdateEvent) bool { func hasAnyStatusReplicasChanged(oldPCLQStatus, newPCLQStatus grovecorev1alpha1.PodCliqueStatus) bool { return oldPCLQStatus.Replicas != newPCLQStatus.Replicas || oldPCLQStatus.ReadyReplicas != newPCLQStatus.ReadyReplicas || - oldPCLQStatus.ScheduleGatedReplicas != newPCLQStatus.ScheduleGatedReplicas + oldPCLQStatus.ScheduleGatedReplicas != newPCLQStatus.ScheduleGatedReplicas || + oldPCLQStatus.UpdatedReplicas != newPCLQStatus.UpdatedReplicas +} + +func hasPodCliqueHashStatusChanged(oldPCLQStatus, newPCLQStatus grovecorev1alpha1.PodCliqueStatus) bool { + return !stringPointersEqual(oldPCLQStatus.CurrentPodTemplateHash, newPCLQStatus.CurrentPodTemplateHash) || + !stringPointersEqual(oldPCLQStatus.CurrentPodCliqueSetGenerationHash, newPCLQStatus.CurrentPodCliqueSetGenerationHash) } // hasMinAvailableBreachedConditionChanged checks if the MinAvailableBreached condition has changed. @@ -189,7 +220,23 @@ func hasMinAvailableBreachedConditionChanged(oldConditions, newConditions []meta return false } -// hasUpdateStatusChanged checks if PCSG update progress has changed. -func hasUpdateStatusChanged(oldPCSGStatus, newPCSGStatus *grovecorev1alpha1.PodCliqueScalingGroupStatus) bool { - return !reflect.DeepEqual(oldPCSGStatus.UpdateProgress, newPCSGStatus.UpdateProgress) +// hasPodCliqueScalingGroupStatusChanged reports whether any reconcile-relevant fields of the PodCliqueScalingGroup status have changed. +func hasPodCliqueScalingGroupStatusChanged(oldPCSGStatus, newPCSGStatus *grovecorev1alpha1.PodCliqueScalingGroupStatus) bool { + return oldPCSGStatus.AvailableReplicas != newPCSGStatus.AvailableReplicas || + oldPCSGStatus.UpdatedReplicas != newPCSGStatus.UpdatedReplicas || + !stringPointersEqual(oldPCSGStatus.CurrentPodCliqueSetGenerationHash, newPCSGStatus.CurrentPodCliqueSetGenerationHash) || + hasUpdateStatusChanged(oldPCSGStatus.UpdateProgress, newPCSGStatus.UpdateProgress) +} + +// hasUpdateStatusChanged reports whether the update progress has changed between the old and new states. +func hasUpdateStatusChanged(oldProgress, newProgress any) bool { + return !reflect.DeepEqual(oldProgress, newProgress) +} + +// stringPointersEqual reports whether two *string values are equal, treating nil pointers as equal only to other nil pointers. +func stringPointersEqual(oldValue, newValue *string) bool { + if oldValue == nil || newValue == nil { + return oldValue == newValue + } + return *oldValue == *newValue } diff --git a/operator/internal/controller/podcliqueset/register_test.go b/operator/internal/controller/podcliqueset/register_test.go index 811aade7b..9f8341bcf 100644 --- a/operator/internal/controller/podcliqueset/register_test.go +++ b/operator/internal/controller/podcliqueset/register_test.go @@ -19,6 +19,7 @@ package podcliqueset import ( "testing" + "github.com/ai-dynamo/grove/operator/api/common/constants" grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" testutils "github.com/ai-dynamo/grove/operator/test/utils" @@ -26,6 +27,10 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" ) @@ -84,3 +89,165 @@ func TestMapClusterTopologyToPodCliqueSets(t *testing.T) { {NamespacedName: types.NamespacedName{Namespace: "team-b", Name: "pcs-b"}}, }, requests) } + +// TestPodCliqueSetPredicateUpdate verifies that the PodCliqueSet update predicate +// enqueues on generation bumps and reconcile-trigger annotation changes, and +// ignores unrelated annotation edits or no-op updates. +func TestPodCliqueSetPredicateUpdate(t *testing.T) { + pred, ok := podCliqueSetPredicate().(predicate.Funcs) + require.True(t, ok, "predicate must be predicate.Funcs") + + tests := []struct { + name string + old *grovecorev1alpha1.PodCliqueSet + new *grovecorev1alpha1.PodCliqueSet + want bool + }{ + { + name: "generation change enqueues", + old: podCliqueSetWithGenerationAndAnnotations(1, nil), + new: podCliqueSetWithGenerationAndAnnotations(2, nil), + want: true, + }, + { + name: "reconcile trigger annotation change enqueues", + old: podCliqueSetWithGenerationAndAnnotations(1, map[string]string{ + constants.AnnotationReconcileTrigger: "old", + }), + new: podCliqueSetWithGenerationAndAnnotations(1, map[string]string{ + constants.AnnotationReconcileTrigger: "new", + }), + want: true, + }, + { + name: "unrelated annotation change does not enqueue", + old: podCliqueSetWithGenerationAndAnnotations(1, map[string]string{ + "example.com/other": "old", + }), + new: podCliqueSetWithGenerationAndAnnotations(1, map[string]string{ + "example.com/other": "new", + }), + want: false, + }, + { + name: "same generation and same trigger value does not enqueue", + old: podCliqueSetWithGenerationAndAnnotations(1, map[string]string{ + constants.AnnotationReconcileTrigger: "same", + }), + new: podCliqueSetWithGenerationAndAnnotations(1, map[string]string{ + constants.AnnotationReconcileTrigger: "same", + }), + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, pred.UpdateFunc(event.UpdateEvent{ObjectOld: tt.old, ObjectNew: tt.new})) + }) + } +} + +func podCliqueSetWithGenerationAndAnnotations(generation int64, annotations map[string]string) *grovecorev1alpha1.PodCliqueSet { + return &grovecorev1alpha1.PodCliqueSet{ + ObjectMeta: metav1.ObjectMeta{ + Generation: generation, + Annotations: annotations, + }, + } +} + +// TestPodCliquePredicateStatusChangesAffectingUpdatedAccounting asserts that the +// PodClique predicate enqueues when status fields feeding rolling-update +// accounting (current hashes, updated replica count, update progress) change. +func TestPodCliquePredicateStatusChangesAffectingUpdatedAccounting(t *testing.T) { + pred, ok := podCliquePredicate().(predicate.Funcs) + require.True(t, ok, "predicate must be predicate.Funcs") + + tests := []struct { + name string + mutate func(*grovecorev1alpha1.PodClique) + }{ + { + name: "current pod template hash changes", + mutate: func(pclq *grovecorev1alpha1.PodClique) { + pclq.Status.CurrentPodTemplateHash = ptr.To("new-template-hash") + }, + }, + { + name: "current PCS generation hash changes", + mutate: func(pclq *grovecorev1alpha1.PodClique) { + pclq.Status.CurrentPodCliqueSetGenerationHash = ptr.To("new-generation-hash") + }, + }, + { + name: "updated replicas changes", + mutate: func(pclq *grovecorev1alpha1.PodClique) { + pclq.Status.UpdatedReplicas = 1 + }, + }, + { + name: "update progress changes", + mutate: func(pclq *grovecorev1alpha1.PodClique) { + pclq.Status.UpdateProgress = &grovecorev1alpha1.PodCliqueUpdateProgress{ + PodCliqueSetGenerationHash: "generation-hash", + PodTemplateHash: "template-hash", + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + oldPCLQ := testutils.NewPodCliqueBuilder("pcs", uuid.NewUUID(), "worker", "default", 0).Build() + newPCLQ := oldPCLQ.DeepCopy() + tt.mutate(newPCLQ) + + assert.True(t, pred.UpdateFunc(event.UpdateEvent{ObjectOld: oldPCLQ, ObjectNew: newPCLQ})) + }) + } +} + +// TestPodCliqueScalingGroupPredicateStatusChangesAffectingUpdatedAccounting asserts +// that the PodCliqueScalingGroup predicate enqueues on status changes relevant to +// rolling-update accounting (generation hash, updated replicas, update progress). +func TestPodCliqueScalingGroupPredicateStatusChangesAffectingUpdatedAccounting(t *testing.T) { + pred, ok := podCliqueScalingGroupPredicate().(predicate.Funcs) + require.True(t, ok, "predicate must be predicate.Funcs") + + tests := []struct { + name string + mutate func(*grovecorev1alpha1.PodCliqueScalingGroup) + }{ + { + name: "current PCS generation hash changes", + mutate: func(pcsg *grovecorev1alpha1.PodCliqueScalingGroup) { + pcsg.Status.CurrentPodCliqueSetGenerationHash = ptr.To("new-generation-hash") + }, + }, + { + name: "updated replicas changes", + mutate: func(pcsg *grovecorev1alpha1.PodCliqueScalingGroup) { + pcsg.Status.UpdatedReplicas = 1 + }, + }, + { + name: "update progress changes", + mutate: func(pcsg *grovecorev1alpha1.PodCliqueScalingGroup) { + pcsg.Status.UpdateProgress = &grovecorev1alpha1.PodCliqueScalingGroupUpdateProgress{ + PodCliqueSetGenerationHash: "generation-hash", + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + oldPCSG := &grovecorev1alpha1.PodCliqueScalingGroup{} + newPCSG := oldPCSG.DeepCopy() + tt.mutate(newPCSG) + + assert.True(t, pred.UpdateFunc(event.UpdateEvent{ObjectOld: oldPCSG, ObjectNew: newPCSG})) + }) + } +} diff --git a/operator/internal/utils/kubernetes/pod.go b/operator/internal/utils/kubernetes/pod.go index 34fb5cdfe..b71ec484a 100644 --- a/operator/internal/utils/kubernetes/pod.go +++ b/operator/internal/utils/kubernetes/pod.go @@ -19,6 +19,7 @@ package kubernetes import ( "fmt" "hash/fnv" + "sort" "github.com/go-logr/logr" "github.com/samber/lo" @@ -134,16 +135,199 @@ func HasAnyContainerNotStarted(pod *corev1.Pod) bool { return false } -// ComputeHash computes a hash given one or more corev1.PodTemplateSpec. +// ComputeHash returns a stable hash for one or more corev1.PodTemplateSpec. +// +// Each template is canonicalized before hashing so that specs representing +// the same desired state hash to the same value, even when an upstream +// controller serialized order-independent +listType=map slices (e.g. +// Containers, Volumes) in a different order. See canonicalizePodSpecForHashing +// for the field-level rules, including the slices intentionally left +// untouched because their order is part of the spec. func ComputeHash(podTemplateSpecs ...*corev1.PodTemplateSpec) string { + return computeHash(canonicalizePodTemplateSpecForHashing, nil, podTemplateSpecs...) +} + +// ComputeHashWithOrderKeys computes a hash for pod templates whose caller-level +// identity/order matters in addition to the canonicalized PodTemplateSpec +// content. orderKeys must be aligned with podTemplateSpecs by index. +func ComputeHashWithOrderKeys(orderKeys []string, podTemplateSpecs ...*corev1.PodTemplateSpec) string { + if len(orderKeys) != len(podTemplateSpecs) { + panic("ComputeHashWithOrderKeys: orderKeys length must match podTemplateSpecs length") + } + return computeHash(canonicalizePodTemplateSpecForHashing, orderKeys, podTemplateSpecs...) +} + +type podTemplateSpecHashInput struct { + OrderKey string + PodTemplateSpec *corev1.PodTemplateSpec +} + +// ComputeHashLegacy computes a hash using the pre-canonicalization byte stream. +// +// This is intentionally the v0.1.0-alpha.8 behavior: each PodTemplateSpec is +// passed directly to dump.ForHash without sorting order-independent API lists. +// It exists only for the one-release compatibility window while persisted +// legacy hashes are migrated to ComputeHash. +func ComputeHashLegacy(podTemplateSpecs ...*corev1.PodTemplateSpec) string { + return computeHash(func(podTemplateSpec *corev1.PodTemplateSpec) *corev1.PodTemplateSpec { + return podTemplateSpec + }, nil, podTemplateSpecs...) +} + +func computeHash(prepare func(*corev1.PodTemplateSpec) *corev1.PodTemplateSpec, orderKeys []string, podTemplateSpecs ...*corev1.PodTemplateSpec) string { podTemplateSpecHasher := fnv.New64a() podTemplateSpecHasher.Reset() - for _, podTemplateSpec := range podTemplateSpecs { - _, _ = fmt.Fprintf(podTemplateSpecHasher, "%v", dump.ForHash(podTemplateSpec)) + for i, podTemplateSpec := range podTemplateSpecs { + prepared := prepare(podTemplateSpec) + if orderKeys != nil { + _, _ = fmt.Fprintf(podTemplateSpecHasher, "%v", dump.ForHash(podTemplateSpecHashInput{ + OrderKey: orderKeys[i], + PodTemplateSpec: prepared, + })) + continue + } + _, _ = fmt.Fprintf(podTemplateSpecHasher, "%v", dump.ForHash(prepared)) } return rand.SafeEncodeString(fmt.Sprint(podTemplateSpecHasher.Sum64())) } +// canonicalizePodTemplateSpecForHashing returns a deep-copied PodTemplateSpec +// with its PodSpec canonicalized via canonicalizePodSpecForHashing. +func canonicalizePodTemplateSpecForHashing(in *corev1.PodTemplateSpec) *corev1.PodTemplateSpec { + if in == nil { + return nil + } + out := in.DeepCopy() + canonicalizePodSpecForHashing(&out.Spec) + return out +} + +// canonicalizePodSpecForHashing sorts the order-independent +listType=map +// slices in spec in place so that two PodSpecs representing the same desired +// state serialize identically. Each per-helper function below documents its +// own sort key. +// +// The following slices are intentionally left in their original order — +// sorting them would mis-represent a real spec change as equivalent: +// - InitContainers: +listType=map, but the field doc states init containers +// "are run in the order they appear in this list". +// - Container.Env: +listType=map, but order participates in $(VAR) +// substitution and so can change runtime values. +// - Container.Args, Container.Command: ordered argument lists. +// - Container.EnvFrom, PodSpec.Tolerations, PodSpec.ReadinessGates, +// Container.ResizePolicy, Container.RestartPolicyRules: +listType=atomic; +// the API treats the slice as a single opaque value. +func canonicalizePodSpecForHashing(spec *corev1.PodSpec) { + // Containers run in parallel and are +listType=map keyed by name — + // safe to sort. After sorting the slice, also canonicalize the + // per-container map-list fields (ports, volumeMounts, volumeDevices, + // resources.claims). + sort.SliceStable(spec.Containers, func(i, j int) bool { + return spec.Containers[i].Name < spec.Containers[j].Name + }) + for i := range spec.Containers { + canonicalizeContainerForHashing(&spec.Containers[i]) + } + + // InitContainers slice itself is NOT reordered — execution sequence + // matters — but the per-container map-list fields inside each init + // container ARE order-independent and get canonicalized. + for i := range spec.InitContainers { + canonicalizeContainerForHashing(&spec.InitContainers[i]) + } + + // EphemeralContainers are +listType=map keyed by name; ordering has no + // runtime meaning. Sort them and canonicalize their inner slices via + // the embedded EphemeralContainerCommon (same shape as Container). + sort.SliceStable(spec.EphemeralContainers, func(i, j int) bool { + return spec.EphemeralContainers[i].Name < spec.EphemeralContainers[j].Name + }) + for i := range spec.EphemeralContainers { + canonicalizeEphemeralContainerForHashing(&spec.EphemeralContainers[i]) + } + + sort.SliceStable(spec.Volumes, func(i, j int) bool { + return spec.Volumes[i].Name < spec.Volumes[j].Name + }) + sort.SliceStable(spec.ImagePullSecrets, func(i, j int) bool { + return spec.ImagePullSecrets[i].Name < spec.ImagePullSecrets[j].Name + }) + sort.SliceStable(spec.HostAliases, func(i, j int) bool { + return spec.HostAliases[i].IP < spec.HostAliases[j].IP + }) + sort.SliceStable(spec.TopologySpreadConstraints, func(i, j int) bool { + a, b := spec.TopologySpreadConstraints[i], spec.TopologySpreadConstraints[j] + if a.TopologyKey != b.TopologyKey { + return a.TopologyKey < b.TopologyKey + } + return string(a.WhenUnsatisfiable) < string(b.WhenUnsatisfiable) + }) + sort.SliceStable(spec.ResourceClaims, func(i, j int) bool { + return spec.ResourceClaims[i].Name < spec.ResourceClaims[j].Name + }) + sort.SliceStable(spec.SchedulingGates, func(i, j int) bool { + return spec.SchedulingGates[i].Name < spec.SchedulingGates[j].Name + }) +} + +// canonicalizeContainerForHashing sorts the order-independent +listType=map +// slices inside a single Container in place: Ports, VolumeMounts, +// VolumeDevices, Resources.Claims. Order-significant slices (Env, EnvFrom, +// Args, Command) are intentionally left untouched. +func canonicalizeContainerForHashing(c *corev1.Container) { + canonicalizePortsForHashing(c.Ports) + canonicalizeVolumeMountsForHashing(c.VolumeMounts) + canonicalizeVolumeDevicesForHashing(c.VolumeDevices) + canonicalizeResourceClaimsForHashing(c.Resources.Claims) +} + +// canonicalizeEphemeralContainerForHashing applies the same canonicalization +// as canonicalizeContainerForHashing to the embedded EphemeralContainerCommon +// inside an EphemeralContainer. +func canonicalizeEphemeralContainerForHashing(ec *corev1.EphemeralContainer) { + canonicalizePortsForHashing(ec.Ports) + canonicalizeVolumeMountsForHashing(ec.VolumeMounts) + canonicalizeVolumeDevicesForHashing(ec.VolumeDevices) + canonicalizeResourceClaimsForHashing(ec.Resources.Claims) +} + +// canonicalizePortsForHashing sorts ContainerPort entries by their composite +// API key (containerPort, protocol). Container.Ports is +listType=map. +func canonicalizePortsForHashing(ports []corev1.ContainerPort) { + sort.SliceStable(ports, func(p, q int) bool { + a, b := ports[p], ports[q] + if a.ContainerPort != b.ContainerPort { + return a.ContainerPort < b.ContainerPort + } + return string(a.Protocol) < string(b.Protocol) + }) +} + +// canonicalizeVolumeMountsForHashing sorts VolumeMount entries by mountPath. +// Container.VolumeMounts is +listType=map +listMapKey=mountPath; order has +// no runtime meaning. +func canonicalizeVolumeMountsForHashing(mounts []corev1.VolumeMount) { + sort.SliceStable(mounts, func(i, j int) bool { + return mounts[i].MountPath < mounts[j].MountPath + }) +} + +// canonicalizeVolumeDevicesForHashing sorts VolumeDevice entries by +// devicePath. Container.VolumeDevices is +listType=map +listMapKey=devicePath. +func canonicalizeVolumeDevicesForHashing(devices []corev1.VolumeDevice) { + sort.SliceStable(devices, func(i, j int) bool { + return devices[i].DevicePath < devices[j].DevicePath + }) +} + +// canonicalizeResourceClaimsForHashing sorts ResourceClaim entries by name. +// Container.Resources.Claims is +listType=map +listMapKey=name. +func canonicalizeResourceClaimsForHashing(claims []corev1.ResourceClaim) { + sort.SliceStable(claims, func(i, j int) bool { + return claims[i].Name < claims[j].Name + }) +} + // GetContainerStatusIfTerminatedErroneously gets the first occurrence of corev1.ContainerStatus (across init, sidecar and main containers) // that has a non-zero LastTerminationState.Terminated.ExitCode. The reason to choose `containerStatus.LastTerminationState` instead of `containerStatus.State` is that // the `containerStatus.State` oscillates between waiting and terminating in case of containers exiting with non-zero exit code, while the `containerStatus.LastTerminationState` diff --git a/operator/internal/utils/kubernetes/pod_test.go b/operator/internal/utils/kubernetes/pod_test.go index 64c116539..068b89c5d 100644 --- a/operator/internal/utils/kubernetes/pod_test.go +++ b/operator/internal/utils/kubernetes/pod_test.go @@ -776,3 +776,529 @@ func TestCategorizePodsByConditionType(t *testing.T) { assert.Len(t, categories[PodStartedButNotReady], 1) assert.Equal(t, "started-not-ready-pod", categories[PodStartedButNotReady][0].Name) } + +// TestComputeHash_CanonicalizesListTypeMapSlices verifies the foundational +// behavior that ComputeHash canonicalizes Kubernetes API +listType=map +// slices in PodSpec before hashing, so two PodTemplateSpecs that represent +// the same desired state hash to the same value regardless of the order in +// which an upstream serializer emitted the slices. +func TestComputeHash_CanonicalizesListTypeMapSlices(t *testing.T) { + mainContainer := corev1.Container{ + Name: "main", + Image: "nvcr.io/example/runtime:v1", + Ports: []corev1.ContainerPort{ + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + }, + } + sidecar := corev1.Container{Name: "sidecar", Image: "sidecar:latest"} + + canonical := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{mainContainer, sidecar}, + Volumes: []corev1.Volume{ + {Name: "model-cache"}, + {Name: "shared-memory"}, + }, + ImagePullSecrets: []corev1.LocalObjectReference{ + {Name: "docker-imagepullsecret"}, + {Name: "nvcr-imagepullsecret"}, + }, + }, + } + + mainContainerWithSwappedPorts := mainContainer + mainContainerWithSwappedPorts.Ports = []corev1.ContainerPort{ + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + } + + shuffled := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{sidecar, mainContainerWithSwappedPorts}, + Volumes: []corev1.Volume{ + {Name: "shared-memory"}, + {Name: "model-cache"}, + }, + ImagePullSecrets: []corev1.LocalObjectReference{ + {Name: "nvcr-imagepullsecret"}, + {Name: "docker-imagepullsecret"}, + }, + }, + } + + canonicalHash := ComputeHash(canonical) + shuffledHash := ComputeHash(shuffled) + + assert.Equal(t, canonicalHash, shuffledHash, + "ComputeHash must canonicalize +listType=map PodSpec slices so that the same desired state always produces the same hash, regardless of upstream serialization order") +} + +func TestComputeHashWithOrderKeys(t *testing.T) { + t.Run("same_templates_with_different_order_keys_change_hash", func(t *testing.T) { + podTemplateSpec := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "runtime:v1"}}, + }, + } + + assert.NotEqual(t, + ComputeHashWithOrderKeys([]string{"database", "app"}, podTemplateSpec, podTemplateSpec), + ComputeHashWithOrderKeys([]string{"app", "database"}, podTemplateSpec, podTemplateSpec), + "order keys must participate in the hash when caller-level order carries semantics") + }) + + t.Run("canonicalizes_templates_before_hashing_order_key_inputs", func(t *testing.T) { + mainContainer := corev1.Container{ + Name: "main", + Image: "runtime:v1", + Ports: []corev1.ContainerPort{ + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + }, + } + sidecar := corev1.Container{Name: "sidecar", Image: "sidecar:v1"} + + canonical := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{mainContainer, sidecar}, + Volumes: []corev1.Volume{ + {Name: "cache"}, + {Name: "scratch"}, + }, + }, + } + + mainContainerWithSwappedPorts := mainContainer + mainContainerWithSwappedPorts.Ports = []corev1.ContainerPort{ + {Name: "metrics", ContainerPort: 9090, Protocol: corev1.ProtocolTCP}, + {Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP}, + } + + shuffled := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{sidecar, mainContainerWithSwappedPorts}, + Volumes: []corev1.Volume{ + {Name: "scratch"}, + {Name: "cache"}, + }, + }, + } + + assert.Equal(t, + ComputeHashWithOrderKeys([]string{"database"}, canonical), + ComputeHashWithOrderKeys([]string{"database"}, shuffled), + "order-key hashing must still canonicalize order-independent PodSpec slices") + }) + + t.Run("order_key_count_must_match_template_count", func(t *testing.T) { + podTemplateSpec := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "runtime:v1"}}, + }, + } + + assert.Panics(t, func() { + _ = ComputeHashWithOrderKeys([]string{"database"}, podTemplateSpec, podTemplateSpec) + }, "order-key hashing must reject misaligned inputs") + }) +} + +func TestComputeHashLegacy_DivergesOnReorderedListTypeMapSlices(t *testing.T) { + mainContainer := corev1.Container{Name: "main", Image: "main:v1"} + sidecar := corev1.Container{Name: "sidecar", Image: "sidecar:v1"} + + canonical := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{mainContainer, sidecar}, + Volumes: []corev1.Volume{ + {Name: "cache"}, + {Name: "shared"}, + }, + }, + } + shuffled := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{sidecar, mainContainer}, + Volumes: []corev1.Volume{ + {Name: "shared"}, + {Name: "cache"}, + }, + }, + } + + assert.Equal(t, ComputeHash(canonical), ComputeHash(shuffled), + "canonical hashing should treat order-independent map-list reorders as the same desired spec") + assert.NotEqual(t, ComputeHashLegacy(canonical), ComputeHashLegacy(shuffled), + "legacy hashing must preserve the pre-canonical byte stream during the compatibility window") +} + +// TestComputeHash_DoesNotCanonicalizeOrderSensitiveSlices documents the +// boundary of the canonicalization: slices whose runtime behavior depends +// on their order must remain order-sensitive in the hash, regardless of +// listType. +// - Container.Env is +listType=map by name, but order participates in +// $(VAR) substitution; reordering can change runtime values. +// - PodSpec.InitContainers is +listType=map by name, but the API doc +// states init containers "are run in the order they appear in this +// list"; reordering changes startup semantics. +// - Container.ResizePolicy is +listType=atomic; order is part of the +// atomic value the API treats as opaque. +// +// Reordering any of these is a real spec change and ComputeHash must +// continue to reflect it. +func TestComputeHash_DoesNotCanonicalizeOrderSensitiveSlices(t *testing.T) { + mkSpec := func(envs []corev1.EnvVar, initContainers []corev1.Container) *corev1.PodTemplateSpec { + return &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "main", Image: "main:v1", Env: envs}, + }, + InitContainers: initContainers, + }, + } + } + + t.Run("env_var_reorder_changes_hash", func(t *testing.T) { + envsA := []corev1.EnvVar{ + {Name: "FOO", Value: "1"}, + {Name: "BAR", Value: "$(FOO)"}, + } + envsB := []corev1.EnvVar{ + {Name: "BAR", Value: "$(FOO)"}, + {Name: "FOO", Value: "1"}, + } + assert.NotEqual(t, ComputeHash(mkSpec(envsA, nil)), ComputeHash(mkSpec(envsB, nil)), + "Container.Env is +listType=map by name but order participates in $(VAR) substitution") + }) + + t.Run("init_container_reorder_changes_hash", func(t *testing.T) { + initA := corev1.Container{Name: "init-a", Image: "init-a:v1"} + initB := corev1.Container{Name: "init-b", Image: "init-b:v1"} + assert.NotEqual(t, + ComputeHash(mkSpec(nil, []corev1.Container{initA, initB})), + ComputeHash(mkSpec(nil, []corev1.Container{initB, initA})), + "InitContainers run in slice order — reorder changes runtime behavior and must change the hash") + }) + + t.Run("resize_policy_reorder_changes_hash", func(t *testing.T) { + rpCPU := corev1.ContainerResizePolicy{ResourceName: corev1.ResourceCPU, RestartPolicy: corev1.NotRequired} + rpMem := corev1.ContainerResizePolicy{ResourceName: corev1.ResourceMemory, RestartPolicy: corev1.RestartContainer} + assert.NotEqual(t, + ComputeHash(&corev1.PodTemplateSpec{Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", ResizePolicy: []corev1.ContainerResizePolicy{rpCPU, rpMem}}}, + }}), + ComputeHash(&corev1.PodTemplateSpec{Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "main", Image: "main:v1", ResizePolicy: []corev1.ContainerResizePolicy{rpMem, rpCPU}}}, + }}), + "Container.ResizePolicy is +listType=atomic and must remain order-sensitive") + }) +} + +// TestComputeHash_DoesNotMutateInput is a regression test for the +// canonicalization implementation: ComputeHash must operate on a deep copy +// and leave caller-owned PodTemplateSpec slices untouched. +func TestComputeHash_DoesNotMutateInput(t *testing.T) { + in := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "z-main"}, + {Name: "a-sidecar"}, + }, + Volumes: []corev1.Volume{ + {Name: "z-vol"}, + {Name: "a-vol"}, + }, + }, + } + + containerNamesBefore := []string{in.Spec.Containers[0].Name, in.Spec.Containers[1].Name} + volumeNamesBefore := []string{in.Spec.Volumes[0].Name, in.Spec.Volumes[1].Name} + + _ = ComputeHash(in) + + assert.Equal(t, containerNamesBefore, []string{in.Spec.Containers[0].Name, in.Spec.Containers[1].Name}, + "ComputeHash must not reorder the caller's Containers slice") + assert.Equal(t, volumeNamesBefore, []string{in.Spec.Volumes[0].Name, in.Spec.Volumes[1].Name}, + "ComputeHash must not reorder the caller's Volumes slice") +} + +// TestComputeHash_AdditionalListTypeMapSlices pins the sort-invariance for +// every +listType=map slice the canonicalizer touches that isn't covered by +// TestComputeHash_CanonicalizesListTypeMapSlices: VolumeMounts, VolumeDevices, +// HostAliases, TopologySpreadConstraints, ResourceClaims, SchedulingGates, +// Container.Resources.Claims, and EphemeralContainers. +// +// Note: There was a bug report where VolumeMounts was one of the +// slices that flipped the hash on every Dynamo-operator-driven PCS update; +// this is the regression test for that case. +func TestComputeHash_AdditionalListTypeMapSlices(t *testing.T) { + withSpec := func(mut func(*corev1.PodSpec)) *corev1.PodTemplateSpec { + s := &corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "main", Image: "main:v1"}}}} + mut(&s.Spec) + return s + } + + t.Run("volume_mount_reorder_does_not_change_hash", func(t *testing.T) { + mountA := corev1.VolumeMount{Name: "model-cache", MountPath: "/opt/model-cache"} + mountB := corev1.VolumeMount{Name: "shared-memory", MountPath: "/dev/shm"} + mountC := corev1.VolumeMount{Name: "scratch", MountPath: "/var/scratch"} + + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.Containers[0].VolumeMounts = []corev1.VolumeMount{mountA, mountB, mountC} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.Containers[0].VolumeMounts = []corev1.VolumeMount{mountC, mountA, mountB} + })) + assert.Equal(t, hashA, hashB, + "Container.VolumeMounts is +listType=map keyed by mountPath — order must not affect the hash") + }) + + t.Run("volume_device_reorder_does_not_change_hash", func(t *testing.T) { + devA := corev1.VolumeDevice{Name: "raw-a", DevicePath: "/dev/xvda"} + devB := corev1.VolumeDevice{Name: "raw-b", DevicePath: "/dev/xvdb"} + + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.Containers[0].VolumeDevices = []corev1.VolumeDevice{devA, devB} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.Containers[0].VolumeDevices = []corev1.VolumeDevice{devB, devA} + })) + assert.Equal(t, hashA, hashB, + "Container.VolumeDevices is +listType=map keyed by devicePath — order must not affect the hash") + }) + + t.Run("host_alias_reorder_does_not_change_hash", func(t *testing.T) { + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.HostAliases = []corev1.HostAlias{ + {IP: "10.0.0.1", Hostnames: []string{"a.example"}}, + {IP: "10.0.0.2", Hostnames: []string{"b.example"}}, + } + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.HostAliases = []corev1.HostAlias{ + {IP: "10.0.0.2", Hostnames: []string{"b.example"}}, + {IP: "10.0.0.1", Hostnames: []string{"a.example"}}, + } + })) + assert.Equal(t, hashA, hashB, + "PodSpec.HostAliases is +listType=map keyed by ip — order must not affect the hash") + }) + + t.Run("topology_spread_constraint_reorder_does_not_change_hash", func(t *testing.T) { + c1 := corev1.TopologySpreadConstraint{ + MaxSkew: 1, + TopologyKey: "topology.kubernetes.io/zone", + WhenUnsatisfiable: corev1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"role": "worker"}}, + } + c2 := corev1.TopologySpreadConstraint{ + MaxSkew: 1, + TopologyKey: "kubernetes.io/hostname", + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"role": "worker"}}, + } + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{c1, c2} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{c2, c1} + })) + assert.Equal(t, hashA, hashB, + "PodSpec.TopologySpreadConstraints is +listType=map keyed by (topologyKey, whenUnsatisfiable) — order must not affect the hash") + }) + + t.Run("resource_claim_reorder_does_not_change_hash", func(t *testing.T) { + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.ResourceClaims = []corev1.PodResourceClaim{ + {Name: "claim-a"}, + {Name: "claim-b"}, + } + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.ResourceClaims = []corev1.PodResourceClaim{ + {Name: "claim-b"}, + {Name: "claim-a"}, + } + })) + assert.Equal(t, hashA, hashB, + "PodSpec.ResourceClaims is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("scheduling_gate_reorder_does_not_change_hash", func(t *testing.T) { + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.SchedulingGates = []corev1.PodSchedulingGate{ + {Name: "gate-a"}, + {Name: "gate-b"}, + } + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.SchedulingGates = []corev1.PodSchedulingGate{ + {Name: "gate-b"}, + {Name: "gate-a"}, + } + })) + assert.Equal(t, hashA, hashB, + "PodSpec.SchedulingGates is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("container_resource_claim_reorder_does_not_change_hash", func(t *testing.T) { + claimA := corev1.ResourceClaim{Name: "claim-a"} + claimB := corev1.ResourceClaim{Name: "claim-b"} + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.Containers[0].Resources.Claims = []corev1.ResourceClaim{claimA, claimB} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.Containers[0].Resources.Claims = []corev1.ResourceClaim{claimB, claimA} + })) + assert.Equal(t, hashA, hashB, + "Container.Resources.Claims is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("ephemeral_container_reorder_does_not_change_hash", func(t *testing.T) { + ecA := corev1.EphemeralContainer{ + EphemeralContainerCommon: corev1.EphemeralContainerCommon{Name: "debug-a", Image: "busybox"}, + } + ecB := corev1.EphemeralContainer{ + EphemeralContainerCommon: corev1.EphemeralContainerCommon{Name: "debug-b", Image: "busybox"}, + } + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.EphemeralContainers = []corev1.EphemeralContainer{ecA, ecB} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.EphemeralContainers = []corev1.EphemeralContainer{ecB, ecA} + })) + assert.Equal(t, hashA, hashB, + "PodSpec.EphemeralContainers is +listType=map keyed by name — order must not affect the hash") + }) + + t.Run("ephemeral_container_inner_slices_canonicalized", func(t *testing.T) { + // EphemeralContainerCommon has the same shape as Container. Reorder + // its VolumeMounts and Ports — the hash must still be stable. + ec := func(mounts []corev1.VolumeMount, ports []corev1.ContainerPort) corev1.EphemeralContainer { + return corev1.EphemeralContainer{EphemeralContainerCommon: corev1.EphemeralContainerCommon{ + Name: "debug", Image: "busybox", VolumeMounts: mounts, Ports: ports, + }} + } + mountA := corev1.VolumeMount{Name: "a", MountPath: "/a"} + mountB := corev1.VolumeMount{Name: "b", MountPath: "/b"} + portA := corev1.ContainerPort{Name: "p1", ContainerPort: 8000, Protocol: corev1.ProtocolTCP} + portB := corev1.ContainerPort{Name: "p2", ContainerPort: 9090, Protocol: corev1.ProtocolTCP} + + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.EphemeralContainers = []corev1.EphemeralContainer{ec([]corev1.VolumeMount{mountA, mountB}, []corev1.ContainerPort{portA, portB})} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.EphemeralContainers = []corev1.EphemeralContainer{ec([]corev1.VolumeMount{mountB, mountA}, []corev1.ContainerPort{portB, portA})} + })) + assert.Equal(t, hashA, hashB, + "EphemeralContainer's inner +listType=map slices must also be canonicalized") + }) + + t.Run("init_container_inner_slices_canonicalized", func(t *testing.T) { + // InitContainers slice ORDER is significant (execution sequence), + // but the +listType=map slices INSIDE each init container are not. + mountA := corev1.VolumeMount{Name: "a", MountPath: "/a"} + mountB := corev1.VolumeMount{Name: "b", MountPath: "/b"} + makeInit := func(mounts []corev1.VolumeMount) corev1.Container { + return corev1.Container{Name: "init-1", Image: "busybox", VolumeMounts: mounts} + } + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.InitContainers = []corev1.Container{makeInit([]corev1.VolumeMount{mountA, mountB})} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.InitContainers = []corev1.Container{makeInit([]corev1.VolumeMount{mountB, mountA})} + })) + assert.Equal(t, hashA, hashB, + "InitContainer.VolumeMounts is order-independent even though InitContainers slice itself is order-significant") + }) + + t.Run("init_container_inner_resource_claims_canonicalized", func(t *testing.T) { + claimA := corev1.ResourceClaim{Name: "claim-a"} + claimB := corev1.ResourceClaim{Name: "claim-b"} + makeInit := func(claims []corev1.ResourceClaim) corev1.Container { + return corev1.Container{Name: "init-1", Image: "busybox", Resources: corev1.ResourceRequirements{Claims: claims}} + } + hashA := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.InitContainers = []corev1.Container{makeInit([]corev1.ResourceClaim{claimA, claimB})} + })) + hashB := ComputeHash(withSpec(func(s *corev1.PodSpec) { + s.InitContainers = []corev1.Container{makeInit([]corev1.ResourceClaim{claimB, claimA})} + })) + assert.Equal(t, hashA, hashB, + "InitContainer.Resources.Claims is order-independent even though InitContainers slice itself is order-significant") + }) +} + +// TestComputeHash_RealSpecChangesStillFlipHash is a regression test in the +// other direction: canonicalization must NOT mask actual desired-state +// changes. If a future contributor accidentally over-canonicalizes — e.g. +// sorts Env, or normalizes a value into its zero — rolling updates would +// silently stop working, and that would be invisible without this kind of +// test. +func TestComputeHash_RealSpecChangesStillFlipHash(t *testing.T) { + base := &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "main", Image: "main:v1", Env: []corev1.EnvVar{{Name: "FOO", Value: "1"}}}, + }, + Volumes: []corev1.Volume{{Name: "cache"}}, + }, + } + baseHash := ComputeHash(base) + + t.Run("image_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.Containers[0].Image = "main:v2" + assert.NotEqual(t, baseHash, ComputeHash(mut), "container image change must flip the hash") + }) + + t.Run("env_value_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.Containers[0].Env[0].Value = "2" + assert.NotEqual(t, baseHash, ComputeHash(mut), "env var value change must flip the hash") + }) + + t.Run("new_env_var_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.Containers[0].Env = append(mut.Spec.Containers[0].Env, corev1.EnvVar{Name: "BAR", Value: "1"}) + assert.NotEqual(t, baseHash, ComputeHash(mut), "adding an env var must flip the hash") + }) + + t.Run("new_volume_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.Volumes = append(mut.Spec.Volumes, corev1.Volume{Name: "scratch"}) + assert.NotEqual(t, baseHash, ComputeHash(mut), "adding a volume must flip the hash") + }) + + t.Run("new_container_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.Containers = append(mut.Spec.Containers, corev1.Container{Name: "sidecar", Image: "sidecar:v1"}) + assert.NotEqual(t, baseHash, ComputeHash(mut), "adding a container must flip the hash") + }) + + t.Run("volume_mount_path_change_flips_hash", func(t *testing.T) { + mut := base.DeepCopy() + mut.Spec.Containers[0].VolumeMounts = []corev1.VolumeMount{{Name: "cache", MountPath: "/old"}} + mutHash := ComputeHash(mut) + + mut2 := base.DeepCopy() + mut2.Spec.Containers[0].VolumeMounts = []corev1.VolumeMount{{Name: "cache", MountPath: "/new"}} + assert.NotEqual(t, mutHash, ComputeHash(mut2), "changing a volumeMount's mountPath must flip the hash") + }) +} + +// TestComputeHash_NilSafety pins that ComputeHash and the canonicalizer +// helpers tolerate a nil PodTemplateSpec without panicking. +func TestComputeHash_NilSafety(t *testing.T) { + assert.NotPanics(t, func() { _ = ComputeHash(nil) }, + "ComputeHash(nil) must not panic") + + assert.NotPanics(t, func() { + _ = ComputeHash(nil, &corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "x"}}}}) + }, "ComputeHash with a nil entry mixed in must not panic") + + assert.NotPanics(t, func() { + canonicalizePodSpecForHashing(&corev1.PodSpec{}) // empty / zero-valued + }, "canonicalizing an empty PodSpec must not panic") +}