From e4d54661c8eee4248b801c3ffc6f1b908e658c2e Mon Sep 17 00:00:00 2001
From: Viktor Karpochev
Date: Thu, 23 Apr 2026 18:30:01 +1000
Subject: [PATCH 01/14] feat: Rook CephCluster and csi-ceph StorageClass provisioning

Add kubernetes helpers for CephCluster, CephBlockPool, rook-config-override,
Ceph credentials, CephClusterConnection/Authentication, CephStorageClass,
VolumeSnapshotClass, and OSD backing StorageClass resolution. Add
testkit.EnsureCephStorageClass, which orchestrates everything from module
enablement to a working csi-ceph StorageClass, plus a csi-ceph e2e test
package.

Signed-off-by: Viktor Karpochev
Made-with: Cursor
Signed-off-by: Viktor Karpochev
---
 pkg/kubernetes/cephblockpool.go         | 218 ++++++++++++
 pkg/kubernetes/cephcluster.go           | 395 +++++++++++++++++++++
 pkg/kubernetes/cephclusterconnection.go | 273 +++++++++++++++
 pkg/kubernetes/cephcredentials.go       | 183 ++++++++++
 pkg/kubernetes/cephstorageclass.go      | 230 ++++++++++++
 pkg/kubernetes/rookconfigoverride.go    | 140 ++++++++
 pkg/kubernetes/storageclass_manage.go   | 100 ++++++
 pkg/kubernetes/volumesnapshotclass.go   | 125 +++++++
 pkg/testkit/ceph.go                     | 441 ++++++++++++++++++++++++
 tests/csi-ceph/cluster_config.yml       |  56 +++
 tests/csi-ceph/csi_ceph_suite_test.go   |  46 +++
 tests/csi-ceph/csi_ceph_test.go         | 127 +++++++
 12 files changed, 2334 insertions(+)
 create mode 100644 pkg/kubernetes/cephblockpool.go
 create mode 100644 pkg/kubernetes/cephcluster.go
 create mode 100644 pkg/kubernetes/cephclusterconnection.go
 create mode 100644 pkg/kubernetes/cephcredentials.go
 create mode 100644 pkg/kubernetes/cephstorageclass.go
 create mode 100644 pkg/kubernetes/rookconfigoverride.go
 create mode 100644 pkg/kubernetes/storageclass_manage.go
 create mode 100644 pkg/kubernetes/volumesnapshotclass.go
 create mode 100644 pkg/testkit/ceph.go
 create mode 100644 tests/csi-ceph/cluster_config.yml
 create mode 100644 tests/csi-ceph/csi_ceph_suite_test.go
 create mode 100644 tests/csi-ceph/csi_ceph_test.go

diff --git a/pkg/kubernetes/cephblockpool.go b/pkg/kubernetes/cephblockpool.go
new file mode 100644
index 0000000..1d112e5
--- /dev/null
+++ b/pkg/kubernetes/cephblockpool.go
@@ -0,0 +1,218 @@
+/*
+Copyright 2025 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/rest"
+
+	"github.com/deckhouse/storage-e2e/internal/logger"
+)
+
+// CephBlockPoolGVR is the GroupVersionResource of Rook's CephBlockPool.
+var CephBlockPoolGVR = schema.GroupVersionResource{
+	Group:    "ceph.rook.io",
+	Version:  "v1",
+	Resource: "cephblockpools",
+}
+
+// CephBlockPoolConfig describes a minimal replicated or erasure-coded Ceph
+// RBD pool managed by Rook. At most one of ReplicaSize or ErasureCoded
+// should be set; leaving both unset yields a single-replica pool suitable
+// for single-node test clusters.
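+//
+// A minimal usage sketch (values are illustrative; `restCfg` stands in for
+// the *rest.Config the test harness already holds):
+//
+//	err := CreateCephBlockPool(ctx, restCfg, CephBlockPoolConfig{
+//		Name:      "ceph-rbd-r1",
+//		Namespace: DefaultRookNamespace, // "d8-sds-elastic"
+//	})
+//	// ReplicaSize defaults to 1 with requireSafeReplicaSize=false,
+//	// the cheapest layout for a single-node e2e cluster.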
+type CephBlockPoolConfig struct { + // Name of the CephBlockPool CR (also becomes the Ceph pool name). + Name string + + // Namespace the Rook operator watches (typically "d8-sds-elastic"). + Namespace string + + // FailureDomain is the CRUSH failure domain: "host" or "osd" (default: "host"). + FailureDomain string + + // --- Replicated pool knobs (used when ErasureCoded is nil) --- + + // ReplicaSize is the number of object copies. Default: 1. + ReplicaSize int + + // RequireSafeReplicaSize toggles Ceph's safeguard against single-replica + // pools. When nil, it is set to `false` for ReplicaSize==1 (unsafe single + // replica, accepted for e2e test clusters) and left unset otherwise. + RequireSafeReplicaSize *bool + + // --- Erasure-coded pool knobs --- + + // ErasureCoded, when non-nil, produces an EC pool instead of a replicated + // one. Its fields map to `spec.erasureCoded.{dataChunks,codingChunks}`. + ErasureCoded *CephBlockPoolErasureCoded +} + +// CephBlockPoolErasureCoded configures a Ceph erasure-coded RBD pool. +type CephBlockPoolErasureCoded struct { + DataChunks int + CodingChunks int +} + +// CreateCephBlockPool creates (or updates, if already present) a CephBlockPool +// in the given namespace from the provided configuration. It is idempotent and +// safe to call on every test run. +func CreateCephBlockPool(ctx context.Context, kubeconfig *rest.Config, cfg CephBlockPoolConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephBlockPool name is required") + } + if cfg.Namespace == "" { + return fmt.Errorf("CephBlockPool namespace is required") + } + if cfg.ErasureCoded == nil && cfg.ReplicaSize <= 0 { + cfg.ReplicaSize = 1 + } + if cfg.FailureDomain == "" { + cfg.FailureDomain = "host" + } + + spec := map[string]interface{}{ + "failureDomain": cfg.FailureDomain, + } + + if cfg.ErasureCoded != nil { + if cfg.ErasureCoded.DataChunks <= 0 || cfg.ErasureCoded.CodingChunks <= 0 { + return fmt.Errorf("ErasureCoded pool requires positive dataChunks and codingChunks") + } + spec["erasureCoded"] = map[string]interface{}{ + "dataChunks": int64(cfg.ErasureCoded.DataChunks), + "codingChunks": int64(cfg.ErasureCoded.CodingChunks), + } + } else { + replicated := map[string]interface{}{ + "size": int64(cfg.ReplicaSize), + } + requireSafe := cfg.RequireSafeReplicaSize + if requireSafe == nil && cfg.ReplicaSize == 1 { + f := false + requireSafe = &f + } + if requireSafe != nil { + replicated["requireSafeReplicaSize"] = *requireSafe + } + spec["replicated"] = replicated + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephBlockPool", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephBlockPool %s/%s", cfg.Namespace, cfg.Name) + _, err = dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephBlockPool %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephBlockPool %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Get(ctx, 
cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// WaitForCephBlockPoolReady blocks until the CephBlockPool reports +// `status.phase == "Ready"`. Rook transitions the pool from Progressing to +// Ready once the Ceph OSDs have accepted the new pool and its CRUSH rule. +func WaitForCephBlockPoolReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if namespace == "" || name == "" { + return fmt.Errorf("namespace and name are required") + } + + logger.Debug("Waiting for CephBlockPool %s/%s to become Ready (timeout: %v)", namespace, name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + if phase == "Ready" { + logger.Success("CephBlockPool %s/%s is Ready", namespace, name) + return nil + } + logger.Debug("CephBlockPool %s/%s phase: %q, waiting...", namespace, name, phase) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephBlockPool %s/%s: %v", namespace, name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephBlockPool %s/%s: %w", namespace, name, ctx.Err()) + case <-ticker.C: + } + } +} + +// DeleteCephBlockPool deletes a CephBlockPool. Safe to call if the pool does +// not exist. +func DeleteCephBlockPool(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephBlockPool %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephBlockPool %s/%s", namespace, name) + return nil +} diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go new file mode 100644 index 0000000..d6a1ad8 --- /dev/null +++ b/pkg/kubernetes/cephcluster.go @@ -0,0 +1,395 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephClusterGVR is the GroupVersionResource of Rook's CephCluster. +var CephClusterGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephclusters", +} + +// Defaults shared between CephClusterConfig and the testkit-level helper. +const ( + DefaultRookNamespace = "d8-sds-elastic" + DefaultCephClusterName = "ceph-cluster" + DefaultCephImage = "quay.io/ceph/ceph:v18.2.7" + DefaultDataDirHostPath = "/var/lib/rook" + DefaultOSDStorageClassSize = "20Gi" +) + +// CephClusterConfig describes a Rook-managed Ceph cluster suitable for e2e +// testing. It is intentionally narrower than Rook's native CephCluster CRD: +// knobs that don't matter for our scenarios are hidden behind hard-coded +// defaults (mirroring the values from the internal Flant wiki instruction +// on deploying sds-elastic + Rook + Ceph on LVM). +type CephClusterConfig struct { + // Name of the CephCluster (default: "ceph-cluster"). + Name string + + // Namespace where Rook watches (default: "d8-sds-elastic"). + Namespace string + + // CephImage is the Ceph container image tag. + // Default: "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // AllowUnsupportedCephVersion flips spec.cephVersion.allowUnsupported. + // Default: true (e2e clusters are allowed to run any version Ceph ships). + AllowUnsupportedCephVersion *bool + + // MonCount / MgrCount are the Rook mon/mgr replica counts. Defaults: + // 1 / 1, which is appropriate for single-node / tiny test clusters. + MonCount int + MgrCount int + + // AllowMultipleMonPerNode allows multiple mons on the same node + // (required for single-node clusters). Default: true. + AllowMultipleMonPerNode *bool + + // DataDirHostPath is where Rook persists mon/OSD data on each node. + // Default: "/var/lib/rook". + DataDirHostPath string + + // NetworkProvider selects the Rook networking mode. Supported values: + // "" — default CNI pod network (suitable for in-cluster e2e); + // "host" — host networking (matches the Flant wiki production layout). + NetworkProvider string + + // PublicNetworkCIDRs / ClusterNetworkCIDRs are the public/cluster CIDRs + // plumbed into `spec.network.addressRanges` when NetworkProvider is + // non-empty. They are ignored for the default (CNI) mode. + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // --- OSD backing --- + + // OSDStorageClass is the name of a k8s StorageClass able to hand out + // block-mode PVCs. Those PVCs are used by Rook's + // `storage.storageClassDeviceSets` to back OSDs. + OSDStorageClass string + + // OSDCount is the number of OSDs to provision (default: 1). + OSDCount int + + // OSDSize is the size of each OSD PVC (default: "20Gi"). + OSDSize string + + // OSDDeviceSetName is the `storageClassDeviceSets[].name` (default: + // "set1"). Changing it is useful mostly for debugging. 
+ OSDDeviceSetName string +} + +func (c *CephClusterConfig) applyDefaults() { + if c.Name == "" { + c.Name = DefaultCephClusterName + } + if c.Namespace == "" { + c.Namespace = DefaultRookNamespace + } + if c.CephImage == "" { + c.CephImage = DefaultCephImage + } + if c.AllowUnsupportedCephVersion == nil { + t := true + c.AllowUnsupportedCephVersion = &t + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.AllowMultipleMonPerNode == nil { + t := true + c.AllowMultipleMonPerNode = &t + } + if c.DataDirHostPath == "" { + c.DataDirHostPath = DefaultDataDirHostPath + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = DefaultOSDStorageClassSize + } + if c.OSDDeviceSetName == "" { + c.OSDDeviceSetName = "set1" + } +} + +// CreateCephCluster creates (or updates) a CephCluster in the given namespace. +// It is idempotent: if the resource already exists, its spec is overwritten +// with the freshly-rendered one so callers can tweak `CephClusterConfig` and +// re-apply without manual cleanup. +func CreateCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterConfig) error { + cfg.applyDefaults() + + if cfg.OSDStorageClass == "" { + return fmt.Errorf("CephCluster requires OSDStorageClass (backing StorageClass for OSD PVCs)") + } + + spec := buildCephClusterSpec(cfg) + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephCluster", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephCluster %s/%s (image=%s, mon=%d, mgr=%d, osd=%d x %s on SC %s)", + cfg.Namespace, cfg.Name, cfg.CephImage, cfg.MonCount, cfg.MgrCount, cfg.OSDCount, cfg.OSDSize, cfg.OSDStorageClass) + + _, err = dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephCluster %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephCluster %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// buildCephClusterSpec renders the spec portion of a CephCluster object. The +// choice of fields follows the Flant internal wiki instruction for +// sds-elastic + Rook + Ceph, stripped down to the parts that matter in e2e: +// - mon/mgr counts come from the config (1/1 by default for single-node); +// - network.provider=host is opt-in via NetworkProvider; +// - OSDs are backed by one `storageClassDeviceSets[0]` entry that points +// to a user-supplied StorageClass capable of issuing block-mode PVCs. 
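+//
+// With the defaults applied, the rendered spec is roughly equivalent to this
+// (abridged) YAML, where OSDStorageClass is the caller-supplied class:
+//
+//	cephVersion:
+//	  image: quay.io/ceph/ceph:v18.2.7
+//	  allowUnsupported: true
+//	mon: {count: 1, allowMultiplePerNode: true}
+//	mgr: {count: 1, modules: [{name: pg_autoscaler, enabled: true}]}
+//	storage:
+//	  useAllNodes: true
+//	  storageClassDeviceSets:
+//	    - name: set1
+//	      count: 1
+//	      volumeClaimTemplates:
+//	        - spec:
+//	            storageClassName: <OSDStorageClass>
+//	            volumeMode: Block
+//	            resources: {requests: {storage: 20Gi}}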
+func buildCephClusterSpec(cfg CephClusterConfig) map[string]interface{} { + spec := map[string]interface{}{ + "cephVersion": map[string]interface{}{ + "image": cfg.CephImage, + "allowUnsupported": *cfg.AllowUnsupportedCephVersion, + }, + "dataDirHostPath": cfg.DataDirHostPath, + "skipUpgradeChecks": false, + "continueUpgradeAfterChecksEvenIfNotHealthy": false, + "mon": map[string]interface{}{ + "count": int64(cfg.MonCount), + "allowMultiplePerNode": *cfg.AllowMultipleMonPerNode, + }, + "mgr": map[string]interface{}{ + "count": int64(cfg.MgrCount), + "allowMultiplePerNode": *cfg.AllowMultipleMonPerNode, + "modules": []interface{}{ + map[string]interface{}{ + "name": "pg_autoscaler", + "enabled": true, + }, + }, + }, + "dashboard": map[string]interface{}{ + "enabled": false, + "ssl": false, + }, + "crashCollector": map[string]interface{}{ + "disable": false, + }, + "logCollector": map[string]interface{}{ + "enabled": true, + "periodicity": "daily", + "maxLogSize": "100M", + }, + "priorityClassNames": map[string]interface{}{ + "mon": "system-node-critical", + "osd": "system-node-critical", + "mgr": "system-cluster-critical", + }, + "disruptionManagement": map[string]interface{}{ + "managePodBudgets": true, + "osdMaintenanceTimeout": int64(30), + "pgHealthCheckTimeout": int64(0), + }, + "storage": map[string]interface{}{ + "useAllNodes": true, + "useAllDevices": false, + "storageClassDeviceSets": []interface{}{ + map[string]interface{}{ + "name": cfg.OSDDeviceSetName, + "count": int64(cfg.OSDCount), + "portable": false, + "tuneDeviceClass": true, + "volumeClaimTemplates": []interface{}{ + map[string]interface{}{ + "metadata": map[string]interface{}{ + "name": "data", + }, + "spec": map[string]interface{}{ + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "storage": cfg.OSDSize, + }, + }, + "storageClassName": cfg.OSDStorageClass, + "volumeMode": "Block", + "accessModes": []interface{}{"ReadWriteOnce"}, + }, + }, + }, + }, + }, + }, + } + + if cfg.NetworkProvider != "" { + network := map[string]interface{}{ + "provider": cfg.NetworkProvider, + "connections": map[string]interface{}{ + "encryption": map[string]interface{}{"enabled": false}, + "compression": map[string]interface{}{"enabled": false}, + "requireMsgr2": false, + }, + } + + addrs := map[string]interface{}{} + if len(cfg.PublicNetworkCIDRs) > 0 { + addrs["public"] = toInterfaceSlice(cfg.PublicNetworkCIDRs) + } + if len(cfg.ClusterNetworkCIDRs) > 0 { + addrs["cluster"] = toInterfaceSlice(cfg.ClusterNetworkCIDRs) + } + if len(addrs) > 0 { + network["addressRanges"] = addrs + } + spec["network"] = network + } + + return spec +} + +// toInterfaceSlice converts a []string to a []interface{} so it can be +// embedded into an `unstructured.Unstructured`'s object tree. +func toInterfaceSlice(in []string) []interface{} { + out := make([]interface{}, len(in)) + for i, v := range in { + out[i] = v + } + return out +} + +// WaitForCephClusterReady blocks until the CephCluster status reports that +// Ceph is up and healthy. Rook exposes the cluster state through two status +// fields: +// - `status.state` — overall lifecycle phase ("Creating", "Created", +// "Updating", "Error"); +// - `status.ceph.health` — the Ceph health summary ("HEALTH_OK", +// "HEALTH_WARN", "HEALTH_ERR"). On a single-OSD test cluster Ceph often +// sits in HEALTH_WARN (PGs undersized, no replicas), which we still treat +// as "good enough" as long as `status.state == "Created"`. +// +// We return success once `state == "Created"`. 
HEALTH_ERR is reported in the +// log and does not short-circuit (Rook may recover). +func WaitForCephClusterReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if namespace == "" || name == "" { + return fmt.Errorf("namespace and name are required") + } + + logger.Debug("Waiting for CephCluster %s/%s to reach Created state (timeout: %v)", namespace, name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephClusterGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + state, _, _ := unstructured.NestedString(obj.Object, "status", "state") + health, _, _ := unstructured.NestedString(obj.Object, "status", "ceph", "health") + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + + if state == "Created" || phase == "Ready" { + logger.Success("CephCluster %s/%s is Created (ceph health: %s)", namespace, name, health) + return nil + } + logger.Debug("CephCluster %s/%s state=%q phase=%q health=%q", namespace, name, state, phase, health) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephCluster %s/%s: %v", namespace, name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephCluster %s/%s: %w", namespace, name, ctx.Err()) + case <-ticker.C: + } + } +} + +// DeleteCephCluster removes a CephCluster. Tearing down the cluster this way +// is a *destructive* operation — Rook will leave OSD data on host disks under +// `dataDirHostPath` and operator-managed PVCs will not be garbage-collected +// automatically. The operation is still idempotent: a NotFound error is +// swallowed. +func DeleteCephCluster(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephClusterGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephCluster %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephCluster %s/%s", namespace, name) + return nil +} diff --git a/pkg/kubernetes/cephclusterconnection.go b/pkg/kubernetes/cephclusterconnection.go new file mode 100644 index 0000000..3110cfb --- /dev/null +++ b/pkg/kubernetes/cephclusterconnection.go @@ -0,0 +1,273 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// GVRs of the csi-ceph cluster-scoped CRs. We use unstructured to avoid +// pulling github.com/deckhouse/csi-ceph/api into go.mod just for these +// tiny types. +var ( + CephClusterConnectionGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephclusterconnections", + } + CephClusterAuthenticationGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephclusterauthentications", + } +) + +// CephClusterAuthenticationConfig describes CephX credentials that csi-ceph +// reuses for every StorageClass that references the authentication. +type CephClusterAuthenticationConfig struct { + // Name of the CephClusterAuthentication CR. + Name string + // UserID is the Ceph user (typically "admin"). + UserID string + // UserKey is the CephX key of UserID. + UserKey string +} + +// CreateCephClusterAuthentication creates (or updates) a +// CephClusterAuthentication CR with the given CephX credentials. +func CreateCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterAuthenticationConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephClusterAuthentication name is required") + } + if cfg.UserID == "" { + return fmt.Errorf("CephClusterAuthentication UserID is required") + } + if cfg.UserKey == "" { + return fmt.Errorf("CephClusterAuthentication UserKey is required") + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephClusterAuthentication", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": map[string]interface{}{ + "userID": cfg.UserID, + "userKey": cfg.UserKey, + }, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephClusterAuthentication %s (userID=%s)", cfg.Name, cfg.UserID) + _, err = dynamicClient.Resource(CephClusterAuthenticationGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephClusterAuthentication %s: %w", cfg.Name, err) + } + + logger.Info("CephClusterAuthentication %s already exists, updating spec", cfg.Name) + existing, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephClusterAuthentication %s: %w", cfg.Name, err) + } + existing.Object["spec"] = obj.Object["spec"] + if _, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephClusterAuthentication %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephClusterAuthentication removes a CephClusterAuthentication. +// NotFound is treated as success. 
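+//
+// In test teardown this is typically paired with DeleteCephClusterConnection
+// (defined below); a sketch, assuming both CRs share one name:
+//
+//	_ = DeleteCephClusterConnection(ctx, restCfg, connName)
+//	_ = DeleteCephClusterAuthentication(ctx, restCfg, connName)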
+func DeleteCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephClusterAuthenticationGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephClusterAuthentication %s: %w", name, err) + } + logger.Info("Deleted CephClusterAuthentication %s", name) + return nil +} + +// CephClusterConnectionConfig describes a csi-ceph CephClusterConnection CR. +// Its spec.clusterID (== Ceph fsid) is immutable once created. +type CephClusterConnectionConfig struct { + // Name of the CephClusterConnection CR. + Name string + // ClusterID is the Ceph fsid. Immutable after creation. + ClusterID string + // Monitors is the list of `ip:port` monitor endpoints. + Monitors []string + // UserID is the Ceph user (typically "admin"). + UserID string + // UserKey is the CephX key of UserID. + UserKey string +} + +// CreateCephClusterConnection creates (or updates) a CephClusterConnection CR. +// If the resource already exists we do *not* attempt to update spec.clusterID +// (which the CRD marks immutable) — only Monitors/UserID/UserKey are synced. +func CreateCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterConnectionConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephClusterConnection name is required") + } + if cfg.ClusterID == "" { + return fmt.Errorf("CephClusterConnection ClusterID (fsid) is required") + } + if len(cfg.Monitors) == 0 { + return fmt.Errorf("CephClusterConnection Monitors is required") + } + + monitors := make([]interface{}, len(cfg.Monitors)) + for i, m := range cfg.Monitors { + monitors[i] = m + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephClusterConnection", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": map[string]interface{}{ + "clusterID": cfg.ClusterID, + "monitors": monitors, + "userID": cfg.UserID, + "userKey": cfg.UserKey, + }, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephClusterConnection %s (clusterID=%s, mons=%d)", cfg.Name, cfg.ClusterID, len(cfg.Monitors)) + _, err = dynamicClient.Resource(CephClusterConnectionGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephClusterConnection %s: %w", cfg.Name, err) + } + + logger.Info("CephClusterConnection %s already exists, syncing monitors/userID/userKey", cfg.Name) + existing, err := dynamicClient.Resource(CephClusterConnectionGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephClusterConnection %s: %w", cfg.Name, err) + } + if err := unstructured.SetNestedSlice(existing.Object, monitors, "spec", "monitors"); err != nil { + return fmt.Errorf("set monitors: %w", err) + } + if err := unstructured.SetNestedField(existing.Object, cfg.UserID, "spec", "userID"); err != nil { + return fmt.Errorf("set userID: %w", err) + } + if err := unstructured.SetNestedField(existing.Object, cfg.UserKey, "spec", "userKey"); err != nil { + return fmt.Errorf("set userKey: 
%w", err) + } + if _, err := dynamicClient.Resource(CephClusterConnectionGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephClusterConnection %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephClusterConnection removes a CephClusterConnection. +// NotFound is treated as success. +func DeleteCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephClusterConnectionGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephClusterConnection %s: %w", name, err) + } + logger.Info("Deleted CephClusterConnection %s", name) + return nil +} + +// WaitForCephClusterConnectionCreated polls until the CephClusterConnection +// status reports phase=Created. csi-ceph's controller flips the status from +// Pending to Created once it has verified the supplied fsid / monitors / +// CephX credentials against the real Ceph cluster. +func WaitForCephClusterConnectionCreated(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if name == "" { + return fmt.Errorf("name is required") + } + + logger.Debug("Waiting for CephClusterConnection %s phase=Created (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephClusterConnectionGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + reason, _, _ := unstructured.NestedString(obj.Object, "status", "reason") + if phase == "Created" { + logger.Success("CephClusterConnection %s is Created", name) + return nil + } + logger.Debug("CephClusterConnection %s phase=%q reason=%q", name, phase, reason) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephClusterConnection %s: %v", name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephClusterConnection %s: %w", name, ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/pkg/kubernetes/cephcredentials.go b/pkg/kubernetes/cephcredentials.go new file mode 100644 index 0000000..11f68ec --- /dev/null +++ b/pkg/kubernetes/cephcredentials.go @@ -0,0 +1,183 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// Well-known Rook resources that hold Ceph connection data. +const ( + // RookMonSecretName is the Secret that the Rook operator populates with + // admin credentials and cluster fsid once the CephCluster is bootstrapped. + RookMonSecretName = "rook-ceph-mon" + + // RookMonEndpointsConfigMapName is the ConfigMap the operator keeps in + // sync with the current set of Ceph monitors. + RookMonEndpointsConfigMapName = "rook-ceph-mon-endpoints" +) + +// CephCredentials holds the information a Ceph CSI client needs to connect +// to a cluster bootstrapped by Rook. +type CephCredentials struct { + // FSID is the Ceph cluster unique identifier. + FSID string + + // AdminUser is the Ceph user name (typically "admin"). + AdminUser string + + // AdminKey is the CephX key for AdminUser. + AdminKey string + + // Monitors is the list of monitor endpoints in "IP:PORT" form, sorted + // alphabetically to make the output stable across runs. + Monitors []string +} + +// WaitForCephCredentials blocks until all pieces of information required to +// connect to the Rook-managed Ceph cluster are populated: +// - Secret `rook-ceph-mon` exists and has `fsid`, `ceph-username`, `ceph-secret`. +// - ConfigMap `rook-ceph-mon-endpoints` exists and has at least one reachable monitor. +// +// The returned CephCredentials is suitable for wiring csi-ceph CRs +// (CephClusterConnection, CephClusterAuthentication). +func WaitForCephCredentials(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) (*CephCredentials, error) { + if namespace == "" { + return nil, fmt.Errorf("namespace is required") + } + + logger.Debug("Waiting for Ceph credentials in %s (timeout: %v)", namespace, timeout) + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, RookMonSecretName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + logger.Debug("Failed to get Secret %s/%s: %v", namespace, RookMonSecretName, err) + } + + cm, cmErr := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, RookMonEndpointsConfigMapName, metav1.GetOptions{}) + if cmErr != nil && !apierrors.IsNotFound(cmErr) { + logger.Debug("Failed to get ConfigMap %s/%s: %v", namespace, RookMonEndpointsConfigMapName, cmErr) + } + + if err == nil && cmErr == nil { + creds, extractErr := extractCephCredentials(secret.Data, cm.Data) + if extractErr == nil { + logger.Success("Ceph credentials ready in %s (fsid=%s, %d monitor(s))", namespace, creds.FSID, len(creds.Monitors)) + return creds, nil + } + logger.Debug("Rook credentials not complete yet: %v", extractErr) + } + + select { + case <-ctx.Done(): + return nil, fmt.Errorf("timeout waiting for Ceph credentials in %s: %w", namespace, ctx.Err()) + case <-ticker.C: + } + } +} + +// extractCephCredentials parses the Rook-managed Secret/ConfigMap payloads +// into a CephCredentials struct. It returns an error if any required field +// is missing so the caller can keep polling until the operator has populated +// everything. 
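+//
+// The inputs are the payloads Rook maintains (shapes abridged):
+//
+//	Secret rook-ceph-mon:              fsid, ceph-username, ceph-secret
+//	ConfigMap rook-ceph-mon-endpoints: data: "a=10.0.0.1:6789,b=10.0.0.2:6789"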
+func extractCephCredentials(secretData map[string][]byte, cmData map[string]string) (*CephCredentials, error) {
+	fsid := strings.TrimSpace(string(secretData["fsid"]))
+	if fsid == "" {
+		return nil, fmt.Errorf("Secret %s is missing `fsid`", RookMonSecretName)
+	}
+
+	adminUser := strings.TrimSpace(string(secretData["ceph-username"]))
+	if adminUser == "" {
+		adminUser = "client.admin"
+	}
+	adminUser = strings.TrimPrefix(adminUser, "client.")
+
+	adminKey := strings.TrimSpace(string(secretData["ceph-secret"]))
+	if adminKey == "" {
+		return nil, fmt.Errorf("Secret %s is missing `ceph-secret`", RookMonSecretName)
+	}
+
+	raw, ok := cmData["data"]
+	if !ok {
+		return nil, fmt.Errorf("ConfigMap %s is missing `data`", RookMonEndpointsConfigMapName)
+	}
+	monitors, err := parseMonEndpoints(raw)
+	if err != nil {
+		return nil, err
+	}
+	if len(monitors) == 0 {
+		return nil, fmt.Errorf("ConfigMap %s has no populated monitor endpoints", RookMonEndpointsConfigMapName)
+	}
+
+	return &CephCredentials{
+		FSID:      fsid,
+		AdminUser: adminUser,
+		AdminKey:  adminKey,
+		Monitors:  monitors,
+	}, nil
+}
+
+// parseMonEndpoints parses the Rook-maintained monitor endpoints string.
+//
+// Rook stores the current mon list in the `data` key of the
+// `rook-ceph-mon-endpoints` ConfigMap as a comma-separated list of
+// `name=ip:port` pairs, for example:
+//
+//	a=10.0.0.1:6789,b=10.0.0.2:6789,c=10.0.0.3:6789
+//
+// This helper returns just the `ip:port` portion of every entry, sorted
+// alphabetically for stable output.
+func parseMonEndpoints(raw string) ([]string, error) {
+	out := []string{}
+	for _, part := range strings.Split(raw, ",") {
+		part = strings.TrimSpace(part)
+		if part == "" {
+			continue
+		}
+		// Strip the "name=" prefix if present.
+		if idx := strings.Index(part, "="); idx >= 0 {
+			part = part[idx+1:]
+		}
+		if part == "" {
+			continue
+		}
+		out = append(out, part)
+	}
+	sort.Strings(out)
+	return out, nil
+}
diff --git a/pkg/kubernetes/cephstorageclass.go b/pkg/kubernetes/cephstorageclass.go
new file mode 100644
index 0000000..6bf256c
--- /dev/null
+++ b/pkg/kubernetes/cephstorageclass.go
@@ -0,0 +1,230 @@
+/*
+Copyright 2025 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/rest"
+
+	"github.com/deckhouse/storage-e2e/internal/logger"
+)
+
+// CephStorageClassGVR points at csi-ceph's CephStorageClass CR (not to be
+// confused with Rook's CephCluster / CephBlockPool).
+var CephStorageClassGVR = schema.GroupVersionResource{
+	Group:    "storage.deckhouse.io",
+	Version:  "v1alpha1",
+	Resource: "cephstorageclasses",
+}
+
+// Supported CephStorageClass types, mirroring csi-ceph's CRD enum.
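+//
+// For orientation, the RBD-flavoured CR rendered by CreateCephStorageClass
+// below looks roughly like this (names are illustrative):
+//
+//	apiVersion: storage.deckhouse.io/v1alpha1
+//	kind: CephStorageClass
+//	metadata: {name: ceph-rbd-sc}
+//	spec:
+//	  clusterConnectionName: ceph-conn
+//	  clusterAuthenticationName: ceph-conn
+//	  reclaimPolicy: Delete
+//	  type: RBD
+//	  rbd: {defaultFSType: ext4, pool: ceph-rbd-r1}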
+const (
+	CephStorageClassTypeRBD    = "RBD"
+	CephStorageClassTypeCephFS = "CephFS"
+)
+
+// CephStorageClassConfig is an intentionally narrow shape tailored for the
+// e2e scenarios we care about today — an RBD StorageClass backed by a single
+// block pool. The CephFS variant is supported but requires CephFSName and
+// CephFSPool to be set by the caller.
+type CephStorageClassConfig struct {
+	// Name of the CephStorageClass CR (becomes the k8s StorageClass name).
+	Name string
+
+	// ClusterConnectionName points at a CephClusterConnection CR.
+	ClusterConnectionName string
+
+	// ClusterAuthenticationName points at a CephClusterAuthentication CR.
+	ClusterAuthenticationName string
+
+	// ReclaimPolicy mirrors StorageClass.ReclaimPolicy ("Delete" / "Retain").
+	// Default: "Delete".
+	ReclaimPolicy string
+
+	// Type is "RBD" (default) or "CephFS".
+	Type string
+
+	// --- RBD options (Type == "RBD") ---
+
+	// RBDPool is the Ceph pool name (e.g. "ceph-rbd-r1").
+	RBDPool string
+
+	// RBDDefaultFSType picks the filesystem created (mkfs) when the volume
+	// is first attached. Default: "ext4".
+	RBDDefaultFSType string
+
+	// --- CephFS options (Type == "CephFS") ---
+	CephFSName string // Name of the CephFilesystem.
+	CephFSPool string // Pool to use inside that filesystem.
+}
+
+// CreateCephStorageClass creates (or updates) a CephStorageClass CR. On
+// success the csi-ceph controller provisions a corresponding core
+// storage.k8s.io/v1 StorageClass in the cluster.
+func CreateCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error {
+	if cfg.Name == "" {
+		return fmt.Errorf("CephStorageClass name is required")
+	}
+	if cfg.ClusterConnectionName == "" {
+		return fmt.Errorf("CephStorageClass ClusterConnectionName is required")
+	}
+	if cfg.ClusterAuthenticationName == "" {
+		return fmt.Errorf("CephStorageClass ClusterAuthenticationName is required")
+	}
+	if cfg.Type == "" {
+		cfg.Type = CephStorageClassTypeRBD
+	}
+	if cfg.ReclaimPolicy == "" {
+		cfg.ReclaimPolicy = "Delete"
+	}
+
+	spec := map[string]interface{}{
+		"clusterConnectionName":     cfg.ClusterConnectionName,
+		"clusterAuthenticationName": cfg.ClusterAuthenticationName,
+		"reclaimPolicy":             cfg.ReclaimPolicy,
+		"type":                      cfg.Type,
+	}
+
+	switch cfg.Type {
+	case CephStorageClassTypeRBD:
+		if cfg.RBDPool == "" {
+			return fmt.Errorf("CephStorageClass of type RBD requires RBDPool")
+		}
+		if cfg.RBDDefaultFSType == "" {
+			cfg.RBDDefaultFSType = "ext4"
+		}
+		spec["rbd"] = map[string]interface{}{
+			"defaultFSType": cfg.RBDDefaultFSType,
+			"pool":          cfg.RBDPool,
+		}
+	case CephStorageClassTypeCephFS:
+		if cfg.CephFSName == "" || cfg.CephFSPool == "" {
+			return fmt.Errorf("CephStorageClass of type CephFS requires CephFSName and CephFSPool")
+		}
+		spec["cephFS"] = map[string]interface{}{
+			"fsName": cfg.CephFSName,
+			"pool":   cfg.CephFSPool,
+		}
+	default:
+		return fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type)
+	}
+
+	obj := &unstructured.Unstructured{
+		Object: map[string]interface{}{
+			"apiVersion": "storage.deckhouse.io/v1alpha1",
+			"kind":       "CephStorageClass",
+			"metadata": map[string]interface{}{
+				"name": cfg.Name,
+			},
+			"spec": spec,
+		},
+	}
+
+	dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig)
+	if err != nil {
+		return fmt.Errorf("failed to create dynamic client: %w", err)
+	}
+
+	logger.Info("Creating CephStorageClass %s (type=%s, conn=%s, auth=%s)",
+		cfg.Name, cfg.Type, cfg.ClusterConnectionName, cfg.ClusterAuthenticationName)
+	_, err = dynamicClient.Resource(CephStorageClassGVR).Create(ctx, obj, 
metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephStorageClass %s: %w", cfg.Name, err) + } + + logger.Info("CephStorageClass %s already exists, updating spec", cfg.Name) + existing, err := dynamicClient.Resource(CephStorageClassGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephStorageClass %s: %w", cfg.Name, err) + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephStorageClassGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephStorageClass %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephStorageClass removes a CephStorageClass. NotFound is treated as +// success. The underlying k8s StorageClass is removed by the csi-ceph +// controller as a side effect. +func DeleteCephStorageClass(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephStorageClassGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephStorageClass %s: %w", name, err) + } + logger.Info("Deleted CephStorageClass %s", name) + return nil +} + +// WaitForCephStorageClassCreated polls until the CephStorageClass status +// reports phase=Created (the csi-ceph controller flips this once the backing +// k8s StorageClass has been provisioned). +func WaitForCephStorageClassCreated(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if name == "" { + return fmt.Errorf("name is required") + } + + logger.Debug("Waiting for CephStorageClass %s phase=Created (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(3 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephStorageClassGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + reason, _, _ := unstructured.NestedString(obj.Object, "status", "reason") + if phase == "Created" { + logger.Success("CephStorageClass %s is Created", name) + return nil + } + logger.Debug("CephStorageClass %s phase=%q reason=%q", name, phase, reason) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephStorageClass %s: %v", name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephStorageClass %s: %w", name, ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/pkg/kubernetes/rookconfigoverride.go b/pkg/kubernetes/rookconfigoverride.go new file mode 100644 index 0000000..2027318 --- /dev/null +++ b/pkg/kubernetes/rookconfigoverride.go @@ -0,0 +1,140 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strings"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/rest"
+
+	"github.com/deckhouse/storage-e2e/internal/logger"
+)
+
+// RookConfigOverrideName is the well-known ConfigMap name Rook reads Ceph
+// config overrides from (see Rook docs: "Advanced Configuration – Custom
+// ceph.conf Settings"). Rook watches this ConfigMap in its operator namespace
+// and injects the `config` key into `/etc/ceph/ceph.conf` of every Ceph daemon.
+const RookConfigOverrideName = "rook-config-override"
+
+// SetRookConfigOverride creates or updates the `rook-config-override` ConfigMap
+// in the given Rook operator namespace so that Ceph daemons pick up the
+// provided global settings.
+//
+// The ConfigMap format expected by Rook is:
+//
+//	apiVersion: v1
+//	kind: ConfigMap
+//	metadata:
+//	  name: rook-config-override
+//	  namespace: <rook operator namespace>
+//	data:
+//	  config: |
+//	    [global]
+//	    key1 = value1
+//	    key2 = value2
+//
+// `globals` is rendered under `[global]`. Keys are sorted for a stable output.
+// Passing an empty/nil `globals` map produces an empty `[global]` section,
+// which effectively clears previously-set overrides.
+func SetRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespace string, globals map[string]string) error {
+	if namespace == "" {
+		return fmt.Errorf("namespace is required")
+	}
+
+	clientset, err := NewClientsetWithRetry(ctx, kubeconfig)
+	if err != nil {
+		return fmt.Errorf("failed to create clientset: %w", err)
+	}
+
+	cfg := renderCephGlobalConfig(globals)
+
+	cm := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      RookConfigOverrideName,
+			Namespace: namespace,
+		},
+		Data: map[string]string{
+			"config": cfg,
+		},
+	}
+
+	existing, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, RookConfigOverrideName, metav1.GetOptions{})
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			logger.Info("Creating ConfigMap %s/%s with Ceph global overrides (%d keys)", namespace, RookConfigOverrideName, len(globals))
+			if _, err := clientset.CoreV1().ConfigMaps(namespace).Create(ctx, cm, metav1.CreateOptions{}); err != nil {
+				return fmt.Errorf("failed to create ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err)
+			}
+			return nil
+		}
+		return fmt.Errorf("failed to get ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err)
+	}
+
+	logger.Info("Updating ConfigMap %s/%s with Ceph global overrides (%d keys)", namespace, RookConfigOverrideName, len(globals))
+	existing.Data = cm.Data
+	if _, err := clientset.CoreV1().ConfigMaps(namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil {
+		return fmt.Errorf("failed to update ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err)
+	}
+	return nil
+}
+
+// DeleteRookConfigOverride removes the `rook-config-override` ConfigMap. It
+// is safe to call when the ConfigMap does not exist.
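+//
+// Together with SetRookConfigOverride this gives tests a set/clear pair; a
+// sketch (restCfg is assumed to come from the harness):
+//
+//	_ = SetRookConfigOverride(ctx, restCfg, DefaultRookNamespace,
+//		map[string]string{"ms_crc_data": "false"})
+//	defer func() { _ = DeleteRookConfigOverride(ctx, restCfg, DefaultRookNamespace) }()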
+func DeleteRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + if namespace == "" { + return fmt.Errorf("namespace is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + if err := clientset.CoreV1().ConfigMaps(namespace).Delete(ctx, RookConfigOverrideName, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + logger.Info("Deleted ConfigMap %s/%s", namespace, RookConfigOverrideName) + return nil +} + +// renderCephGlobalConfig renders a `[global]` section for ceph.conf from the +// provided key/value pairs. Keys are sorted so the rendered output is stable +// across calls with logically-equivalent maps (avoids unnecessary CM updates). +func renderCephGlobalConfig(globals map[string]string) string { + var b strings.Builder + b.WriteString("[global]\n") + + keys := make([]string, 0, len(globals)) + for k := range globals { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Fprintf(&b, "%s = %s\n", k, globals[k]) + } + return b.String() +} diff --git a/pkg/kubernetes/storageclass_manage.go b/pkg/kubernetes/storageclass_manage.go new file mode 100644 index 0000000..bb7fb94 --- /dev/null +++ b/pkg/kubernetes/storageclass_manage.go @@ -0,0 +1,100 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +type StorageClassCreateConfig struct { + Name string + Provisioner string + Parameters map[string]string + VolumeBindingMode storagev1.VolumeBindingMode + ReclaimPolicy corev1.PersistentVolumeReclaimPolicy + AllowExpansion bool + MakeDefault bool + AdditionalLabels map[string]string + AdditionalAnnot map[string]string +} + +func CreateStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg StorageClassCreateConfig) error { + if cfg.Name == "" { + return fmt.Errorf("storage class name is required") + } + if cfg.Provisioner == "" { + return fmt.Errorf("provisioner is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + annotations := map[string]string{} + for k, v := range cfg.AdditionalAnnot { + annotations[k] = v + } + if cfg.MakeDefault { + annotations["storageclass.kubernetes.io/is-default-class"] = "true" + annotations["storageclass.beta.kubernetes.io/is-default-class"] = "true" + } + + labels := map[string]string{} + for k, v := range cfg.AdditionalLabels { + labels[k] = v + } + + sc := &storagev1.StorageClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "StorageClass", + APIVersion: "storage.k8s.io/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: cfg.Name, + Labels: labels, + Annotations: annotations, + }, + Provisioner: cfg.Provisioner, + Parameters: cfg.Parameters, + ReclaimPolicy: &cfg.ReclaimPolicy, + AllowVolumeExpansion: &cfg.AllowExpansion, + VolumeBindingMode: &cfg.VolumeBindingMode, + } + + logger.Info("Creating StorageClass %s (provisioner=%s)", cfg.Name, cfg.Provisioner) + _, err = clientset.StorageV1().StorageClasses().Create(ctx, sc, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Info("StorageClass %s already exists, skipping create", cfg.Name) + return nil + } + return fmt.Errorf("failed to create StorageClass %s: %w", cfg.Name, err) + } + logger.Success("StorageClass %s created", cfg.Name) + return nil +} + diff --git a/pkg/kubernetes/volumesnapshotclass.go b/pkg/kubernetes/volumesnapshotclass.go new file mode 100644 index 0000000..9307615 --- /dev/null +++ b/pkg/kubernetes/volumesnapshotclass.go @@ -0,0 +1,125 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +var VolumeSnapshotClassGVR = schema.GroupVersionResource{ + Group: "snapshot.storage.k8s.io", + Version: "v1", + Resource: "volumesnapshotclasses", +} + +type VolumeSnapshotClassConfig struct { + Name string + Driver string + DeletionPolicy string // "Delete" or "Retain" + Parameters map[string]string + MakeDefault bool +} + +func CreateVolumeSnapshotClass(ctx context.Context, kubeconfig *rest.Config, cfg VolumeSnapshotClassConfig) error { + if cfg.Name == "" { + return fmt.Errorf("volume snapshot class name is required") + } + if cfg.Driver == "" { + return fmt.Errorf("driver is required") + } + if cfg.DeletionPolicy == "" { + cfg.DeletionPolicy = "Delete" + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + annotations := map[string]interface{}{} + if cfg.MakeDefault { + annotations["snapshot.storage.kubernetes.io/is-default-class"] = "true" + } + + parameters := map[string]interface{}{} + for k, v := range cfg.Parameters { + parameters[k] = v + } + + vsc := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "snapshot.storage.k8s.io/v1", + "kind": "VolumeSnapshotClass", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "annotations": annotations, + }, + "driver": cfg.Driver, + "deletionPolicy": cfg.DeletionPolicy, + "parameters": parameters, + }, + } + + logger.Info("Creating VolumeSnapshotClass %s (driver=%s, deletionPolicy=%s)", cfg.Name, cfg.Driver, cfg.DeletionPolicy) + _, err = dynamicClient.Resource(VolumeSnapshotClassGVR).Create(ctx, vsc, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Info("VolumeSnapshotClass %s already exists, skipping create", cfg.Name) + return nil + } + return fmt.Errorf("failed to create VolumeSnapshotClass %s: %w", cfg.Name, err) + } + logger.Success("VolumeSnapshotClass %s created", cfg.Name) + return nil +} + +func WaitForVolumeSnapshotClass(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + logger.Debug("Waiting for VolumeSnapshotClass %s to become available (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadline := time.Now().Add(timeout) + for { + if ctx.Err() != nil { + return ctx.Err() + } + if time.Now().After(deadline) { + return fmt.Errorf("timeout waiting for VolumeSnapshotClass %s", name) + } + + _, err := dynamicClient.Resource(VolumeSnapshotClassGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + logger.Success("VolumeSnapshotClass %s is available", name) + return nil + } + + time.Sleep(5 * time.Second) + } +} diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go new file mode 100644 index 0000000..f7e0e5e --- /dev/null +++ b/pkg/testkit/ceph.go @@ -0,0 +1,441 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "time" + + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// CephStorageClassConfig controls the end-to-end provisioning of a +// Rook-managed Ceph cluster plus a csi-ceph-backed k8s StorageClass: +// +// 1. Enables Deckhouse modules required for the stack: +// sds-node-configurator, sds-elastic (Rook), csi-ceph. +// 2. (Optional) Falls back to EnsureDefaultStorageClass to produce a +// sds-local-volume StorageClass for backing OSD PVCs. +// 3. Seeds `rook-config-override` with per-test global Ceph settings +// (e.g. `ms_crc_data = false` for the PR #131 scenario). +// 4. Creates a CephCluster (Rook) and waits until it is Created. +// 5. Creates a CephBlockPool and waits until it is Ready. +// 6. Reads fsid / monitors / CephX admin key from Rook-managed secrets +// and wires them into CephClusterConnection + CephClusterAuthentication +// CRs so csi-ceph can talk to the cluster. +// 7. Creates a CephStorageClass CR and waits for the csi-ceph controller +// to materialize a core storage.k8s.io/v1 StorageClass. +// +// Only StorageClassName is strictly required; everything else has sensible +// defaults tuned for single-node / tiny test clusters. +type CephStorageClassConfig struct { + // --- Top-level identity --- + + // StorageClassName is the name of the CephStorageClass CR (and of the + // resulting k8s StorageClass). Required. + StorageClassName string + + // Namespace is the Rook / sds-elastic namespace. Default: "d8-sds-elastic". + Namespace string + + // --- sds-elastic / Rook CephCluster --- + + // CephClusterName is the Rook CephCluster name. Default: "ceph-cluster". + CephClusterName string + + // CephImage is the Ceph container image tag. Default: "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // MonCount / MgrCount are the Rook mon/mgr replica counts. + // Defaults: 1 / 1 (good for 1..3 node test clusters). + MonCount int + MgrCount int + + // NetworkProvider: "" for CNI (default), "host" for host networking. + NetworkProvider string + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // GlobalCephConfigOverrides populates `rook-config-override` under + // `[global]`, e.g. {"ms_crc_data": "false"}. nil / empty map leaves + // the ConfigMap untouched except for creating it as an empty `[global]`. + GlobalCephConfigOverrides map[string]string + + // --- OSD backing --- + + // OSDStorageClass is a block-capable StorageClass used to back OSD PVCs. + // When empty, EnsureDefaultStorageClass is invoked with + // OSDBackingStorageClass* to provision a sds-local-volume SC. + OSDStorageClass string + + // OSDCount is the number of OSDs. Default: 1. + OSDCount int + + // OSDSize is the size of each OSD PVC. Default: "20Gi". + OSDSize string + + // --- Fallback SC provisioning via sds-local-volume (when OSDStorageClass is empty) --- + + // OSDBackingStorageClassName names the sds-local-volume SC that we + // auto-provision for OSDs. 
Default: "sds-local-volume-thin-ceph-osd". + OSDBackingStorageClassName string + + // OSDBackingLVMType passed to EnsureDefaultStorageClass ("Thick"/"Thin"). + // Default: "Thick" (simpler for block-mode PVCs used as Ceph OSDs). + OSDBackingLVMType string + + // OSDBackingIncludeMasters exposes EnsureDefaultStorageClass.IncludeMasters. + OSDBackingIncludeMasters bool + + // OSDBackingBaseKubeconfig/VMNamespace/BaseStorageClassName are plumbed + // through to EnsureDefaultStorageClass to enable automatic VirtualDisk + // attachment on nested-VM clusters. + OSDBackingBaseKubeconfig *rest.Config + OSDBackingVMNamespace string + OSDBackingBaseStorageClassName string + + // MasterSSH is optional SSH access to the control plane. Not used by + // EnsureCephStorageClass in this revision; callers may set it for + // follow-up bootstrap or diagnostics hooks. + MasterSSH ssh.SSHClient + + // --- CephBlockPool --- + + // PoolName is the Rook CephBlockPool name (also becomes the Ceph pool + // name referenced by CephStorageClass.spec.rbd.pool). + // Default: "ceph-rbd-r". + PoolName string + + // ReplicaSize is the CephBlockPool replication factor. Default: 1. + ReplicaSize int + + // FailureDomain is the CRUSH failure domain: "host" or "osd". + // Default: "osd" when ReplicaSize==1, "host" otherwise. + FailureDomain string + + // --- csi-ceph wiring --- + + // ClusterConnectionName and ClusterAuthenticationName point at the + // CephClusterConnection / CephClusterAuthentication CRs we create. + // Defaults: both "-conn". + ClusterConnectionName string + ClusterAuthenticationName string + + // RBDDefaultFSType picks the mkfs used on attach. Default: "ext4". + RBDDefaultFSType string + + // --- Modules --- + + // SkipModuleEnablement disables the module-enable step (useful when the + // caller has already configured ModuleConfig on the cluster). + SkipModuleEnablement bool + + // SdsElasticSettings overrides `spec.settings` of the sds-elastic + // ModuleConfig. Defaults to the minimal set that makes sense on a + // single-node test cluster. + SdsElasticSettings map[string]interface{} + + // CsiCephSettings overrides `spec.settings` of the csi-ceph ModuleConfig. + CsiCephSettings map[string]interface{} + + // CsiCephModulePullOverride pins a specific csi-ceph image tag (dev + // registry only). Useful for testing PRs that haven't been released yet. 
+ CsiCephModulePullOverride string + + // --- Timeouts --- + + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m + CredentialsTimeout time.Duration // default 10m + CSICephPhaseTimeout time.Duration // default 5m + StorageClassWaitTimeout time.Duration // default 2m +} + +func (c *CephStorageClassConfig) applyDefaults() { + if c.Namespace == "" { + c.Namespace = kubernetes.DefaultRookNamespace + } + if c.CephClusterName == "" { + c.CephClusterName = kubernetes.DefaultCephClusterName + } + if c.CephImage == "" { + c.CephImage = kubernetes.DefaultCephImage + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = kubernetes.DefaultOSDStorageClassSize + } + if c.OSDBackingStorageClassName == "" { + c.OSDBackingStorageClassName = "sds-local-volume-thick-ceph-osd" + } + if c.OSDBackingLVMType == "" { + c.OSDBackingLVMType = "Thick" + } + if c.ReplicaSize <= 0 { + c.ReplicaSize = 1 + } + if c.PoolName == "" { + c.PoolName = fmt.Sprintf("ceph-rbd-r%d", c.ReplicaSize) + } + if c.FailureDomain == "" { + if c.ReplicaSize == 1 { + c.FailureDomain = "osd" + } else { + c.FailureDomain = "host" + } + } + if c.ClusterConnectionName == "" { + c.ClusterConnectionName = c.StorageClassName + "-conn" + } + if c.ClusterAuthenticationName == "" { + c.ClusterAuthenticationName = c.StorageClassName + "-conn" + } + if c.RBDDefaultFSType == "" { + c.RBDDefaultFSType = "ext4" + } + if c.ModulesReadyTimeout == 0 { + c.ModulesReadyTimeout = 15 * time.Minute + } + if c.CephClusterReadyTimeout == 0 { + c.CephClusterReadyTimeout = 20 * time.Minute + } + if c.CephPoolReadyTimeout == 0 { + c.CephPoolReadyTimeout = 10 * time.Minute + } + if c.CredentialsTimeout == 0 { + c.CredentialsTimeout = 10 * time.Minute + } + if c.CSICephPhaseTimeout == 0 { + c.CSICephPhaseTimeout = 5 * time.Minute + } + if c.StorageClassWaitTimeout == 0 { + c.StorageClassWaitTimeout = 2 * time.Minute + } +} + +// EnsureCephStorageClass is the high-level entry point that turns an empty +// cluster into one with a working csi-ceph StorageClass. See +// CephStorageClassConfig for the step-by-step flow. +// +// The function is idempotent: re-running it picks up the existing Rook +// CephCluster / pool / csi-ceph CRs and only fills in whatever is still +// missing. Returns the name of the resulting k8s StorageClass. 
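+//
+// A minimal call, sketched with only the single required field set (all
+// other knobs fall back to the defaults documented on CephStorageClassConfig):
+//
+//	scName, err := EnsureCephStorageClass(ctx, kubeconfig, CephStorageClassConfig{
+//		StorageClassName: "e2e-ceph-rbd-r1",
+//	})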
+func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) (string, error) { + cfg.applyDefaults() + + if cfg.StorageClassName == "" { + return "", fmt.Errorf("StorageClassName is required") + } + + logger.Step(1, "Enabling Deckhouse modules for csi-ceph (sds-node-configurator, sds-elastic, csi-ceph)") + if !cfg.SkipModuleEnablement { + if err := ensureCephModules(ctx, kubeconfig, cfg); err != nil { + return "", fmt.Errorf("enable ceph modules: %w", err) + } + } + logger.StepComplete(1, "Modules enabled") + + logger.Step(2, "Resolving OSD backing StorageClass") + osdSC, err := ensureOSDBackingStorageClass(ctx, kubeconfig, &cfg) + if err != nil { + return "", fmt.Errorf("resolve OSD backing StorageClass: %w", err) + } + logger.StepComplete(2, "OSD backing StorageClass: %s", osdSC) + + logger.Step(3, "Seeding rook-config-override ConfigMap") + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, cfg.Namespace, cfg.GlobalCephConfigOverrides); err != nil { + return "", fmt.Errorf("set rook-config-override: %w", err) + } + logger.StepComplete(3, "rook-config-override ready (%d global key(s))", len(cfg.GlobalCephConfigOverrides)) + + logger.Step(4, "Creating Rook CephCluster %s/%s", cfg.Namespace, cfg.CephClusterName) + if err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{ + Name: cfg.CephClusterName, + Namespace: cfg.Namespace, + CephImage: cfg.CephImage, + MonCount: cfg.MonCount, + MgrCount: cfg.MgrCount, + NetworkProvider: cfg.NetworkProvider, + PublicNetworkCIDRs: cfg.PublicNetworkCIDRs, + ClusterNetworkCIDRs: cfg.ClusterNetworkCIDRs, + OSDStorageClass: osdSC, + OSDCount: cfg.OSDCount, + OSDSize: cfg.OSDSize, + }); err != nil { + return "", fmt.Errorf("create CephCluster: %w", err) + } + if err := kubernetes.WaitForCephClusterReady(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, cfg.CephClusterReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephCluster: %w", err) + } + logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) + + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + + logger.Step(6, "Extracting Rook-managed Ceph credentials (fsid, monitors, admin key)") + creds, err := kubernetes.WaitForCephCredentials(ctx, kubeconfig, cfg.Namespace, cfg.CredentialsTimeout) + if err != nil { + return "", fmt.Errorf("wait ceph credentials: %w", err) + } + logger.StepComplete(6, "Ceph credentials: fsid=%s, user=%s, %d monitor(s): %v", + creds.FSID, creds.AdminUser, len(creds.Monitors), creds.Monitors) + + logger.Step(7, "Wiring csi-ceph: CephClusterAuthentication %q + CephClusterConnection %q", + cfg.ClusterAuthenticationName, cfg.ClusterConnectionName) + if err := kubernetes.CreateCephClusterAuthentication(ctx, kubeconfig, kubernetes.CephClusterAuthenticationConfig{ + Name: cfg.ClusterAuthenticationName, + UserID: 
creds.AdminUser, + UserKey: creds.AdminKey, + }); err != nil { + return "", fmt.Errorf("create CephClusterAuthentication: %w", err) + } + if err := kubernetes.CreateCephClusterConnection(ctx, kubeconfig, kubernetes.CephClusterConnectionConfig{ + Name: cfg.ClusterConnectionName, + ClusterID: creds.FSID, + Monitors: creds.Monitors, + UserID: creds.AdminUser, + UserKey: creds.AdminKey, + }); err != nil { + return "", fmt.Errorf("create CephClusterConnection: %w", err) + } + if err := kubernetes.WaitForCephClusterConnectionCreated(ctx, kubeconfig, cfg.ClusterConnectionName, cfg.CSICephPhaseTimeout); err != nil { + return "", fmt.Errorf("wait CephClusterConnection: %w", err) + } + logger.StepComplete(7, "csi-ceph wired against Ceph cluster %s", creds.FSID) + + logger.Step(8, "Creating CephStorageClass %q → StorageClass", cfg.StorageClassName) + if err := kubernetes.CreateCephStorageClass(ctx, kubeconfig, kubernetes.CephStorageClassConfig{ + Name: cfg.StorageClassName, + ClusterConnectionName: cfg.ClusterConnectionName, + ClusterAuthenticationName: cfg.ClusterAuthenticationName, + Type: kubernetes.CephStorageClassTypeRBD, + RBDPool: cfg.PoolName, + RBDDefaultFSType: cfg.RBDDefaultFSType, + }); err != nil { + return "", fmt.Errorf("create CephStorageClass: %w", err) + } + if err := kubernetes.WaitForCephStorageClassCreated(ctx, kubeconfig, cfg.StorageClassName, cfg.CSICephPhaseTimeout); err != nil { + return "", fmt.Errorf("wait CephStorageClass: %w", err) + } + if err := kubernetes.WaitForStorageClass(ctx, kubeconfig, cfg.StorageClassName, cfg.StorageClassWaitTimeout); err != nil { + return "", fmt.Errorf("wait core StorageClass: %w", err) + } + logger.StepComplete(8, "StorageClass %s is available", cfg.StorageClassName) + + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + pool %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName, cfg.StorageClassName) + return cfg.StorageClassName, nil +} + +// EnsureDefaultCephStorageClass is EnsureCephStorageClass + SetGlobalDefaultStorageClass. +// After this call new PVCs without an explicit storageClassName will use the +// freshly-provisioned Ceph RBD class. +func EnsureDefaultCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) (string, error) { + scName, err := EnsureCephStorageClass(ctx, kubeconfig, cfg) + if err != nil { + return "", err + } + if err := kubernetes.SetGlobalDefaultStorageClass(ctx, kubeconfig, scName); err != nil { + return "", fmt.Errorf("set %s as default in global ModuleConfig: %w", scName, err) + } + logger.Success("StorageClass %s set as cluster default", scName) + return scName, nil +} + +// ensureCephModules enables sds-node-configurator + sds-elastic + csi-ceph +// and waits for their Ready phase. 
+func ensureCephModules(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + sdsElasticSettings := cfg.SdsElasticSettings + if sdsElasticSettings == nil { + sdsElasticSettings = map[string]interface{}{} + } + + csiCephSettings := cfg.CsiCephSettings + if csiCephSettings == nil { + csiCephSettings = map[string]interface{}{} + } + + modules := []kubernetes.ModuleSpec{ + { + Name: "sds-node-configurator", + Version: 1, + Enabled: true, + }, + { + Name: "sds-elastic", + Version: 1, + Enabled: true, + Settings: sdsElasticSettings, + Dependencies: []string{"sds-node-configurator"}, + }, + { + Name: "csi-ceph", + Version: 1, + Enabled: true, + Settings: csiCephSettings, + Dependencies: []string{"sds-elastic"}, + ModulePullOverride: cfg.CsiCephModulePullOverride, + }, + } + return kubernetes.EnableModulesAndWait(ctx, kubeconfig, nil, nil, modules, cfg.ModulesReadyTimeout) +} + +// ensureOSDBackingStorageClass returns an already-existing SC name (if the +// caller supplied OSDStorageClass) or delegates to EnsureDefaultStorageClass +// to provision a sds-local-volume SC on the fly. +func ensureOSDBackingStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg *CephStorageClassConfig) (string, error) { + if cfg.OSDStorageClass != "" { + logger.Info("Using pre-existing OSD backing StorageClass %s", cfg.OSDStorageClass) + return cfg.OSDStorageClass, nil + } + + localCfg := DefaultStorageClassConfig{ + StorageClassName: cfg.OSDBackingStorageClassName, + LVMType: cfg.OSDBackingLVMType, + IncludeMasters: cfg.OSDBackingIncludeMasters, + BaseKubeconfig: cfg.OSDBackingBaseKubeconfig, + VMNamespace: cfg.OSDBackingVMNamespace, + BaseStorageClassName: cfg.OSDBackingBaseStorageClassName, + } + return EnsureDefaultStorageClass(ctx, kubeconfig, localCfg) +} diff --git a/tests/csi-ceph/cluster_config.yml b/tests/csi-ceph/cluster_config.yml new file mode 100644 index 0000000..62fcdbc --- /dev/null +++ b/tests/csi-ceph/cluster_config.yml @@ -0,0 +1,56 @@ +# csi-ceph smoke testkit: 3 workers to let a Rook Ceph cluster come up in a +# realistic layout (1 mon/mgr, 3 OSDs by default). Masters are untainted so +# the mon/mgr can land there on tiny clusters as well. +clusterDefinition: + masters: + - hostname: "master-1" + hostType: "vm" + osType: "Ubuntu 22.04 6.2.0-39-generic" + cpu: 4 + coreFraction: 50 + ram: 8 + diskSize: 50 + workers: + - hostname: "worker-1" + hostType: "vm" + osType: "Ubuntu 22.04 6.2.0-39-generic" + cpu: 4 + coreFraction: 50 + ram: 8 + diskSize: 50 + - hostname: "worker-2" + hostType: "vm" + osType: "Ubuntu 22.04 6.2.0-39-generic" + cpu: 4 + coreFraction: 50 + ram: 8 + diskSize: 50 + - hostname: "worker-3" + hostType: "vm" + osType: "Ubuntu 22.04 6.2.0-39-generic" + cpu: 4 + coreFraction: 50 + ram: 8 + diskSize: 50 + dkpParameters: + kubernetesVersion: "Automatic" + podSubnetCIDR: "10.112.0.0/16" + serviceSubnetCIDR: "10.225.0.0/16" + clusterDomain: "cluster.local" + registryRepo: "dev-registry.deckhouse.io/sys/deckhouse-oss" + devBranch: "main" + # Only the bare minimum is pre-enabled here. EnsureCephStorageClass will + # turn on sds-elastic + csi-ceph (and sds-local-volume as OSD backing + # automatically, unless CSI_CEPH_OSD_STORAGE_CLASS is provided). 
+ modules: + - name: "snapshot-controller" + version: 1 + enabled: true + modulePullOverride: "main" + dependencies: [] + - name: "sds-node-configurator" + version: 1 + enabled: true + settings: + enableThinProvisioning: true + dependencies: [] diff --git a/tests/csi-ceph/csi_ceph_suite_test.go b/tests/csi-ceph/csi_ceph_suite_test.go new file mode 100644 index 0000000..c8d1442 --- /dev/null +++ b/tests/csi-ceph/csi_ceph_suite_test.go @@ -0,0 +1,46 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csi_ceph + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/deckhouse/storage-e2e/internal/config" + "github.com/deckhouse/storage-e2e/internal/logger" +) + +var _ = BeforeSuite(func() { + Expect(config.ValidateEnvironment()).To(Succeed(), "validate environment") + Expect(logger.Initialize()).To(Succeed(), "initialize logger") +}) + +var _ = AfterSuite(func() { + if err := logger.Close(); err != nil { + GinkgoWriter.Printf("Warning: Failed to close logger: %v\n", err) + } +}) + +func TestCsiCeph(t *testing.T) { + RegisterFailHandler(Fail) + suiteConfig, reporterConfig := GinkgoConfiguration() + reporterConfig.Verbose = true + reporterConfig.ShowNodeEvents = false + RunSpecs(t, "csi-ceph (storage-e2e testkit)", suiteConfig, reporterConfig) +} diff --git a/tests/csi-ceph/csi_ceph_test.go b/tests/csi-ceph/csi_ceph_test.go new file mode 100644 index 0000000..9f9a600 --- /dev/null +++ b/tests/csi-ceph/csi_ceph_test.go @@ -0,0 +1,127 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package csi_ceph + +import ( + "context" + "fmt" + "os" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/deckhouse/storage-e2e/internal/config" + "github.com/deckhouse/storage-e2e/pkg/cluster" + k8s "github.com/deckhouse/storage-e2e/pkg/kubernetes" + "github.com/deckhouse/storage-e2e/pkg/testkit" +) + +const ( + // testStorageClassName matches what csi-ceph's smoke test in + // /csi-ceph/e2e also expects, so the two can share a cluster. 
+ testStorageClassName = "e2e-ceph-rbd-r1" + testNamespace = "e2e-csi-ceph-smoke" + testPVCName = "e2e-csi-ceph-smoke-pvc" +) + +var _ = Describe("csi-ceph smoke (storage-e2e reference)", Ordered, func() { + var testClusterResources *cluster.TestClusterResources + + BeforeAll(func() { + cluster.OutputEnvironmentVariables() + }) + + AfterAll(func() { + cluster.CleanupTestClusterResources(testClusterResources) + }) + + It("should create or connect to test cluster", func() { + testClusterResources = cluster.CreateOrConnectToTestCluster() + Expect(testClusterResources).NotTo(BeNil()) + Expect(testClusterResources.Kubeconfig).NotTo(BeNil()) + }) + + It("should ensure Ceph RBD StorageClass via Rook (EnsureCephStorageClass)", func() { + Expect(testClusterResources).NotTo(BeNil()) + + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute) + defer cancel() + + cfg := testkit.CephStorageClassConfig{ + StorageClassName: testStorageClassName, + ReplicaSize: 1, + FailureDomain: "osd", + + // When OSDStorageClass is empty EnsureCephStorageClass will fall + // back to EnsureDefaultStorageClass to create a sds-local-volume + // Thick SC on the fly. + OSDStorageClass: os.Getenv("CSI_CEPH_OSD_STORAGE_CLASS"), + OSDBackingIncludeMasters: true, + + // Let callers pin a specific csi-ceph image from a dev-registry PR. + CsiCephModulePullOverride: os.Getenv("CSI_CEPH_MODULE_PULL_OVERRIDE"), + } + + // VirtualDisk attachment for nested-VM clusters. + if testClusterResources.VMResources != nil { + cfg.OSDBackingBaseKubeconfig = testClusterResources.BaseKubeconfig + cfg.OSDBackingVMNamespace = testClusterResources.VMResources.Namespace + cfg.OSDBackingBaseStorageClassName = config.TestClusterStorageClass + } + + scName, err := testkit.EnsureCephStorageClass(ctx, testClusterResources.Kubeconfig, cfg) + Expect(err).NotTo(HaveOccurred(), "EnsureCephStorageClass") + Expect(scName).To(Equal(testStorageClassName)) + }) + + It("should provision a PVC against the Ceph StorageClass", func() { + Expect(testClusterResources).NotTo(BeNil()) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + _, err := k8s.CreateNamespaceIfNotExists(ctx, testClusterResources.Kubeconfig, testNamespace) + Expect(err).NotTo(HaveOccurred(), "create test namespace") + + apply, err := k8s.NewApplyClient(testClusterResources.Kubeconfig) + Expect(err).NotTo(HaveOccurred(), "create apply client") + + pvcYAML := fmt.Sprintf(`apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: %s + namespace: %s + labels: + e2e.csi-ceph/smoke: "true" +spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 1Gi + storageClassName: %s +`, testPVCName, testNamespace, testStorageClassName) + + Expect(apply.ApplyYAML(ctx, pvcYAML, testNamespace)).To(Succeed(), "apply PVC") + + clientset, err := k8s.NewClientsetWithRetry(ctx, testClusterResources.Kubeconfig) + Expect(err).NotTo(HaveOccurred(), "clientset") + + Expect(k8s.WaitForPVCsBound(ctx, clientset, testNamespace, "e2e.csi-ceph/smoke=true", 1, 60, 5*time.Second)). + To(Succeed(), "wait PVC bound") + }) +}) From af5b794cbf3e9d0724d1fa870d19bf49a8077634 Mon Sep 17 00:00:00 2001 From: Viktor Karpochev Date: Tue, 28 Apr 2026 17:03:44 +1000 Subject: [PATCH 02/14] Add Ceph testkit provisioning helpers Move reusable Rook/Ceph provisioning and CRC toggling into storage-e2e so csi-ceph e2e can consume the shared testkit instead of carrying duplicated setup code. 
Signed-off-by: Viktor Karpochev Made-with: Cursor --- ARCHITECTURE.md | 40 +++- README.md | 34 +++ pkg/FUNCTIONS_GLOSSARY.md | 67 ++++++ pkg/kubernetes/cephcluster.go | 2 +- pkg/kubernetes/modules.go | 25 ++- pkg/kubernetes/rookconfigoverride.go | 6 +- pkg/testkit/ceph.go | 32 +++ pkg/testkit/ceph_cluster.go | 295 +++++++++++++++++++++++++++ pkg/testkit/ceph_crc.go | 209 +++++++++++++++++++ 9 files changed, 692 insertions(+), 18 deletions(-) create mode 100644 pkg/testkit/ceph_cluster.go create mode 100644 pkg/testkit/ceph_crc.go diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 02bf8ae..db644c8 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -78,7 +78,10 @@ storage-e2e/ │ │ └── secrets.go # Secret operations │ │ │ └── testkit/ # Test framework utilities -│ └── stress-tests.go # Stress test helpers +│ ├── stress-tests.go # Stress test helpers +│ ├── storageclass.go # EnsureDefaultStorageClass (sds-local-volume) +│ ├── ceph.go # EnsureCephStorageClass (Rook + csi-ceph) +│ └── ceph_cluster.go # EnsureCephCluster (Rook only, no csi-ceph) │ ├── tests/ # Test suites │ ├── test-template/ # Template for creating new tests @@ -452,16 +455,33 @@ pkg/ │ ├── vms.go # VM lifecycle management │ └── TODO.md # Development notes ├── kubernetes/ -│ ├── apply.go # YAML manifest application -│ ├── modules.go # Module configuration with dependency handling -│ ├── namespace.go # Namespace utilities -│ ├── nodegroup.go # NodeGroup operations -│ ├── pod.go # Pod operations -│ ├── pvc.go # PVC operations -│ ├── resources.go # Resource utilities -│ └── secrets.go # Secret operations +│ ├── apply.go # YAML manifest application +│ ├── modules.go # Module configuration with dependency handling +│ ├── namespace.go # Namespace utilities +│ ├── nodegroup.go # NodeGroup operations +│ ├── pod.go # Pod operations +│ ├── pvc.go # PVC operations +│ ├── resources.go # Resource utilities +│ ├── secrets.go # Secret operations +│ ├── storageclass.go # Core StorageClass helpers +│ ├── storageclass_manage.go # Global default-SC management via ModuleConfig +│ ├── localstorageclass.go # sds-local-volume LocalStorageClass CR +│ ├── lvmvolumegroup.go # sds-node-configurator LVMVolumeGroup CR +│ ├── blockdevice.go # sds-node-configurator BlockDevice CR +│ ├── virtualdisk.go # DKP VirtualDisk CR +│ ├── vmpod.go # Helpers to exec inside VM-hosted pods +│ ├── volumesnapshotclass.go # VolumeSnapshotClass helpers +│ ├── rookconfigoverride.go # Rook global ceph.conf via rook-config-override CM +│ ├── cephcredentials.go # Read fsid/mons/admin-key from Rook secrets +│ ├── cephcluster.go # Rook CephCluster CRUD + wait (unstructured) +│ ├── cephblockpool.go # Rook CephBlockPool CRUD + wait (unstructured) +│ ├── cephclusterconnection.go # csi-ceph CephClusterConnection/Auth CRs +│ └── cephstorageclass.go # csi-ceph CephStorageClass CR └── testkit/ - └── stress-tests.go # Stress test helpers + ├── stress-tests.go # Stress test helpers + ├── storageclass.go # EnsureDefaultStorageClass (sds-local-volume) + ├── ceph.go # EnsureCephStorageClass / EnsureDefaultCephStorageClass + └── ceph_cluster.go # EnsureCephCluster (Rook-only, no csi-ceph) ``` **Responsibilities**: diff --git a/README.md b/README.md index a4e2cf7..013b2fe 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,40 @@ Template folder for creating new E2E tests. Contains a complete framework with: Use `./tests/create-test.sh ` to create a new test from this template. 
+### csi-ceph + +Reference testkit that provisions a full Rook-managed Ceph cluster and a +csi-ceph-backed `StorageClass` end-to-end, then verifies a plain `PVC` +bound against that class. + +Built around `testkit.EnsureCephStorageClass` (see +[pkg/FUNCTIONS_GLOSSARY.md](pkg/FUNCTIONS_GLOSSARY.md#ceph-storageclass-testkit)), +which handles: enabling `sds-node-configurator` + `sds-elastic` + `csi-ceph` +modules, optionally provisioning a `sds-local-volume` Thick `StorageClass` +for OSD backing, seeding `rook-config-override` (for things like +`ms_crc_data=false`), creating Rook `CephCluster` + `CephBlockPool`, and +wiring `CephClusterConnection` / `CephClusterAuthentication` / +`CephStorageClass` csi-ceph CRs. + +The testkit itself only runs a smoke check; downstream repos (e.g. +`csi-ceph`) can import `github.com/deckhouse/storage-e2e/pkg/testkit` and +reuse `EnsureCephStorageClass` inside their own Ginkgo specs. + +Testkit-specific env variables: + +- `CSI_CEPH_OSD_STORAGE_CLASS` — pre-existing block-mode StorageClass used to + back Rook OSD PVCs. When empty, a `sds-local-volume` Thick SC is + auto-provisioned via `EnsureDefaultStorageClass`. +- `CSI_CEPH_MODULE_PULL_OVERRIDE` — image tag for `csi-ceph`'s + ModulePullOverride (dev registries only, e.g. when testing a PR build). + +Run: + +```bash +source tests/csi-ceph/test_exports +go test -timeout=240m -v ./tests/csi-ceph -count=1 +``` + ### csi-all-stress-tests Stress tests for all CSI storage drivers. This test suite: diff --git a/pkg/FUNCTIONS_GLOSSARY.md b/pkg/FUNCTIONS_GLOSSARY.md index 47b7880..9f648e5 100644 --- a/pkg/FUNCTIONS_GLOSSARY.md +++ b/pkg/FUNCTIONS_GLOSSARY.md @@ -26,7 +26,14 @@ All exported functions available in the `pkg/` directory, grouped by resource. - [Secrets](#secrets) - [Modules](#modules) - [Retry](#retry) +- [Rook Config Override](#rook-config-override) +- [Ceph Credentials](#ceph-credentials) +- [CephCluster (Rook)](#cephcluster-rook) +- [CephBlockPool (Rook)](#cephblockpool-rook) +- [CephClusterConnection / CephClusterAuthentication (csi-ceph)](#cephclusterconnection--cephclusterauthentication-csi-ceph) +- [CephStorageClass (csi-ceph)](#cephstorageclass-csi-ceph) - [Default StorageClass (Testkit)](#default-storageclass-testkit) +- [Ceph StorageClass (Testkit)](#ceph-storageclass-testkit) - [Stress Tests (Testkit)](#stress-tests-testkit) --- @@ -238,6 +245,53 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `IsSSHConnectionError(err)` — Checks if an error specifically indicates SSH connection failure requiring reconnection. - `WithRetryAfter(cfg, err)` — Returns a modified retry config that respects `RetryAfterSeconds` hints from Kubernetes API errors. +## Rook Config Override + +`pkg/kubernetes/rookconfigoverride.go` + +- `SetRookConfigOverride(ctx, kubeconfig, namespace, globals)` — Creates or updates the `rook-config-override` ConfigMap in the Rook operator namespace. The provided map is rendered under `[global]` and Rook picks it up into every Ceph daemon's `ceph.conf` (used for `ms_crc_data`, `bdev_enable_discard`, and similar knobs). Keys are sorted for stable output. +- `DeleteRookConfigOverride(ctx, kubeconfig, namespace)` — Removes the ConfigMap; safe if it does not exist. 
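+
+A minimal usage sketch (assumes a reachable cluster via `*rest.Config`;
+`DefaultRookNamespace` is the package's exported constant for the Rook
+operator namespace, `d8-sds-elastic`):
+
+```go
+package cephsetup
+
+import (
+	"context"
+
+	"k8s.io/client-go/rest"
+
+	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
+)
+
+// PinCrcOff renders `ms_crc_data = false` under `[global]` so every Ceph
+// daemon picks the override up from its generated ceph.conf.
+func PinCrcOff(ctx context.Context, kubeconfig *rest.Config) error {
+	return kubernetes.SetRookConfigOverride(ctx, kubeconfig,
+		kubernetes.DefaultRookNamespace, map[string]string{"ms_crc_data": "false"})
+}
+```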
+ +## Ceph Credentials + +`pkg/kubernetes/cephcredentials.go` + +- `WaitForCephCredentials(ctx, kubeconfig, namespace, timeout)` — Polls Rook's `rook-ceph-mon` Secret and `rook-ceph-mon-endpoints` ConfigMap until all pieces required to connect a CSI client to the cluster (`fsid`, admin user, admin key, monitor endpoints) are present. Returns a `*CephCredentials`. + +## CephCluster (Rook) + +`pkg/kubernetes/cephcluster.go` + +- `CreateCephCluster(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephCluster` CR using `CephClusterConfig` (image, mon/mgr counts, network provider, OSD storage class / count / size, data-dir host path, etc.). Idempotent. +- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. +- `DeleteCephCluster(ctx, kubeconfig, namespace, name)` — Deletes the CR; NotFound is treated as success. Does NOT garbage-collect OSD data on host disks. + +## CephBlockPool (Rook) + +`pkg/kubernetes/cephblockpool.go` + +- `CreateCephBlockPool(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephBlockPool` from `CephBlockPoolConfig` (replicated with optional `requireSafeReplicaSize` override, or erasure-coded with `dataChunks`/`codingChunks`; `failureDomain`). +- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`. +- `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Idempotent delete. + +## CephClusterConnection / CephClusterAuthentication (csi-ceph) + +`pkg/kubernetes/cephclusterconnection.go` + +- `CreateCephClusterAuthentication(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterAuthentication` CR (`userID` + `userKey`) used by csi-ceph to log in to Ceph. +- `DeleteCephClusterAuthentication(ctx, kubeconfig, name)` — Idempotent delete. +- `CreateCephClusterConnection(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterConnection` CR (`clusterID == fsid`, `monitors`, `userID`, `userKey`). `clusterID` is immutable: existing-resource updates leave it unchanged and only sync monitors/user. +- `DeleteCephClusterConnection(ctx, kubeconfig, name)` — Idempotent delete. +- `WaitForCephClusterConnectionCreated(ctx, kubeconfig, name, timeout)` — Polls until csi-ceph reports `status.phase == "Created"` (credentials + monitors validated against the live Ceph cluster). + +## CephStorageClass (csi-ceph) + +`pkg/kubernetes/cephstorageclass.go` + +- `CreateCephStorageClass(ctx, kubeconfig, cfg)` — Creates or updates a csi-ceph `CephStorageClass` CR (RBD by default; CephFS when `Type == "CephFS"` and `CephFSName` / `CephFSPool` are set). The csi-ceph controller provisions a corresponding core `storage.k8s.io/v1 StorageClass` as a side effect. +- `DeleteCephStorageClass(ctx, kubeconfig, name)` — Idempotent delete; the controller removes the backing StorageClass. +- `WaitForCephStorageClassCreated(ctx, kubeconfig, name, timeout)` — Polls until `status.phase == "Created"`. + ## Default StorageClass (Testkit) `pkg/testkit/storageclass.go` @@ -245,6 +299,19 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `CreateDefaultStorageClass(ctx, kubeconfig, cfg)` — High-level helper: discovers nodes, enables sds-node-configurator/sds-local-volume modules, labels nodes, optionally attaches VirtualDisks, creates LVMVolumeGroups (Thick or Thin with thin pool), creates LocalStorageClass, waits for StorageClass. 
Configured via `DefaultStorageClassConfig`. - `EnsureDefaultStorageClass(ctx, kubeconfig, cfg)` — Idempotent wrapper around `CreateDefaultStorageClass`. Checks if StorageClass already exists, skips creation if so, then sets it as the cluster default via "global" ModuleConfig. +## Ceph StorageClass (Testkit) + +`pkg/testkit/ceph.go` + +- `EnsureCephStorageClass(ctx, kubeconfig, cfg)` — High-level end-to-end helper that turns an empty test cluster into one with a working csi-ceph `StorageClass`. Steps: (1) enable `sds-node-configurator`, `sds-elastic`, `csi-ceph` modules and wait Ready; (2) optionally call `EnsureDefaultStorageClass` to auto-provision a sds-local-volume SC for OSDs when `OSDStorageClass` is empty; (3) seed `rook-config-override` with `GlobalCephConfigOverrides` (e.g. `ms_crc_data=false`); (4) create Rook `CephCluster` and wait Created; (5) create `CephBlockPool` and wait Ready; (6) read fsid/monitors/admin-key from Rook-managed secrets; (7) wire csi-ceph by creating `CephClusterAuthentication` + `CephClusterConnection`; (8) create `CephStorageClass` and wait for the backing core StorageClass. Idempotent; returns the resulting StorageClass name. +- `EnsureDefaultCephStorageClass(ctx, kubeconfig, cfg)` — `EnsureCephStorageClass` + `SetGlobalDefaultStorageClass` so new PVCs without an explicit `storageClassName` use the provisioned Ceph RBD class. + +## Ceph Cluster (Testkit) — no csi-ceph wiring + +`pkg/testkit/ceph_cluster.go` + +- `EnsureCephCluster(ctx, kubeconfig, cfg)` — "Stop-before-csi-ceph" variant of `EnsureCephStorageClass`: brings up a Rook-managed Ceph cluster + CephBlockPool via sds-elastic alone. Steps: (1) enable `sds-node-configurator` + `sds-elastic` (does **not** enable `csi-ceph`); (2) resolve/provision OSD backing StorageClass (reuses `EnsureDefaultStorageClass`); (3) seed `rook-config-override` with `GlobalCephConfigOverrides`; (4) create Rook `CephCluster` and wait Created; (5) create `CephBlockPool` and wait Ready. Does not create `CephClusterConnection`/`CephClusterAuthentication`/`CephStorageClass`. Useful when tests need a live Ceph backend to talk to directly (e.g. from within csi-ceph's own e2e) without the testkit preselecting a csi-ceph-backed StorageClass. Idempotent; returns the pool name. + ## Stress Tests (Testkit) `pkg/testkit/stress-tests.go` diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go index d6a1ad8..95612b7 100644 --- a/pkg/kubernetes/cephcluster.go +++ b/pkg/kubernetes/cephcluster.go @@ -43,7 +43,7 @@ const ( DefaultCephClusterName = "ceph-cluster" DefaultCephImage = "quay.io/ceph/ceph:v18.2.7" DefaultDataDirHostPath = "/var/lib/rook" - DefaultOSDStorageClassSize = "20Gi" + DefaultOSDStorageClassSize = "10Gi" ) // CephClusterConfig describes a Rook-managed Ceph cluster suitable for e2e diff --git a/pkg/kubernetes/modules.go b/pkg/kubernetes/modules.go index 3b4cedf..94490a7 100644 --- a/pkg/kubernetes/modules.go +++ b/pkg/kubernetes/modules.go @@ -252,9 +252,18 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC settings = moduleConfig.Settings } - // Retry logic for webhook connection errors and network timeouts - maxRetries := 10 + // Retry logic for webhook connection errors and network timeouts. + // On freshly-bootstrapped Deckhouse clusters the validating-webhook-handler + // pod (or the d8-system Service endpoint backing it) can be unready for + // several minutes while the control plane converges. 
Our previous cap of + // 10 retries with exponential backoff topped out at ~3.7 minutes total + // which was not enough for the SAN stand — we'd fail Step 18 with + // "connection refused" during the first ModuleConfig write. Bumping to 60 + // attempts with delays capped at 30s gives us up to ~30 minutes of + // soft-retries, which easily outlives any realistic webhook cold start. + maxRetries := 60 retryDelay := 2 * time.Second + const maxRetryDelay = 30 * time.Second var lastErr error for attempt := 0; attempt < maxRetries; attempt++ { @@ -282,8 +291,12 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC case <-ctx.Done(): return ctx.Err() case <-time.After(retryDelay): - // Exponential backoff + // Exponential backoff, capped so we don't sleep forever + // between retries on a slow-to-converge cluster. retryDelay = time.Duration(float64(retryDelay) * 1.5) + if retryDelay > maxRetryDelay { + retryDelay = maxRetryDelay + } continue } } @@ -307,8 +320,12 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC case <-ctx.Done(): return ctx.Err() case <-time.After(retryDelay): - // Exponential backoff + // Exponential backoff, capped (see create branch above + // for the rationale — same webhook cold-start). retryDelay = time.Duration(float64(retryDelay) * 1.5) + if retryDelay > maxRetryDelay { + retryDelay = maxRetryDelay + } continue } } diff --git a/pkg/kubernetes/rookconfigoverride.go b/pkg/kubernetes/rookconfigoverride.go index 2027318..dab8aad 100644 --- a/pkg/kubernetes/rookconfigoverride.go +++ b/pkg/kubernetes/rookconfigoverride.go @@ -66,7 +66,7 @@ func SetRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespa return fmt.Errorf("failed to create clientset: %w", err) } - cfg := renderCephGlobalConfig(globals) + cfg := RenderCephGlobalConfig(globals) cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -120,10 +120,10 @@ func DeleteRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, name return nil } -// renderCephGlobalConfig renders a `[global]` section for ceph.conf from the +// RenderCephGlobalConfig renders a `[global]` section for ceph.conf from the // provided key/value pairs. Keys are sorted so the rendered output is stable // across calls with logically-equivalent maps (avoids unnecessary CM updates). -func renderCephGlobalConfig(globals map[string]string) string { +func RenderCephGlobalConfig(globals map[string]string) string { var b strings.Builder b.WriteString("[global]\n") diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go index f7e0e5e..a87a7f5 100644 --- a/pkg/testkit/ceph.go +++ b/pkg/testkit/ceph.go @@ -367,6 +367,38 @@ func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce return cfg.StorageClassName, nil } +// TeardownCephStorageClass removes the csi-ceph wiring + Rook CephCluster + +// pool + rook-config-override produced by EnsureCephStorageClass. Safe to +// call on partial state (missing resources are skipped — the first error is +// returned but subsequent deletions are still attempted). +// +// It deliberately does NOT disable the Deckhouse modules: they may be owned +// by the cluster admin, and re-bootstrapping is cheaper than a full +// module-disable → module-enable cycle. 
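+//
+// Typical cleanup usage, sketched (cfg is the same value that was passed
+// to EnsureCephStorageClass earlier in the suite):
+//
+//	_ = TeardownCephStorageClass(ctx, kubeconfig, cfg) // best-effort AfterSuite cleanup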
+func TeardownCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + cfg.applyDefaults() + + var firstErr error + note := func(err error, what string) { + if err == nil { + return + } + logger.Warn("teardown: %s: %v", what, err) + if firstErr == nil { + firstErr = fmt.Errorf("%s: %w", what, err) + } + } + + logger.Info("Tearing down csi-ceph StorageClass %q", cfg.StorageClassName) + note(kubernetes.DeleteCephStorageClass(ctx, kubeconfig, cfg.StorageClassName), "delete CephStorageClass") + note(kubernetes.DeleteCephClusterConnection(ctx, kubeconfig, cfg.ClusterConnectionName), "delete CephClusterConnection") + note(kubernetes.DeleteCephClusterAuthentication(ctx, kubeconfig, cfg.ClusterAuthenticationName), "delete CephClusterAuthentication") + note(kubernetes.DeleteCephBlockPool(ctx, kubeconfig, cfg.Namespace, cfg.PoolName), "delete CephBlockPool") + note(kubernetes.DeleteCephCluster(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName), "delete CephCluster") + note(kubernetes.DeleteRookConfigOverride(ctx, kubeconfig, cfg.Namespace), "delete rook-config-override") + return firstErr +} + // EnsureDefaultCephStorageClass is EnsureCephStorageClass + SetGlobalDefaultStorageClass. // After this call new PVCs without an explicit storageClassName will use the // freshly-provisioned Ceph RBD class. diff --git a/pkg/testkit/ceph_cluster.go b/pkg/testkit/ceph_cluster.go new file mode 100644 index 0000000..cf683f2 --- /dev/null +++ b/pkg/testkit/ceph_cluster.go @@ -0,0 +1,295 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "time" + + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// RookCephClusterConfig configures EnsureCephCluster — the "just bring up +// a Rook-managed Ceph cluster + pool" variant of EnsureCephStorageClass. +// +// Unlike EnsureCephStorageClass, EnsureCephCluster does NOT: +// - enable the `csi-ceph` Deckhouse module; +// - create CephClusterConnection / CephClusterAuthentication CRs; +// - create a CephStorageClass CR / materialize a core StorageClass. +// +// It stops once the Rook CephCluster is Created and the CephBlockPool is +// Ready. Use this when the test suite needs a live Ceph backend to exercise +// (e.g. to run rbd / ceph CLI against it, or to hook some other client) but +// deliberately does NOT want csi-ceph in the picture. +type RookCephClusterConfig struct { + // --- Namespacing / naming --- + + // Namespace is the Rook / sds-elastic namespace. Default: "d8-sds-elastic". + Namespace string + + // CephClusterName is the Rook CephCluster name. Default: "ceph-cluster". + CephClusterName string + + // CephImage is the Ceph container image. Default: + // "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // MonCount / MgrCount are the Rook mon/mgr replica counts. + // Defaults: 1 / 1 (appropriate for 1..3-node test clusters). 
+	MonCount int
+	MgrCount int
+
+	// NetworkProvider: "" for CNI (default), "host" for host networking.
+	NetworkProvider     string
+	PublicNetworkCIDRs  []string
+	ClusterNetworkCIDRs []string
+
+	// GlobalCephConfigOverrides populates `rook-config-override` under
+	// `[global]`, e.g. {"ms_crc_data": "false"} for the csi-ceph
+	// msCrcData matrix. nil leaves the ConfigMap otherwise empty.
+	GlobalCephConfigOverrides map[string]string
+
+	// --- OSD backing ---
+
+	// OSDStorageClass is a block-capable StorageClass used to back OSD PVCs.
+	// When empty, EnsureDefaultStorageClass is invoked with
+	// OSDBacking* to provision a sds-local-volume SC on the fly.
+	OSDStorageClass string
+
+	// OSDCount is the number of OSDs. Default: 1.
+	OSDCount int
+
+	// OSDSize is the size of each OSD PVC. Default: kubernetes.DefaultOSDStorageClassSize.
+	OSDSize string
+
+	// --- Fallback SC provisioning via sds-local-volume ---
+
+	// OSDBackingStorageClassName names the sds-local-volume SC we auto-
+	// provision for OSDs. Default: "sds-local-volume-thick-ceph-osd".
+	OSDBackingStorageClassName string
+
+	// OSDBackingLVMType ("Thick"/"Thin"). Default: "Thick".
+	OSDBackingLVMType string
+
+	OSDBackingIncludeMasters       bool
+	OSDBackingBaseKubeconfig       *rest.Config
+	OSDBackingVMNamespace          string
+	OSDBackingBaseStorageClassName string
+
+	// --- CephBlockPool ---
+
+	// PoolName is the Rook CephBlockPool name. Default:
+	// "ceph-rbd-r<ReplicaSize>".
+	PoolName string
+
+	// ReplicaSize is the CephBlockPool replication factor. Default: 1.
+	ReplicaSize int
+
+	// FailureDomain: "host" or "osd". Default: "osd" when ReplicaSize==1,
+	// "host" otherwise.
+	FailureDomain string
+
+	// --- Modules ---
+
+	// SkipModuleEnablement disables the module-enable step (useful when
+	// the caller has already enabled sds-node-configurator + sds-elastic
+	// through other means).
+	SkipModuleEnablement bool
+
+	// SdsElasticSettings overrides `spec.settings` of the sds-elastic
+	// ModuleConfig. Defaults to an empty map.
+ SdsElasticSettings map[string]interface{} + + // --- Timeouts --- + + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m +} + +func (c *RookCephClusterConfig) applyDefaults() { + if c.Namespace == "" { + c.Namespace = kubernetes.DefaultRookNamespace + } + if c.CephClusterName == "" { + c.CephClusterName = kubernetes.DefaultCephClusterName + } + if c.CephImage == "" { + c.CephImage = kubernetes.DefaultCephImage + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = kubernetes.DefaultOSDStorageClassSize + } + if c.OSDBackingStorageClassName == "" { + c.OSDBackingStorageClassName = "sds-local-volume-thick-ceph-osd" + } + if c.OSDBackingLVMType == "" { + c.OSDBackingLVMType = "Thick" + } + if c.ReplicaSize <= 0 { + c.ReplicaSize = 1 + } + if c.PoolName == "" { + c.PoolName = fmt.Sprintf("ceph-rbd-r%d", c.ReplicaSize) + } + if c.FailureDomain == "" { + if c.ReplicaSize == 1 { + c.FailureDomain = "osd" + } else { + c.FailureDomain = "host" + } + } + if c.ModulesReadyTimeout == 0 { + c.ModulesReadyTimeout = 15 * time.Minute + } + if c.CephClusterReadyTimeout == 0 { + c.CephClusterReadyTimeout = 20 * time.Minute + } + if c.CephPoolReadyTimeout == 0 { + c.CephPoolReadyTimeout = 10 * time.Minute + } +} + +// EnsureCephCluster brings up (or reuses) a Rook-managed Ceph cluster plus +// a CephBlockPool via sds-elastic — without touching csi-ceph. +// +// Flow: +// 1. Enable Deckhouse modules: sds-node-configurator + sds-elastic. +// 2. Resolve an OSD backing StorageClass (re-using EnsureDefaultStorageClass +// when none is pre-provided). +// 3. Seed `rook-config-override` with per-test global Ceph settings. +// 4. Create the Rook CephCluster and wait until it is Created. +// 5. Create the CephBlockPool and wait until it is Ready. +// +// Idempotent: re-running picks up existing resources. Returns the pool +// name (same one callers would reference as Ceph pool, e.g. for a +// subsequent `rbd create`/`CephStorageClass.rbd.pool`). 
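+//
+// Sketch of a typical call (zero values fall back to the defaults above):
+//
+//	poolName, err := EnsureCephCluster(ctx, kubeconfig, RookCephClusterConfig{
+//		GlobalCephConfigOverrides: map[string]string{"ms_crc_data": "false"},
+//	})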
+func EnsureCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg RookCephClusterConfig) (string, error) { + cfg.applyDefaults() + + logger.Step(1, "Enabling Deckhouse modules for Rook (sds-node-configurator, sds-elastic)") + if !cfg.SkipModuleEnablement { + if err := ensureRookModules(ctx, kubeconfig, cfg.SdsElasticSettings, cfg.ModulesReadyTimeout); err != nil { + return "", fmt.Errorf("enable rook modules: %w", err) + } + } + logger.StepComplete(1, "Modules enabled") + + logger.Step(2, "Resolving OSD backing StorageClass") + osdSC := cfg.OSDStorageClass + if osdSC == "" { + local := DefaultStorageClassConfig{ + StorageClassName: cfg.OSDBackingStorageClassName, + LVMType: cfg.OSDBackingLVMType, + IncludeMasters: cfg.OSDBackingIncludeMasters, + BaseKubeconfig: cfg.OSDBackingBaseKubeconfig, + VMNamespace: cfg.OSDBackingVMNamespace, + BaseStorageClassName: cfg.OSDBackingBaseStorageClassName, + } + name, err := EnsureDefaultStorageClass(ctx, kubeconfig, local) + if err != nil { + return "", fmt.Errorf("resolve OSD backing StorageClass: %w", err) + } + osdSC = name + } else { + logger.Info("Using pre-existing OSD backing StorageClass %s", osdSC) + } + logger.StepComplete(2, "OSD backing StorageClass: %s", osdSC) + + logger.Step(3, "Seeding rook-config-override ConfigMap") + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, cfg.Namespace, cfg.GlobalCephConfigOverrides); err != nil { + return "", fmt.Errorf("set rook-config-override: %w", err) + } + logger.StepComplete(3, "rook-config-override ready (%d global key(s))", len(cfg.GlobalCephConfigOverrides)) + + logger.Step(4, "Creating Rook CephCluster %s/%s", cfg.Namespace, cfg.CephClusterName) + if err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{ + Name: cfg.CephClusterName, + Namespace: cfg.Namespace, + CephImage: cfg.CephImage, + MonCount: cfg.MonCount, + MgrCount: cfg.MgrCount, + NetworkProvider: cfg.NetworkProvider, + PublicNetworkCIDRs: cfg.PublicNetworkCIDRs, + ClusterNetworkCIDRs: cfg.ClusterNetworkCIDRs, + OSDStorageClass: osdSC, + OSDCount: cfg.OSDCount, + OSDSize: cfg.OSDSize, + }); err != nil { + return "", fmt.Errorf("create CephCluster: %w", err) + } + if err := kubernetes.WaitForCephClusterReady(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, cfg.CephClusterReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephCluster: %w", err) + } + logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) + + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + + logger.Success("Ceph cluster ready: CephCluster %s/%s + pool %s (no csi-ceph wiring)", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName) + return cfg.PoolName, nil +} + +// ensureRookModules enables sds-node-configurator + sds-elastic (and nothing +// else). 
Used by EnsureCephCluster and as the Rook-only step of
+// EnsureCephStorageClass's module list.
+func ensureRookModules(ctx context.Context, kubeconfig *rest.Config, sdsElasticSettings map[string]interface{}, readyTimeout time.Duration) error {
+	if sdsElasticSettings == nil {
+		sdsElasticSettings = map[string]interface{}{}
+	}
+	modules := []kubernetes.ModuleSpec{
+		{
+			Name:    "sds-node-configurator",
+			Version: 1,
+			Enabled: true,
+		},
+		{
+			Name:         "sds-elastic",
+			Version:      1,
+			Enabled:      true,
+			Settings:     sdsElasticSettings,
+			Dependencies: []string{"sds-node-configurator"},
+		},
+	}
+	return kubernetes.EnableModulesAndWait(ctx, kubeconfig, nil, nil, modules, readyTimeout)
+}
diff --git a/pkg/testkit/ceph_crc.go b/pkg/testkit/ceph_crc.go
new file mode 100644
index 0000000..5ce4708
--- /dev/null
+++ b/pkg/testkit/ceph_crc.go
@@ -0,0 +1,209 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package testkit
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/rest"
+
+	"github.com/deckhouse/storage-e2e/internal/logger"
+	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
+)
+
+// EnableServerCRC is the readable counterpart of
+// `SetMsCrcDataOnServer(..., ptr.To(true))`. It writes
+// `ms_crc_data = true` into rook-config-override and rolling-restarts
+// mon/mgr/osd so the override is live on every daemon before returning.
+//
+// Useful for tests that want the Ceph cluster in an explicit CRC-on state
+// (the default Ceph behaviour, but pinned in the ConfigMap so the test
+// can assert on it).
+func EnableServerCRC(ctx context.Context, kubeconfig *rest.Config, namespace string) error {
+	enabled := true
+	return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, &enabled)
+}
+
+// DisableServerCRC flips Ceph into the "CRC off" state:
+// `ms_crc_data = false` in rook-config-override + rolling-restart of
+// mon/mgr/osd. Paired with a csi-ceph client that still defaults to
+// `msCrcData=true`, this reproduces the msCrcData matrix mismatch case.
+func DisableServerCRC(ctx context.Context, kubeconfig *rest.Config, namespace string) error {
+	enabled := false
+	return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, &enabled)
+}
+
+// ResetServerCRCToDefault removes `ms_crc_data` from rook-config-override
+// (rendered `[global]` section becomes empty). Ceph falls back to its
+// compile-time default (ms_crc_data = true), matching a freshly-installed
+// cluster. Convenient for AfterAll / AfterEach restoration.
+func ResetServerCRCToDefault(ctx context.Context, kubeconfig *rest.Config, namespace string) error {
+	return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, nil)
+}
+
+// SetMsCrcDataOnServer rewrites `rook-config-override` so that only
+// `ms_crc_data = <value>` ends up under `[global]` (nil removes the key
+// entirely, falling back to Ceph's compile-time default = true).
+// +// After flipping the ConfigMap, it force-restarts mon/mgr/osd Deployments +// in the Rook namespace and waits for them to converge. Idempotent: when +// the ConfigMap already encodes the desired state, nothing is restarted. +// +// Prefer EnableServerCRC / DisableServerCRC / ResetServerCRCToDefault at +// call sites for readability; this lower-level primitive exists so a +// boolean test parameter (e.g. the 2×2 matrix) doesn't have to branch. +func SetMsCrcDataOnServer(ctx context.Context, kubeconfig *rest.Config, namespace string, enabled *bool) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + + overrides := renderMsCrcDataOverrides(enabled) + wantConfig := kubernetes.RenderCephGlobalConfig(overrides) + + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + existing, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, kubernetes.RookConfigOverrideName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("get %s/%s: %w", namespace, kubernetes.RookConfigOverrideName, err) + } + currentConfig := "" + if existing != nil { + currentConfig = existing.Data["config"] + } + + if currentConfig == wantConfig { + logger.Info("rook-config-override already has ms_crc_data=%s, skipping daemon restart", + msCrcDataString(enabled)) + return nil + } + + logger.Info("Setting server-side ms_crc_data=%s in rook-config-override", msCrcDataString(enabled)) + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, namespace, overrides); err != nil { + return fmt.Errorf("set rook-config-override: %w", err) + } + + // Rook operator notices CM changes on its next reconcile loop; force + // a rolling restart of the core Ceph daemons so the new + // `/etc/ceph/ceph.conf` takes effect right now. + if err := RestartCephDaemons(ctx, kubeconfig, namespace, 10*time.Minute); err != nil { + return fmt.Errorf("restart ceph daemons: %w", err) + } + logger.Success("Server-side ms_crc_data=%s is now live on all Ceph daemons", msCrcDataString(enabled)) + return nil +} + +// RestartCephDaemons rollout-restarts Rook's mon/mgr/osd Deployments and +// waits for them to reach their desired ready replica count. +func RestartCephDaemons(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + // Rook labels each Ceph daemon Deployment with `app=rook-ceph-`. + // We restart the daemons that actually consume `/etc/ceph/ceph.conf`: + // mon, mgr and osd. (The operator itself reads rook-config-override + // directly and does not need a bounce.) 
+ labelSel := "app in (rook-ceph-mon,rook-ceph-mgr,rook-ceph-osd)" + deployList, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSel}) + if err != nil { + return fmt.Errorf("list ceph daemon Deployments (%s): %w", labelSel, err) + } + if len(deployList.Items) == 0 { + return fmt.Errorf("no Ceph daemon Deployments matched %q in namespace %s — is Rook running?", labelSel, namespace) + } + + names := make([]string, 0, len(deployList.Items)) + for i := range deployList.Items { + names = append(names, deployList.Items[i].Name) + } + logger.Info("Rolling-restarting %d Ceph daemon Deployment(s): %v", len(names), names) + + stamp := time.Now().UTC().Format(time.RFC3339Nano) + patch := []byte(fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"storage-e2e/restarted-at":%q}}}}}`, stamp)) + + for _, name := range names { + if _, err := clientset.AppsV1().Deployments(namespace).Patch( + ctx, name, types.StrategicMergePatchType, patch, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("annotate Deployment %s/%s for rollout: %w", namespace, name, err) + } + } + + waitCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + ready := 0 + for _, name := range names { + d, err := clientset.AppsV1().Deployments(namespace).Get(waitCtx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get Deployment %s/%s: %w", namespace, name, err) + } + desired := int32(1) + if d.Spec.Replicas != nil { + desired = *d.Spec.Replicas + } + if d.Status.ObservedGeneration >= d.Generation && d.Status.UpdatedReplicas >= desired && d.Status.AvailableReplicas >= desired { + ready++ + } + } + if ready == len(names) { + logger.Success("All %d Ceph daemon Deployment(s) report Ready after rollout", len(names)) + return nil + } + select { + case <-waitCtx.Done(): + return fmt.Errorf("timed out after %s waiting for %d Ceph daemon Deployments to become ready (%d/%d)", + timeout, len(names), ready, len(names)) + case <-ticker.C: + } + } +} + +// renderMsCrcDataOverrides turns a *bool into the minimal rook-config-override +// key/value map used by the msCrcData test matrix. +func renderMsCrcDataOverrides(enabled *bool) map[string]string { + if enabled == nil { + return nil + } + return map[string]string{ + "ms_crc_data": strconv.FormatBool(*enabled), + } +} + +func msCrcDataString(enabled *bool) string { + if enabled == nil { + return "" + } + return strconv.FormatBool(*enabled) +} From fca3bf9a0c1c8264702e9a86b21307778c2c89bb Mon Sep 17 00:00:00 2001 From: Viktor Karpochev Date: Tue, 28 Apr 2026 17:43:30 +1000 Subject: [PATCH 03/14] Remove csi-ceph test suite from storage-e2e Keep storage-e2e focused on reusable Ceph testkit helpers while the csi-ceph repository owns its module-specific e2e suite. 
Signed-off-by: Viktor Karpochev Made-with: Cursor --- tests/csi-ceph/cluster_config.yml | 56 ------------ tests/csi-ceph/csi_ceph_suite_test.go | 46 ---------- tests/csi-ceph/csi_ceph_test.go | 127 -------------------------- 3 files changed, 229 deletions(-) delete mode 100644 tests/csi-ceph/cluster_config.yml delete mode 100644 tests/csi-ceph/csi_ceph_suite_test.go delete mode 100644 tests/csi-ceph/csi_ceph_test.go diff --git a/tests/csi-ceph/cluster_config.yml b/tests/csi-ceph/cluster_config.yml deleted file mode 100644 index 62fcdbc..0000000 --- a/tests/csi-ceph/cluster_config.yml +++ /dev/null @@ -1,56 +0,0 @@ -# csi-ceph smoke testkit: 3 workers to let a Rook Ceph cluster come up in a -# realistic layout (1 mon/mgr, 3 OSDs by default). Masters are untainted so -# the mon/mgr can land there on tiny clusters as well. -clusterDefinition: - masters: - - hostname: "master-1" - hostType: "vm" - osType: "Ubuntu 22.04 6.2.0-39-generic" - cpu: 4 - coreFraction: 50 - ram: 8 - diskSize: 50 - workers: - - hostname: "worker-1" - hostType: "vm" - osType: "Ubuntu 22.04 6.2.0-39-generic" - cpu: 4 - coreFraction: 50 - ram: 8 - diskSize: 50 - - hostname: "worker-2" - hostType: "vm" - osType: "Ubuntu 22.04 6.2.0-39-generic" - cpu: 4 - coreFraction: 50 - ram: 8 - diskSize: 50 - - hostname: "worker-3" - hostType: "vm" - osType: "Ubuntu 22.04 6.2.0-39-generic" - cpu: 4 - coreFraction: 50 - ram: 8 - diskSize: 50 - dkpParameters: - kubernetesVersion: "Automatic" - podSubnetCIDR: "10.112.0.0/16" - serviceSubnetCIDR: "10.225.0.0/16" - clusterDomain: "cluster.local" - registryRepo: "dev-registry.deckhouse.io/sys/deckhouse-oss" - devBranch: "main" - # Only the bare minimum is pre-enabled here. EnsureCephStorageClass will - # turn on sds-elastic + csi-ceph (and sds-local-volume as OSD backing - # automatically, unless CSI_CEPH_OSD_STORAGE_CLASS is provided). - modules: - - name: "snapshot-controller" - version: 1 - enabled: true - modulePullOverride: "main" - dependencies: [] - - name: "sds-node-configurator" - version: 1 - enabled: true - settings: - enableThinProvisioning: true - dependencies: [] diff --git a/tests/csi-ceph/csi_ceph_suite_test.go b/tests/csi-ceph/csi_ceph_suite_test.go deleted file mode 100644 index c8d1442..0000000 --- a/tests/csi-ceph/csi_ceph_suite_test.go +++ /dev/null @@ -1,46 +0,0 @@ -/* -Copyright 2025 Flant JSC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package csi_ceph - -import ( - "testing" - - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" - - "github.com/deckhouse/storage-e2e/internal/config" - "github.com/deckhouse/storage-e2e/internal/logger" -) - -var _ = BeforeSuite(func() { - Expect(config.ValidateEnvironment()).To(Succeed(), "validate environment") - Expect(logger.Initialize()).To(Succeed(), "initialize logger") -}) - -var _ = AfterSuite(func() { - if err := logger.Close(); err != nil { - GinkgoWriter.Printf("Warning: Failed to close logger: %v\n", err) - } -}) - -func TestCsiCeph(t *testing.T) { - RegisterFailHandler(Fail) - suiteConfig, reporterConfig := GinkgoConfiguration() - reporterConfig.Verbose = true - reporterConfig.ShowNodeEvents = false - RunSpecs(t, "csi-ceph (storage-e2e testkit)", suiteConfig, reporterConfig) -} diff --git a/tests/csi-ceph/csi_ceph_test.go b/tests/csi-ceph/csi_ceph_test.go deleted file mode 100644 index 9f9a600..0000000 --- a/tests/csi-ceph/csi_ceph_test.go +++ /dev/null @@ -1,127 +0,0 @@ -/* -Copyright 2025 Flant JSC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package csi_ceph - -import ( - "context" - "fmt" - "os" - "time" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - - "github.com/deckhouse/storage-e2e/internal/config" - "github.com/deckhouse/storage-e2e/pkg/cluster" - k8s "github.com/deckhouse/storage-e2e/pkg/kubernetes" - "github.com/deckhouse/storage-e2e/pkg/testkit" -) - -const ( - // testStorageClassName matches what csi-ceph's smoke test in - // /csi-ceph/e2e also expects, so the two can share a cluster. - testStorageClassName = "e2e-ceph-rbd-r1" - testNamespace = "e2e-csi-ceph-smoke" - testPVCName = "e2e-csi-ceph-smoke-pvc" -) - -var _ = Describe("csi-ceph smoke (storage-e2e reference)", Ordered, func() { - var testClusterResources *cluster.TestClusterResources - - BeforeAll(func() { - cluster.OutputEnvironmentVariables() - }) - - AfterAll(func() { - cluster.CleanupTestClusterResources(testClusterResources) - }) - - It("should create or connect to test cluster", func() { - testClusterResources = cluster.CreateOrConnectToTestCluster() - Expect(testClusterResources).NotTo(BeNil()) - Expect(testClusterResources.Kubeconfig).NotTo(BeNil()) - }) - - It("should ensure Ceph RBD StorageClass via Rook (EnsureCephStorageClass)", func() { - Expect(testClusterResources).NotTo(BeNil()) - - ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute) - defer cancel() - - cfg := testkit.CephStorageClassConfig{ - StorageClassName: testStorageClassName, - ReplicaSize: 1, - FailureDomain: "osd", - - // When OSDStorageClass is empty EnsureCephStorageClass will fall - // back to EnsureDefaultStorageClass to create a sds-local-volume - // Thick SC on the fly. - OSDStorageClass: os.Getenv("CSI_CEPH_OSD_STORAGE_CLASS"), - OSDBackingIncludeMasters: true, - - // Let callers pin a specific csi-ceph image from a dev-registry PR. - CsiCephModulePullOverride: os.Getenv("CSI_CEPH_MODULE_PULL_OVERRIDE"), - } - - // VirtualDisk attachment for nested-VM clusters. 
- if testClusterResources.VMResources != nil { - cfg.OSDBackingBaseKubeconfig = testClusterResources.BaseKubeconfig - cfg.OSDBackingVMNamespace = testClusterResources.VMResources.Namespace - cfg.OSDBackingBaseStorageClassName = config.TestClusterStorageClass - } - - scName, err := testkit.EnsureCephStorageClass(ctx, testClusterResources.Kubeconfig, cfg) - Expect(err).NotTo(HaveOccurred(), "EnsureCephStorageClass") - Expect(scName).To(Equal(testStorageClassName)) - }) - - It("should provision a PVC against the Ceph StorageClass", func() { - Expect(testClusterResources).NotTo(BeNil()) - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - - _, err := k8s.CreateNamespaceIfNotExists(ctx, testClusterResources.Kubeconfig, testNamespace) - Expect(err).NotTo(HaveOccurred(), "create test namespace") - - apply, err := k8s.NewApplyClient(testClusterResources.Kubeconfig) - Expect(err).NotTo(HaveOccurred(), "create apply client") - - pvcYAML := fmt.Sprintf(`apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: %s - namespace: %s - labels: - e2e.csi-ceph/smoke: "true" -spec: - accessModes: [ "ReadWriteOnce" ] - resources: - requests: - storage: 1Gi - storageClassName: %s -`, testPVCName, testNamespace, testStorageClassName) - - Expect(apply.ApplyYAML(ctx, pvcYAML, testNamespace)).To(Succeed(), "apply PVC") - - clientset, err := k8s.NewClientsetWithRetry(ctx, testClusterResources.Kubeconfig) - Expect(err).NotTo(HaveOccurred(), "clientset") - - Expect(k8s.WaitForPVCsBound(ctx, clientset, testNamespace, "e2e.csi-ceph/smoke=true", 1, 60, 5*time.Second)). - To(Succeed(), "wait PVC bound") - }) -}) From f162d5a9474b8ad3168605fe2c1b65932971e013 Mon Sep 17 00:00:00 2001 From: Viktor Karpochev Date: Tue, 28 Apr 2026 18:02:50 +1000 Subject: [PATCH 04/14] Fix Ceph testkit docs Keep the public Ceph helper comments aligned with the 10Gi OSD default and avoid referring to the old full 2x2 CRC matrix. Signed-off-by: Viktor Karpochev Made-with: Cursor --- pkg/kubernetes/cephcluster.go | 20 ++++++++++---------- pkg/testkit/ceph.go | 2 +- pkg/testkit/ceph_crc.go | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go index 95612b7..eeb1a14 100644 --- a/pkg/kubernetes/cephcluster.go +++ b/pkg/kubernetes/cephcluster.go @@ -100,7 +100,7 @@ type CephClusterConfig struct { // OSDCount is the number of OSDs to provision (default: 1). OSDCount int - // OSDSize is the size of each OSD PVC (default: "20Gi"). + // OSDSize is the size of each OSD PVC (default: "10Gi"). 
 	OSDSize string
 
 	// OSDDeviceSetName is the `storageClassDeviceSets[].name` (default:
@@ -213,8 +213,8 @@ func buildCephClusterSpec(cfg CephClusterConfig) map[string]interface{} {
 			"image":            cfg.CephImage,
 			"allowUnsupported": *cfg.AllowUnsupportedCephVersion,
 		},
-		"dataDirHostPath":   cfg.DataDirHostPath,
-		"skipUpgradeChecks": false,
+		"dataDirHostPath": cfg.DataDirHostPath,
+		"skipUpgradeChecks": false,
 		"continueUpgradeAfterChecksEvenIfNotHealthy": false,
 		"mon": map[string]interface{}{
 			"count": int64(cfg.MonCount),
@@ -257,10 +257,10 @@ func buildCephClusterSpec(cfg CephClusterConfig) map[string]interface{} {
 		"useAllDevices": false,
 		"storageClassDeviceSets": []interface{}{
 			map[string]interface{}{
-				"name":            cfg.OSDDeviceSetName,
-				"count":           int64(cfg.OSDCount),
-				"portable":        false,
-				"tuneDeviceClass": true,
+				"name": cfg.OSDDeviceSetName,
+				"count": int64(cfg.OSDCount),
+				"portable": false,
+				"tuneDeviceClass": true,
 				"volumeClaimTemplates": []interface{}{
 					map[string]interface{}{
 						"metadata": map[string]interface{}{
@@ -287,9 +287,9 @@ func buildCephClusterSpec(cfg CephClusterConfig) map[string]interface{} {
 	network := map[string]interface{}{
 		"provider": cfg.NetworkProvider,
 		"connections": map[string]interface{}{
-			"encryption":   map[string]interface{}{"enabled": false},
-			"compression":  map[string]interface{}{"enabled": false},
-			"requireMsgr2": false,
+			"encryption": map[string]interface{}{"enabled": false},
+			"compression": map[string]interface{}{"enabled": false},
+			"requireMsgr2": false,
 		},
 	}
 
diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go
index a87a7f5..29c09ad 100644
--- a/pkg/testkit/ceph.go
+++ b/pkg/testkit/ceph.go
@@ -90,7 +90,7 @@ type CephStorageClassConfig struct {
 	// OSDCount is the number of OSDs. Default: 1.
 	OSDCount int
 
-	// OSDSize is the size of each OSD PVC. Default: "20Gi".
+	// OSDSize is the size of each OSD PVC. Default: "10Gi".
 	OSDSize string
 
 	// --- Fallback SC provisioning via sds-local-volume (when OSDStorageClass is empty) ---
diff --git a/pkg/testkit/ceph_crc.go b/pkg/testkit/ceph_crc.go
index 5ce4708..5214fde 100644
--- a/pkg/testkit/ceph_crc.go
+++ b/pkg/testkit/ceph_crc.go
@@ -71,7 +71,7 @@ func ResetServerCRCToDefault(ctx context.Context, kubeconfig *rest.Config, names
 //
 // Prefer EnableServerCRC / DisableServerCRC / ResetServerCRCToDefault at
 // call sites for readability; this lower-level primitive exists so a
-// boolean test parameter (e.g. the 2×2 matrix) doesn't have to branch.
+// boolean test parameter (e.g. a CRC compatibility matrix) doesn't have to branch.
 func SetMsCrcDataOnServer(ctx context.Context, kubeconfig *rest.Config, namespace string, enabled *bool) error {
 	if namespace == "" {
 		namespace = kubernetes.DefaultRookNamespace

From 8b6c19ea04708076f5df1d04835b9b221d289825 Mon Sep 17 00:00:00 2001
From: Aleksandr Zimin
Date: Mon, 4 May 2026 18:40:26 +0300
Subject: [PATCH 05/14] Add CephFS support to Ceph testkit

Extend storage-e2e so callers can provision a CephFS-backed
CephStorageClass alongside the existing RBD path.

* New pkg/kubernetes/cephfilesystem.go with idempotent CreateCephFilesystem
  / WaitForCephFilesystemReady / DeleteCephFilesystem helpers (single
  replicated metadata pool + one replicated data pool, configurable failure
  domain and MDS active count). WaitForCephFilesystemReady accepts both
  status.phase=Ready and status.conditions[Ready]=True so it works across
  Rook revisions. Adds CephFSDataPoolFullName helper that encodes Rook's
  <fsName>-<dataPoolName> pool naming convention so callers can feed the
  right value into CephStorageClass.spec.cephFS.pool.
* pkg/testkit/ceph.go: CephStorageClassConfig grows a Type field ("RBD"
  default / "CephFS") plus CephFSName, CephFSDataPoolName,
  CephFS{Metadata,Data}Replicas, CephFSActiveMDSCount and
  CephFilesystemReadyTimeout knobs. EnsureCephStorageClass step 5 now
  branches on Type to create the matching pool primitive, and step 8 wires
  the resulting CephStorageClass with rbd.pool or cephFS.{fsName,pool}
  accordingly. TeardownCephStorageClass deletes the right Rook primitive
  based on Type.

* New SkipClusterTeardown flag on CephStorageClassConfig: when several
  StorageClasses share one CephCluster, every teardown except the last one
  sets it to true so only the owning call removes the underlying
  CephCluster and rook-config-override.

* Re-export CephStorageClassTypeRBD / CephStorageClassTypeCephFS from the
  testkit package so suites don't have to import pkg/kubernetes just to
  set cfg.Type.

* docs/FUNCTIONS_GLOSSARY.md: documents the new CephFilesystem helpers,
  the CephFS branch of EnsureCephStorageClass, and the
  TeardownCephStorageClass + SkipClusterTeardown semantics.

Signed-off-by: Aleksandr Zimin
---
 docs/FUNCTIONS_GLOSSARY.md       |  15 +-
 pkg/kubernetes/cephfilesystem.go | 270 +++++++++++++++++++++++++++
 pkg/testkit/ceph.go              | 176 ++++++++++++++++----
 3 files changed, 428 insertions(+), 33 deletions(-)
 create mode 100644 pkg/kubernetes/cephfilesystem.go

diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md
index 65fd74a..c24629a 100644
--- a/docs/FUNCTIONS_GLOSSARY.md
+++ b/docs/FUNCTIONS_GLOSSARY.md
@@ -28,6 +28,7 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - [Ceph Credentials](#ceph-credentials)
 - [CephCluster (Rook)](#cephcluster-rook)
 - [CephBlockPool (Rook)](#cephblockpool-rook)
+- [CephFilesystem (Rook)](#cephfilesystem-rook)
 - [CephClusterConnection / CephClusterAuthentication (csi-ceph)](#cephclusterconnection--cephclusterauthentication-csi-ceph)
 - [CephStorageClass (csi-ceph)](#cephstorageclass-csi-ceph)
 - [Default StorageClass (Testkit)](#default-storageclass-testkit)
@@ -259,6 +260,15 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`.
 - `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Idempotent delete.
 
+## CephFilesystem (Rook)
+
+`pkg/kubernetes/cephfilesystem.go`
+
+- `CreateCephFilesystem(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephFilesystem` from `CephFilesystemConfig` (one replicated metadata pool + one replicated data pool, configurable `failureDomain`, `MetadataServerActiveCount`, optional `RequireSafeReplicaSize`). Idempotent.
+- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase.
+- `DeleteCephFilesystem(ctx, kubeconfig, namespace, name)` — Idempotent delete.
+- `CephFSDataPoolFullName(fsName, dataPoolName)` — Returns the full Ceph pool name (`<fsName>-<dataPoolName>`) that should be passed to `CephStorageClass.spec.cephFS.pool`.
+
 ## CephClusterConnection / CephClusterAuthentication (csi-ceph)
 
 `pkg/kubernetes/cephclusterconnection.go`
@@ -288,8 +298,9 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 `pkg/testkit/ceph.go`
 
-- `EnsureCephStorageClass(ctx, kubeconfig, cfg)` — High-level end-to-end helper that turns an empty test cluster into one with a working csi-ceph `StorageClass`. Steps: (1) enable `sds-node-configurator`, `sds-elastic`, `csi-ceph` modules and wait Ready; (2) optionally call `EnsureDefaultStorageClass` to auto-provision a sds-local-volume SC for OSDs when `OSDStorageClass` is empty; (3) seed `rook-config-override` with `GlobalCephConfigOverrides` (e.g. `ms_crc_data=false`); (4) create Rook `CephCluster` and wait Created; (5) create `CephBlockPool` and wait Ready; (6) read fsid/monitors/admin-key from Rook-managed secrets; (7) wire csi-ceph by creating `CephClusterAuthentication` + `CephClusterConnection`; (8) create `CephStorageClass` and wait for the backing core StorageClass. Idempotent; returns the resulting StorageClass name.
-- `EnsureDefaultCephStorageClass(ctx, kubeconfig, cfg)` — `EnsureCephStorageClass` + `SetGlobalDefaultStorageClass` so new PVCs without an explicit `storageClassName` use the provisioned Ceph RBD class.
+- `EnsureCephStorageClass(ctx, kubeconfig, cfg)` — High-level end-to-end helper that turns an empty test cluster into one with a working csi-ceph `StorageClass`. Steps: (1) enable `sds-node-configurator`, `sds-elastic`, `csi-ceph` modules and wait Ready; (2) optionally call `EnsureDefaultStorageClass` to auto-provision a sds-local-volume SC for OSDs when `OSDStorageClass` is empty; (3) seed `rook-config-override` with `GlobalCephConfigOverrides` (e.g. `ms_crc_data=false`); (4) create Rook `CephCluster` and wait Created; (5) create the backing pool primitive — `CephBlockPool` (when `Type == "RBD"`, default) or `CephFilesystem` (when `Type == "CephFS"`) — and wait Ready; (6) read fsid/monitors/admin-key from Rook-managed secrets; (7) wire csi-ceph by creating `CephClusterAuthentication` + `CephClusterConnection`; (8) create the matching `CephStorageClass` (RBD pool or `<fsName>-<dataPoolName>` for CephFS) and wait for the backing core StorageClass. Idempotent; returns the resulting StorageClass name.
+- `EnsureDefaultCephStorageClass(ctx, kubeconfig, cfg)` — `EnsureCephStorageClass` + `SetGlobalDefaultStorageClass` so new PVCs without an explicit `storageClassName` use the provisioned Ceph (RBD or CephFS) class.
+- `TeardownCephStorageClass(ctx, kubeconfig, cfg)` — Reverse of `EnsureCephStorageClass`. Deletes the `CephStorageClass`, `CephClusterConnection`, `CephClusterAuthentication`, and the `CephBlockPool` / `CephFilesystem` matching `cfg.Type`. Also removes the `CephCluster` and `rook-config-override` ConfigMap unless `SkipClusterTeardown` is set (use that flag when several StorageClasses share one `CephCluster` and only the last teardown should drop the cluster). NotFound is treated as success; the first error is returned but later deletions are still attempted.
 
 ## Ceph Cluster (Testkit) — no csi-ceph wiring
 
diff --git a/pkg/kubernetes/cephfilesystem.go b/pkg/kubernetes/cephfilesystem.go
new file mode 100644
index 0000000..e4d3c4a
--- /dev/null
+++ b/pkg/kubernetes/cephfilesystem.go
@@ -0,0 +1,270 @@
+/*
+Copyright 2025 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/rest"
+
+	"github.com/deckhouse/storage-e2e/internal/logger"
+)
+
+// CephFilesystemGVR is the GroupVersionResource of Rook's CephFilesystem.
+var CephFilesystemGVR = schema.GroupVersionResource{
+	Group:    "ceph.rook.io",
+	Version:  "v1",
+	Resource: "cephfilesystems",
+}
+
+// CephFilesystemConfig describes a minimal Rook CephFilesystem with one
+// metadata pool and exactly one data pool. Defaults are tuned for tiny
+// single-node test clusters and mirror CephBlockPoolConfig conventions.
+type CephFilesystemConfig struct {
+	// Name of the CephFilesystem CR.
+	Name string
+
+	// Namespace the Rook operator watches (typically "d8-sds-elastic").
+	Namespace string
+
+	// FailureDomain is the CRUSH failure domain: "host" or "osd"
+	// (default: "osd" when MetadataPoolReplicas == DataPoolReplicas == 1,
+	// "host" otherwise).
+	FailureDomain string
+
+	// MetadataPoolReplicas is the metadata pool replication factor. Default: 1.
+	MetadataPoolReplicas int
+
+	// DataPoolName is the (Rook-side) data pool name. The full Ceph pool
+	// name is "<Name>-<DataPoolName>" — see CephFSDataPoolFullName.
+	// Default: "data0".
+	DataPoolName string
+
+	// DataPoolReplicas is the data pool replication factor. Default: 1.
+	DataPoolReplicas int
+
+	// MetadataServerActiveCount is the number of active MDS daemons.
+	// Default: 1.
+	MetadataServerActiveCount int
+
+	// RequireSafeReplicaSize toggles Ceph's safeguard against single-replica
+	// pools. When nil, it is set to false for replicas==1 (unsafe single
+	// replica, accepted for e2e test clusters) and left unset otherwise.
+	RequireSafeReplicaSize *bool
+}
+
+// CephFSDataPoolFullName returns the full Ceph pool name that ends up
+// referenced from CephStorageClass.spec.cephFS.pool. Rook composes the
+// per-filesystem pool name as "<fsName>-<dataPoolName>".
+func CephFSDataPoolFullName(fsName, dataPoolName string) string {
+	return fmt.Sprintf("%s-%s", fsName, dataPoolName)
+}
+
+// CreateCephFilesystem creates (or updates, if already present) a
+// CephFilesystem in the given namespace from the provided configuration. It
+// is idempotent and safe to call on every test run.
+func CreateCephFilesystem(ctx context.Context, kubeconfig *rest.Config, cfg CephFilesystemConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephFilesystem name is required") + } + if cfg.Namespace == "" { + return fmt.Errorf("CephFilesystem namespace is required") + } + if cfg.MetadataPoolReplicas <= 0 { + cfg.MetadataPoolReplicas = 1 + } + if cfg.DataPoolReplicas <= 0 { + cfg.DataPoolReplicas = 1 + } + if cfg.DataPoolName == "" { + cfg.DataPoolName = "data0" + } + if cfg.MetadataServerActiveCount <= 0 { + cfg.MetadataServerActiveCount = 1 + } + if cfg.FailureDomain == "" { + if cfg.MetadataPoolReplicas == 1 && cfg.DataPoolReplicas == 1 { + cfg.FailureDomain = "osd" + } else { + cfg.FailureDomain = "host" + } + } + + requireSafe := cfg.RequireSafeReplicaSize + if requireSafe == nil && (cfg.MetadataPoolReplicas == 1 || cfg.DataPoolReplicas == 1) { + f := false + requireSafe = &f + } + + metadataReplicated := map[string]interface{}{ + "size": int64(cfg.MetadataPoolReplicas), + } + dataReplicated := map[string]interface{}{ + "size": int64(cfg.DataPoolReplicas), + } + if requireSafe != nil { + metadataReplicated["requireSafeReplicaSize"] = *requireSafe + dataReplicated["requireSafeReplicaSize"] = *requireSafe + } + + spec := map[string]interface{}{ + "metadataPool": map[string]interface{}{ + "failureDomain": cfg.FailureDomain, + "replicated": metadataReplicated, + }, + "dataPools": []interface{}{ + map[string]interface{}{ + "name": cfg.DataPoolName, + "failureDomain": cfg.FailureDomain, + "replicated": dataReplicated, + }, + }, + "preserveFilesystemOnDelete": false, + "metadataServer": map[string]interface{}{ + "activeCount": int64(cfg.MetadataServerActiveCount), + "activeStandby": false, + }, + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephFilesystem", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephFilesystem %s/%s", cfg.Namespace, cfg.Name) + _, err = dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephFilesystem %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephFilesystem %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// WaitForCephFilesystemReady blocks until the CephFilesystem reports +// `status.phase == "Ready"`. As a fallback (some Rook revisions populate +// `status.conditions` first) the function also accepts a Ready=True +// condition. 
+func WaitForCephFilesystemReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if namespace == "" || name == "" { + return fmt.Errorf("namespace and name are required") + } + + logger.Debug("Waiting for CephFilesystem %s/%s to become Ready (timeout: %v)", namespace, name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + if phase == "Ready" { + logger.Success("CephFilesystem %s/%s is Ready (status.phase)", namespace, name) + return nil + } + if cephFilesystemReadyByCondition(obj.Object) { + logger.Success("CephFilesystem %s/%s is Ready (status.conditions[Ready]=True)", namespace, name) + return nil + } + logger.Debug("CephFilesystem %s/%s phase: %q, waiting...", namespace, name, phase) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephFilesystem %s/%s: %v", namespace, name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephFilesystem %s/%s: %w", namespace, name, ctx.Err()) + case <-ticker.C: + } + } +} + +func cephFilesystemReadyByCondition(obj map[string]interface{}) bool { + conditions, found, err := unstructured.NestedSlice(obj, "status", "conditions") + if err != nil || !found { + return false + } + for _, raw := range conditions { + cond, ok := raw.(map[string]interface{}) + if !ok { + continue + } + ctype, _, _ := unstructured.NestedString(cond, "type") + cstatus, _, _ := unstructured.NestedString(cond, "status") + if ctype == "Ready" && cstatus == "True" { + return true + } + } + return false +} + +// DeleteCephFilesystem deletes a CephFilesystem. Safe to call if the +// filesystem does not exist. +func DeleteCephFilesystem(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephFilesystemGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephFilesystem %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephFilesystem %s/%s", namespace, name) + return nil +} diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go index 29c09ad..9c6a08a 100644 --- a/pkg/testkit/ceph.go +++ b/pkg/testkit/ceph.go @@ -28,6 +28,13 @@ import ( "github.com/deckhouse/storage-e2e/pkg/kubernetes" ) +// Re-exports of the supported CephStorageClass types so callers don't have +// to import the lower-level pkg/kubernetes package just to set cfg.Type. +const ( + CephStorageClassTypeRBD = kubernetes.CephStorageClassTypeRBD + CephStorageClassTypeCephFS = kubernetes.CephStorageClassTypeCephFS +) + // CephStorageClassConfig controls the end-to-end provisioning of a // Rook-managed Ceph cluster plus a csi-ceph-backed k8s StorageClass: // @@ -132,6 +139,33 @@ type CephStorageClassConfig struct { // Default: "osd" when ReplicaSize==1, "host" otherwise. 
FailureDomain string + // --- Pool kind --- + + // Type selects the backing Ceph primitive: "RBD" (default) provisions a + // CephBlockPool; "CephFS" provisions a CephFilesystem. The resulting + // csi-ceph CephStorageClass CR mirrors this choice via spec.type. + Type string + + // --- CephFilesystem (used only when Type == "CephFS") --- + + // CephFSName is the Rook CephFilesystem name. Default: "ceph-fs". + CephFSName string + + // CephFSDataPoolName is the per-filesystem data pool name (Rook-side, + // not the full Ceph pool name). Default: "data0". + CephFSDataPoolName string + + // CephFSMetadataReplicas is the metadata pool replication factor. + // Default: ReplicaSize. + CephFSMetadataReplicas int + + // CephFSDataReplicas is the data pool replication factor. + // Default: ReplicaSize. + CephFSDataReplicas int + + // CephFSActiveMDSCount is the number of active MDS daemons. Default: 1. + CephFSActiveMDSCount int + // --- csi-ceph wiring --- // ClusterConnectionName and ClusterAuthenticationName point at the @@ -149,6 +183,15 @@ type CephStorageClassConfig struct { // caller has already configured ModuleConfig on the cluster). SkipModuleEnablement bool + // SkipClusterTeardown leaves the underlying Rook CephCluster and the + // rook-config-override ConfigMap in place during TeardownCephStorageClass. + // Use it when several StorageClasses share a single CephCluster — the + // "owning" call should leave the flag false and tear the cluster down + // last, while every other teardown sets it to true and only removes its + // SC-specific resources (CephStorageClass / connection / auth / pool / + // filesystem). + SkipClusterTeardown bool + // SdsElasticSettings overrides `spec.settings` of the sds-elastic // ModuleConfig. Defaults to the minimal set that makes sense on a // single-node test cluster. 
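
A minimal usage sketch of the SkipClusterTeardown contract described above. Nothing below is part of the patch: the StorageClass names are invented and error handling is trimmed, but the functions, fields, and re-exported constants are the ones this commit adds.

```go
package csiceph_test // hypothetical suite package

import (
	"context"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/testkit"
)

// shareOneCephCluster provisions an RBD and a CephFS StorageClass on the same
// CephCluster, then tears them down so that only the last call removes the
// cluster and rook-config-override.
func shareOneCephCluster(ctx context.Context, kubeconfig *rest.Config) error {
	rbd := testkit.CephStorageClassConfig{
		StorageClassName: "e2e-ceph-rbd", // invented name
		Type:             testkit.CephStorageClassTypeRBD,
	}
	fs := testkit.CephStorageClassConfig{
		StorageClassName: "e2e-ceph-fs", // invented name
		Type:             testkit.CephStorageClassTypeCephFS,
	}

	// EnsureCephStorageClass is idempotent: the second call reuses the
	// CephCluster created by the first and only adds the CephFilesystem.
	if _, err := testkit.EnsureCephStorageClass(ctx, kubeconfig, rbd); err != nil {
		return err
	}
	if _, err := testkit.EnsureCephStorageClass(ctx, kubeconfig, fs); err != nil {
		return err
	}

	// Non-owning teardown: keep the shared CephCluster alive.
	fs.SkipClusterTeardown = true
	if err := testkit.TeardownCephStorageClass(ctx, kubeconfig, fs); err != nil {
		return err
	}
	// Owning teardown: also drops the CephCluster and rook-config-override.
	return testkit.TeardownCephStorageClass(ctx, kubeconfig, rbd)
}
```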
@@ -163,12 +206,13 @@ type CephStorageClassConfig struct { // --- Timeouts --- - ModulesReadyTimeout time.Duration // default 15m - CephClusterReadyTimeout time.Duration // default 20m - CephPoolReadyTimeout time.Duration // default 10m - CredentialsTimeout time.Duration // default 10m - CSICephPhaseTimeout time.Duration // default 5m - StorageClassWaitTimeout time.Duration // default 2m + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m + CephFilesystemReadyTimeout time.Duration // default 10m + CredentialsTimeout time.Duration // default 10m + CSICephPhaseTimeout time.Duration // default 5m + StorageClassWaitTimeout time.Duration // default 2m } func (c *CephStorageClassConfig) applyDefaults() { @@ -221,6 +265,24 @@ func (c *CephStorageClassConfig) applyDefaults() { if c.RBDDefaultFSType == "" { c.RBDDefaultFSType = "ext4" } + if c.Type == "" { + c.Type = kubernetes.CephStorageClassTypeRBD + } + if c.CephFSName == "" { + c.CephFSName = "ceph-fs" + } + if c.CephFSDataPoolName == "" { + c.CephFSDataPoolName = "data0" + } + if c.CephFSMetadataReplicas <= 0 { + c.CephFSMetadataReplicas = c.ReplicaSize + } + if c.CephFSDataReplicas <= 0 { + c.CephFSDataReplicas = c.ReplicaSize + } + if c.CephFSActiveMDSCount <= 0 { + c.CephFSActiveMDSCount = 1 + } if c.ModulesReadyTimeout == 0 { c.ModulesReadyTimeout = 15 * time.Minute } @@ -230,6 +292,9 @@ func (c *CephStorageClassConfig) applyDefaults() { if c.CephPoolReadyTimeout == 0 { c.CephPoolReadyTimeout = 10 * time.Minute } + if c.CephFilesystemReadyTimeout == 0 { + c.CephFilesystemReadyTimeout = 10 * time.Minute + } if c.CredentialsTimeout == 0 { c.CredentialsTimeout = 10 * time.Minute } @@ -297,20 +362,45 @@ func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce } logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) - logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", - cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) - if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ - Name: cfg.PoolName, - Namespace: cfg.Namespace, - FailureDomain: cfg.FailureDomain, - ReplicaSize: cfg.ReplicaSize, - }); err != nil { - return "", fmt.Errorf("create CephBlockPool: %w", err) - } - if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { - return "", fmt.Errorf("wait CephBlockPool: %w", err) + switch cfg.Type { + case kubernetes.CephStorageClassTypeRBD: + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + case kubernetes.CephStorageClassTypeCephFS: + logger.Step(5, "Creating CephFilesystem %s/%s (metadata replica=%d, data pool %q replica=%d, failureDomain=%s, activeMDS=%d)", + 
cfg.Namespace, cfg.CephFSName, + cfg.CephFSMetadataReplicas, cfg.CephFSDataPoolName, cfg.CephFSDataReplicas, + cfg.FailureDomain, cfg.CephFSActiveMDSCount) + if err := kubernetes.CreateCephFilesystem(ctx, kubeconfig, kubernetes.CephFilesystemConfig{ + Name: cfg.CephFSName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + MetadataPoolReplicas: cfg.CephFSMetadataReplicas, + DataPoolName: cfg.CephFSDataPoolName, + DataPoolReplicas: cfg.CephFSDataReplicas, + MetadataServerActiveCount: cfg.CephFSActiveMDSCount, + }); err != nil { + return "", fmt.Errorf("create CephFilesystem: %w", err) + } + if err := kubernetes.WaitForCephFilesystemReady(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName, cfg.CephFilesystemReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephFilesystem: %w", err) + } + logger.StepComplete(5, "CephFilesystem %s/%s is Ready", cfg.Namespace, cfg.CephFSName) + default: + return "", fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) } - logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) logger.Step(6, "Extracting Rook-managed Ceph credentials (fsid, monitors, admin key)") creds, err := kubernetes.WaitForCephCredentials(ctx, kubeconfig, cfg.Namespace, cfg.CredentialsTimeout) @@ -343,15 +433,24 @@ func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce } logger.StepComplete(7, "csi-ceph wired against Ceph cluster %s", creds.FSID) - logger.Step(8, "Creating CephStorageClass %q → StorageClass", cfg.StorageClassName) - if err := kubernetes.CreateCephStorageClass(ctx, kubeconfig, kubernetes.CephStorageClassConfig{ + logger.Step(8, "Creating CephStorageClass %q (type=%s) → StorageClass", cfg.StorageClassName, cfg.Type) + cscCfg := kubernetes.CephStorageClassConfig{ Name: cfg.StorageClassName, ClusterConnectionName: cfg.ClusterConnectionName, ClusterAuthenticationName: cfg.ClusterAuthenticationName, - Type: kubernetes.CephStorageClassTypeRBD, - RBDPool: cfg.PoolName, - RBDDefaultFSType: cfg.RBDDefaultFSType, - }); err != nil { + Type: cfg.Type, + } + switch cfg.Type { + case kubernetes.CephStorageClassTypeRBD: + cscCfg.RBDPool = cfg.PoolName + cscCfg.RBDDefaultFSType = cfg.RBDDefaultFSType + case kubernetes.CephStorageClassTypeCephFS: + cscCfg.CephFSName = cfg.CephFSName + cscCfg.CephFSPool = kubernetes.CephFSDataPoolFullName(cfg.CephFSName, cfg.CephFSDataPoolName) + default: + return "", fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + if err := kubernetes.CreateCephStorageClass(ctx, kubeconfig, cscCfg); err != nil { return "", fmt.Errorf("create CephStorageClass: %w", err) } if err := kubernetes.WaitForCephStorageClassCreated(ctx, kubeconfig, cfg.StorageClassName, cfg.CSICephPhaseTimeout); err != nil { @@ -362,8 +461,14 @@ func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce } logger.StepComplete(8, "StorageClass %s is available", cfg.StorageClassName) - logger.Success("Ceph e2e stack ready: CephCluster %s/%s + pool %s → StorageClass %s", - cfg.Namespace, cfg.CephClusterName, cfg.PoolName, cfg.StorageClassName) + switch cfg.Type { + case kubernetes.CephStorageClassTypeCephFS: + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + filesystem %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.CephFSName, cfg.StorageClassName) + default: + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + pool %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName, cfg.StorageClassName) + } return cfg.StorageClassName, nil } 
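
For reference, the CephFS branch of step 8 boils down to a `kubernetes.CephStorageClassConfig` like the one sketched below. Only `Type`, `CephFSName`, and the `CephFSDataPoolFullName` call reflect the patch; the resource names are invented for illustration.

```go
// cephFSStorageClassConfig sketches what step 8 assembles for Type == "CephFS".
func cephFSStorageClassConfig() kubernetes.CephStorageClassConfig {
	return kubernetes.CephStorageClassConfig{
		Name:                      "e2e-ceph-fs",             // invented
		ClusterConnectionName:     "e2e-ceph-connection",     // invented
		ClusterAuthenticationName: "e2e-ceph-authentication", // invented
		Type:                      kubernetes.CephStorageClassTypeCephFS,
		CephFSName:                "ceph-fs",
		// Rook's Ceph-side data pool is "<fsName>-<dataPoolName>", so the
		// defaults ("ceph-fs", "data0") resolve to "ceph-fs-data0".
		CephFSPool: kubernetes.CephFSDataPoolFullName("ceph-fs", "data0"),
	}
}
```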
@@ -389,13 +494,22 @@ func TeardownCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg } } - logger.Info("Tearing down csi-ceph StorageClass %q", cfg.StorageClassName) + logger.Info("Tearing down csi-ceph StorageClass %q (type=%s)", cfg.StorageClassName, cfg.Type) note(kubernetes.DeleteCephStorageClass(ctx, kubeconfig, cfg.StorageClassName), "delete CephStorageClass") note(kubernetes.DeleteCephClusterConnection(ctx, kubeconfig, cfg.ClusterConnectionName), "delete CephClusterConnection") note(kubernetes.DeleteCephClusterAuthentication(ctx, kubeconfig, cfg.ClusterAuthenticationName), "delete CephClusterAuthentication") - note(kubernetes.DeleteCephBlockPool(ctx, kubeconfig, cfg.Namespace, cfg.PoolName), "delete CephBlockPool") - note(kubernetes.DeleteCephCluster(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName), "delete CephCluster") - note(kubernetes.DeleteRookConfigOverride(ctx, kubeconfig, cfg.Namespace), "delete rook-config-override") + switch cfg.Type { + case kubernetes.CephStorageClassTypeCephFS: + note(kubernetes.DeleteCephFilesystem(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName), "delete CephFilesystem") + default: + note(kubernetes.DeleteCephBlockPool(ctx, kubeconfig, cfg.Namespace, cfg.PoolName), "delete CephBlockPool") + } + if !cfg.SkipClusterTeardown { + note(kubernetes.DeleteCephCluster(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName), "delete CephCluster") + note(kubernetes.DeleteRookConfigOverride(ctx, kubeconfig, cfg.Namespace), "delete rook-config-override") + } else { + logger.Info("Skipping CephCluster + rook-config-override teardown (SkipClusterTeardown=true)") + } return firstErr } From e3d4e8d6368a69cad732922d7e588e182ca871e3 Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Tue, 5 May 2026 11:49:31 +0300 Subject: [PATCH 06/14] Make e2e resilient to network drops and add modulePullOverride env templating This bundles four related fixes that surfaced during csi-ceph e2e diagnosis, all aimed at the same failure mode: a flapping Wi-Fi or unreliable bootstrap network silently breaking a 50-minute test run. 1. modulePullOverride env templating - internal/config/overrides.go (+_test.go): ExpandEnvInModulePullOverride resolves ${VAR} placeholders in cluster_config.yml at config load time. CI sets one MODULE_IMAGE_TAG (e.g. "pr131" / "mr131") and points multiple modules at it without per-run YAML edits. Missing env fails fast with an explicit message so the wrong-image-pull confusion is gone. - Hooks in internal/cluster/cluster.go::LoadClusterConfig and pkg/cluster/cluster.go::loadClusterConfigFromPath after yaml.Unmarshal. - README.md documents the new ${VAR} form. 2. Bootstrap robustness on developer laptops - pkg/cluster/setup.go: pass FORCE_NO_PRIVATE_KEYS=true and USE_AGENT_WITH_NO_PRIVATE_KEYS=true into the dhctl install:main container so lib-connection stops trying to open /root/.ssh/id_rsa and authenticates only via the mounted ssh-agent socket. Fixes "extract config: Failed to read private keys from flags" with a passphrase-protected key. - pkg/cluster/vms.go: cloud-init now pins apt at mirror.yandex.ru and forces IPv4 so package_update + Docker install stop stalling on egress paths where archive.ubuntu.com is partially unreachable. - internal/config/env.go: extracted ApplyDefaults() out of ValidateEnvironment so suites that skip validation still get defaults for SSH_VM_USER / SSH_PRIVATE_KEY / etc. - pkg/cluster/cluster.go::CreateTestCluster now calls ApplyDefaults() and falls back to YAMLConfigFilenameDefaultValue on empty arg. 
- internal/cluster/cluster.go::GetKubeconfig falls back to clientcmd
  default loading rules (KUBECONFIG / ~/.kube/config, minified to the
  current context) when SSH retrieval fails and KUBE_CONFIG_PATH is unset.

3. SSH tunnel auto-reconnect

- internal/infrastructure/ssh/client.go: both (*client).StartTunnel and
  (*jumpHostClient).StartTunnel now share runTunnelLoop driven by a
  tunnelDialer struct. When the underlying SSH session dies, dial fails
  with EOF; the loop emits a WARN, calls the existing reconnect() (which
  already has retry + exponential backoff), and retries the dial once with
  the rebuilt session. Without this a Wi-Fi flap killed the tunnel and
  every client-go GET silently returned EOF until the parent readiness
  timeout fired.

4. Per-call deadline + visible WARN in Ceph readiness pollers

- pkg/kubernetes/poll.go (new): pollResourceUntilReady centralizes our
  Wait*Ready loops. Each Get is bounded by PollGetTimeout (30s) so a hung
  TCP connect surfaces in seconds, and consecutive Get failures escalate
  to WARN once they cross 3 so the user sees the cluster connection is
  dying instead of waiting for the readyTimeout.
- pkg/kubernetes/{cephcluster,cephblockpool,cephfilesystem}.go:
  WaitForCephClusterReady / WaitForCephBlockPoolReady /
  WaitForCephFilesystemReady migrated. Public signatures unchanged.

Docs:
- docs/WORKLOG.md: 2026-05-05 entries.
- docs/FUNCTIONS_GLOSSARY.md: updated descriptions for the three
  Wait*Ready helpers.
- docs/ARCHITECTURE.md: poll.go and cephfilesystem.go added to the package
  tree (Sections 1.1 and 3.6); overrides.go in Section 3.1.

Signed-off-by: Aleksandr Zimin
---
 README.md                             |  19 ++
 docs/ARCHITECTURE.md                  |   5 +
 docs/FUNCTIONS_GLOSSARY.md            |   6 +-
 docs/WORKLOG.md                       |  16 ++
 internal/cluster/cluster.go           | 103 ++++++++--
 internal/config/env.go                |  16 +-
 internal/config/overrides.go          |  68 ++++++
 internal/config/overrides_test.go     | 149 ++++++++++++++
 internal/infrastructure/ssh/client.go | 285 +++++++++++++-------------
 pkg/cluster/cluster.go                |  24 +++
 pkg/cluster/setup.go                  |  27 ++-
 pkg/cluster/vms.go                    |  41 +++-
 pkg/kubernetes/cephblockpool.go       |  46 ++---
 pkg/kubernetes/cephcluster.go         |  47 ++---
 pkg/kubernetes/cephfilesystem.go      |  48 ++---
 pkg/kubernetes/poll.go                | 160 +++++++++++++++
 16 files changed, 779 insertions(+), 281 deletions(-)
 create mode 100644 internal/config/overrides.go
 create mode 100644 internal/config/overrides_test.go
 create mode 100644 pkg/kubernetes/poll.go

diff --git a/README.md b/README.md
index ec6384f..99f8aab 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,25 @@ Testkit-specific env variables:
 - `CSI_CEPH_MODULE_PULL_OVERRIDE` — image tag for `csi-ceph`'s ModulePullOverride (dev registries only, e.g. when testing a PR build).
 
+#### `modulePullOverride` env templating
+
+Any module entry in `cluster_config.yml` may reference an env var with the
+`${VAR}` form in `modulePullOverride`. `storage-e2e` resolves those at config
+load time, so CI can point a module at a per-PR/MR image without editing the
+YAML between runs:
+
+```yaml
+dkpParameters:
+  modules:
+    - name: csi-ceph
+      modulePullOverride: "${MODULE_IMAGE_TAG}" # CI must set MODULE_IMAGE_TAG, e.g. "pr131" on GitHub or "mr131" on GitLab
+```
+
+If a referenced env var is unset, `LoadClusterConfig` fails fast with
+`module "<name>" references env var ${VAR} in modulePullOverride but it is not set`
+instead of silently falling back to `main` — so a missing variable in CI is
+caught before bootstrap, not after a 30-minute wrong-image install.
+ Run: ```bash diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 847be88..5030d65 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -77,6 +77,7 @@ storage-e2e/ │ │ ├── blockdevice.go # BlockDevice operations │ │ ├── cephblockpool.go # Rook CephBlockPool operations │ │ ├── cephcluster.go # Rook CephCluster operations +│ │ ├── cephfilesystem.go # Rook CephFilesystem operations │ │ ├── cephclusterconnection.go # csi-ceph connection/auth CRs │ │ ├── cephcredentials.go # Rook Ceph credential discovery │ │ ├── cephstorageclass.go # csi-ceph CephStorageClass CR @@ -88,6 +89,7 @@ storage-e2e/ │ │ ├── nodegroup.go # NodeGroup operations │ │ ├── nodes.go # Node listing, taints, labels │ │ ├── pod.go # Pod operations +│ │ ├── poll.go # Generic readiness poller (per-call timeout, WARN on net errors) │ │ ├── pvc.go # PVC operations │ │ ├── rookconfigoverride.go # Rook ceph.conf override ConfigMap │ │ ├── secrets.go # Secret operations @@ -337,6 +339,7 @@ Tests use Ginkgo's lifecycle hooks: config/ ├── config.go # Main configuration operations ├── env.go # Environment variable definitions and validation +├── overrides.go # ${VAR} expansion in modulePullOverride at config load time ├── types.go # Configuration type definitions └── images.go # OS image URL definitions ``` @@ -499,6 +502,7 @@ pkg/ │ ├── blockdevice.go # BlockDevice operations │ ├── cephblockpool.go # Rook CephBlockPool CRUD + wait │ ├── cephcluster.go # Rook CephCluster CRUD + wait +│ ├── cephfilesystem.go # Rook CephFilesystem CRUD + wait │ ├── cephclusterconnection.go # csi-ceph CephClusterConnection/Auth CRs │ ├── cephcredentials.go # Read fsid/mons/admin-key from Rook secrets │ ├── cephstorageclass.go # csi-ceph CephStorageClass CR @@ -510,6 +514,7 @@ pkg/ │ ├── nodegroup.go # NodeGroup operations │ ├── nodes.go # Node listing, taints, labels │ ├── pod.go # Pod operations +│ ├── poll.go # pollResourceUntilReady helper for Wait*Ready callers │ ├── pvc.go # PVC operations │ ├── rookconfigoverride.go # Rook global ceph.conf override │ ├── secrets.go # Secret operations diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md index c24629a..73ab6a8 100644 --- a/docs/FUNCTIONS_GLOSSARY.md +++ b/docs/FUNCTIONS_GLOSSARY.md @@ -249,7 +249,7 @@ All exported functions available in the `pkg/` directory, grouped by resource. `pkg/kubernetes/cephcluster.go` - `CreateCephCluster(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephCluster` CR using `CephClusterConfig` (image, mon/mgr counts, network provider, OSD storage class / count / size, data-dir host path, etc.). Idempotent. -- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. +- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. - `DeleteCephCluster(ctx, kubeconfig, namespace, name)` — Deletes the CR; NotFound is treated as success. Does NOT garbage-collect OSD data on host disks. ## CephBlockPool (Rook) @@ -257,7 +257,7 @@ All exported functions available in the `pkg/` directory, grouped by resource. 
 `pkg/kubernetes/cephblockpool.go`
 
 - `CreateCephBlockPool(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephBlockPool` from `CephBlockPoolConfig` (replicated with optional `requireSafeReplicaSize` override, or erasure-coded with `dataChunks`/`codingChunks`; `failureDomain`).
-- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`.
+- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout.
 - `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Idempotent delete.
 
 ## CephFilesystem (Rook)
@@ -265,7 +265,7 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 `pkg/kubernetes/cephfilesystem.go`
 
 - `CreateCephFilesystem(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephFilesystem` from `CephFilesystemConfig` (one replicated metadata pool + one replicated data pool, configurable `failureDomain`, `MetadataServerActiveCount`, optional `RequireSafeReplicaSize`). Idempotent.
-- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase.
+- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout.
 - `DeleteCephFilesystem(ctx, kubeconfig, namespace, name)` — Idempotent delete.
 - `CephFSDataPoolFullName(fsName, dataPoolName)` — Returns the full Ceph pool name (`<fsName>-<dataPoolName>`) that should be passed to `CephStorageClass.spec.cephFS.pool`.
 
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 5ff70b2..6d7d508 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -25,3 +25,19 @@ All notable changes to this repository are documented here. New entries are appe
 - **Add** `.cursor/rules/todo-command.mdc`: `/todo` command for managing `docs/TODO.md`
 - **Add** `.cursor/rules/backward-compatibility.mdc`: rule to guard backward compatibility of exported `pkg/` API — ask before breaking changes, mark worklog with `[Possible compatibility break]`
 - **Add** `.cursor/rules/versatile-functions.mdc`: rule to ensure new functions are general-purpose and reusable — return data not decisions, no hardcoded names, compose from existing functions, no empty wrappers
+
+## 2026-05-05
+
+- **Add** `internal/config/overrides.go` + `_test.go`: `ExpandEnvInModulePullOverride` resolves `${VAR}` placeholders in `modulePullOverride` at config load time; missing env fails fast with an explicit error so CI can point modules at `pr<N>` / `mr<N>` images via a single env var (`MODULE_IMAGE_TAG`) without editing `cluster_config.yml`.
+- **Update** `internal/cluster/cluster.go::LoadClusterConfig` and `pkg/cluster/cluster.go::loadClusterConfigFromPath`: hook `ExpandEnvInModulePullOverride` right after `yaml.Unmarshal`.
+- **Update** `README.md`: documented `${VAR}` form in `modulePullOverride` and the fail-fast behavior on unset env vars.
+- **Refactor** `internal/config/env.go`: extracted `ApplyDefaults()` out of `ValidateEnvironment` so suites that don't call validation still get defaults for `SSH_VM_USER` / `SSH_PRIVATE_KEY` / `SSH_PUBLIC_KEY` / `TEST_CLUSTER_NAMESPACE` / `YAML_CONFIG_FILENAME` / `TEST_CLUSTER_CLEANUP`. +- **Update** `pkg/cluster/cluster.go::CreateTestCluster`: call `config.ApplyDefaults()` defensively + fall back to `config.YAMLConfigFilenameDefaultValue` when the filename arg is empty. +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: added a third-tier fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) + `MinifyConfig` when SSH retrieval fails and `KUBE_CONFIG_PATH` is unset, so a developer whose local `kubectl` already targets the base cluster doesn't have to set anything. +- **Bugfix** `pkg/cluster/setup.go::executeDhctlBootstrap`: pass `FORCE_NO_PRIVATE_KEYS=true` and `USE_AGENT_WITH_NO_PRIVATE_KEYS=true` env vars into the `dhctl bootstrap` container so `lib-connection` stops opening `/root/.ssh/id_rsa` and authenticates exclusively via the mounted ssh-agent socket — fixes "Failed to read private keys from flags" on passphrase-protected keys. +- **Bugfix** `pkg/cluster/vms.go::generateCloudInitUserData`: pin apt to `mirror.yandex.ru` and force IPv4 (`Acquire::ForceIPv4=true`) in cloud-init, so `package_update` and Docker install stop stalling when `archive.ubuntu.com` IPs are partially unreachable. +- **Refactor** `internal/infrastructure/ssh/client.go::StartTunnel` (both `*client` and `*jumpHostClient`): extracted shared `runTunnelLoop` + `tunnelDialer`. On dial failure that looks like a dropped SSH session, the loop now logs a visible WARN, calls the existing `reconnect()` (retry + exponential backoff), and retries the dial once with the freshly rebuilt session. Fixes the "test hangs 20 minutes silently after Wi-Fi flap" failure mode. +- **Add** `pkg/kubernetes/poll.go`: `pollResourceUntilReady` centralizes the `WaitFor*Ready` loops with a per-call `PollGetTimeout` (30s) on every Get and WARN logging once consecutive Get failures cross 3, so a dropped tunnel surfaces in seconds instead of after the 20-minute readyTimeout. +- **Refactor** `pkg/kubernetes/cephcluster.go`, `pkg/kubernetes/cephblockpool.go`, `pkg/kubernetes/cephfilesystem.go`: `WaitForCephClusterReady` / `WaitForCephBlockPoolReady` / `WaitForCephFilesystemReady` migrated to `pollResourceUntilReady`. Public signatures unchanged. +- **Update** `docs/FUNCTIONS_GLOSSARY.md`: noted that the three `WaitForCeph*Ready` helpers now apply a per-call deadline and emit WARN on consecutive Get failures. +- **Update** `docs/ARCHITECTURE.md`: added `pkg/kubernetes/poll.go` to Section 1.1 and Section 3.6, added `pkg/kubernetes/cephfilesystem.go` (carry-over from the prior commit), added `internal/config/overrides.go` to Section 3.1. 
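
The body of pkg/kubernetes/poll.go is not included in this excerpt, so the sketch below only reconstructs the contract the worklog describes (a 30-second bound on every Get, WARN after three consecutive failures, an overall readyTimeout). The real pollResourceUntilReady may differ in naming, logging, and shape; this version uses only the standard library.

```go
package kubernetes

import (
	"context"
	"fmt"
	"log"
	"time"
)

const pollGetTimeout = 30 * time.Second // mirrors the described PollGetTimeout

// pollUntilReady is NOT the real pollResourceUntilReady; it only sketches the
// described behavior: bound every Get with its own deadline, warn after three
// consecutive failures, and give up at readyTimeout.
func pollUntilReady(ctx context.Context, readyTimeout time.Duration,
	get func(ctx context.Context) (ready bool, err error),
) error {
	ctx, cancel := context.WithTimeout(ctx, readyTimeout)
	defer cancel()

	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()

	failures := 0
	for {
		// Per-call deadline: a hung TCP connect fails in seconds rather
		// than silently eating the whole readyTimeout.
		getCtx, getCancel := context.WithTimeout(ctx, pollGetTimeout)
		ready, err := get(getCtx)
		getCancel()

		switch {
		case err == nil && ready:
			return nil
		case err != nil:
			failures++
			if failures > 3 {
				log.Printf("WARN: %d consecutive Get failures, cluster connection may be down: %v", failures, err)
			}
		default:
			failures = 0 // reachable again, resource just not Ready yet
		}

		select {
		case <-ctx.Done():
			return fmt.Errorf("timed out after %s waiting for resource readiness: %w", readyTimeout, ctx.Err())
		case <-ticker.C:
		}
	}
}
```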
diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 1cd7469..af47393 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -45,9 +45,11 @@ import ( "gopkg.in/yaml.v3" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + clientcmdapi "k8s.io/client-go/tools/clientcmd/api" "github.com/deckhouse/storage-e2e/internal/config" "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" + "github.com/deckhouse/storage-e2e/internal/logger" ) // LoadClusterConfig loads and validates a cluster configuration from a YAML file @@ -73,6 +75,14 @@ func LoadClusterConfig(configFilename string) (*config.ClusterDefinition, error) return nil, fmt.Errorf("failed to parse YAML config: %w", err) } + // Expand ${VAR} placeholders in modulePullOverride fields. CI uses this to + // pass a per-PR/MR image tag via a single env var (e.g. MODULE_IMAGE_TAG) + // without editing the YAML between runs. Missing envs fail fast here so we + // don't silently regress to "main" on accidentally unset variables. + if err := config.ExpandEnvInModulePullOverride(&clusterDef); err != nil { + return nil, fmt.Errorf("expand env in modulePullOverride: %w", err) + } + // Validate the configuration if err := validateClusterConfig(&clusterDef); err != nil { return nil, fmt.Errorf("config validation failed: %w", err) @@ -219,30 +229,40 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien var kubeconfigContent []byte // Read kubeconfig via SSH: prefer super-admin.conf when present (see getKubeconfigRemoteShell). - kubeconfigContentStr, err := sshClient.Exec(ctx, getKubeconfigRemoteShell) - if err != nil { - // SSH retrieval failed (likely due to sudo password requirement) - // Try to use KUBE_CONFIG_PATH if set, otherwise notify user - if config.KubeConfigPath != "" { - // Expand path to handle ~ and resolve symlinks if present - resolvedPath, err := expandPath(config.KubeConfigPath) - if err != nil { - return nil, "", fmt.Errorf("failed to expand KUBE_CONFIG_PATH (%s): %w", config.KubeConfigPath, err) - } - // Read kubeconfig content from the provided file - kubeconfigContent, err = os.ReadFile(resolvedPath) - if err != nil { - return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, err) - } - } else { - // KUBE_CONFIG_PATH not set, notify user and fail - return nil, "", fmt.Errorf("failed to read kubeconfig from master (this may occur if sudo requires a password). "+ - "Please download the kubeconfig file manually and provide its full path via KUBE_CONFIG_PATH environment variable. "+ - "Original error: %w", err) - } - } else { + kubeconfigContentStr, sshErr := sshClient.Exec(ctx, getKubeconfigRemoteShell) + switch { + case sshErr == nil: // SSH succeeded - use the content from SSH kubeconfigContent = []byte(kubeconfigContentStr) + + case config.KubeConfigPath != "": + // SSH retrieval failed (likely due to sudo password requirement) and the + // caller pointed us at a specific kubeconfig file via KUBE_CONFIG_PATH. + resolvedPath, expandErr := expandPath(config.KubeConfigPath) + if expandErr != nil { + return nil, "", fmt.Errorf("failed to expand KUBE_CONFIG_PATH (%s): %w", config.KubeConfigPath, expandErr) + } + readContent, readErr := os.ReadFile(resolvedPath) + if readErr != nil { + return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, readErr) + } + kubeconfigContent = readContent + + default: + // SSH failed and no explicit KUBE_CONFIG_PATH. 
Fall back to kubectl's + // standard resolution (KUBECONFIG env, otherwise ~/.kube/config) so + // that a developer whose `kubectl` already targets the right base + // cluster doesn't have to set anything else. + fallbackContent, fallbackPath, fallbackErr := loadDefaultKubeconfig() + if fallbackErr == nil { + logger.Info("SSH kubeconfig retrieval failed (%v); falling back to local kubeconfig at %s", sshErr, fallbackPath) + kubeconfigContent = fallbackContent + } else { + return nil, "", fmt.Errorf("failed to read kubeconfig from master (this may occur if sudo requires a password) "+ + "and the local kubectl-default kubeconfig fallback also failed (%v). "+ + "Set KUBE_CONFIG_PATH to a working kubeconfig, or ensure $KUBECONFIG / ~/.kube/config points at the base cluster. "+ + "Original SSH error: %w", fallbackErr, sshErr) + } } // Write kubeconfig content to file (always write a working copy, regardless of source) @@ -348,3 +368,42 @@ func UpdateKubeconfigPort(kubeconfigPath string, localPort int) error { return nil } + +// loadDefaultKubeconfig replicates kubectl's standard kubeconfig resolution +// (KUBECONFIG env, otherwise ~/.kube/config; multiple files in KUBECONFIG are +// merged) and returns the serialized merged config plus a human-readable +// description of where it was loaded from. Used as a last-resort fallback when +// SSH-based retrieval fails and KUBE_CONFIG_PATH is not set, so a developer +// whose `kubectl` already points at the right base cluster can simply run the +// suite without exporting any extra variables. +func loadDefaultKubeconfig() ([]byte, string, error) { + loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() + rawConfig, err := loadingRules.Load() + if err != nil { + return nil, "", fmt.Errorf("clientcmd default loader: %w", err) + } + if rawConfig == nil || len(rawConfig.Clusters) == 0 { + return nil, "", fmt.Errorf("no clusters in default kubeconfig (KUBECONFIG=%q, ~/.kube/config)", os.Getenv("KUBECONFIG")) + } + + // Minify down to the current-context only. Otherwise UpdateKubeconfigPort + // would rewrite the `server:` URL of every cluster in a multi-cluster + // kubeconfig, breaking unrelated entries on the developer's machine. + minified := *rawConfig + if err := clientcmdapi.MinifyConfig(&minified); err != nil { + return nil, "", fmt.Errorf("clientcmd minify default kubeconfig: %w", err) + } + + content, err := clientcmd.Write(minified) + if err != nil { + return nil, "", fmt.Errorf("clientcmd serialize default kubeconfig: %w", err) + } + + source := os.Getenv("KUBECONFIG") + if source == "" { + source = "~/.kube/config (current-context=" + minified.CurrentContext + ")" + } else { + source = "KUBECONFIG=" + source + " (current-context=" + minified.CurrentContext + ")" + } + return content, source, nil +} diff --git a/internal/config/env.go b/internal/config/env.go index 2ff087a..888d39e 100644 --- a/internal/config/env.go +++ b/internal/config/env.go @@ -224,8 +224,16 @@ var ( LogTimetampsEnabledDefaultValue = "true" ) -func ValidateEnvironment() error { - // Default values for environment variables +// ApplyDefaults populates package-level config variables that have a documented +// default value but were not provided through the environment. It is idempotent +// and safe to call multiple times. 
+// +// Suites that don't call ValidateEnvironment() (because they don't need its +// required-variable checks) should still call ApplyDefaults() — otherwise +// optional variables like SSH_VM_USER stay empty and propagate as user="" all +// the way to the SSH server, where it shows up as "Invalid user" / publickey +// rejection that is hard to attribute to a missing default. +func ApplyDefaults() { if YAMLConfigFilename == "" { YAMLConfigFilename = YAMLConfigFilenameDefaultValue } @@ -246,6 +254,10 @@ func ValidateEnvironment() error { if TestClusterNamespace == "" { TestClusterNamespace = TestClusterNamespaceDefaultValue } +} + +func ValidateEnvironment() error { + ApplyDefaults() // There are no default values for these variables and they must be set! Otherwise, the test will fail. if SSHUser == "" { diff --git a/internal/config/overrides.go b/internal/config/overrides.go new file mode 100644 index 0000000..5eed2fc --- /dev/null +++ b/internal/config/overrides.go @@ -0,0 +1,68 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "fmt" + "os" + "regexp" +) + +// envVarRefPattern matches ${NAME} placeholders. We accept only the braced +// form (no bare $NAME) to keep substitution intent explicit and avoid +// accidentally rewriting tags that legitimately contain a dollar sign. +var envVarRefPattern = regexp.MustCompile(`\$\{([A-Za-z_][A-Za-z0-9_]*)\}`) + +// ExpandEnvInModulePullOverride expands ${VAR} references in each module's +// ModulePullOverride field. If a referenced env var is not set, returns an +// error pointing at the offending module so CI fails loudly instead of +// silently falling back to the "main" default in configureModulePullOverride. +// +// This lets test suites declare in YAML which modules should track a CI-built +// image without hard-coding any tag: +// +// modules: +// - name: csi-ceph +// modulePullOverride: "${MODULE_IMAGE_TAG}" +// +// CI then sets MODULE_IMAGE_TAG=pr (GitHub) or mr (GitLab), and the +// resulting ModulePullOverride CR points at the right image without anyone +// editing the YAML per run. +// +// Use this hook right after yaml.Unmarshal of cluster_config.yml. Modules +// without any placeholder are left untouched. 
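+//
+// The CI half is then a single exported variable (values below are
+// illustrative, not taken from any real pipeline):
+//
+//	export MODULE_IMAGE_TAG=pr131
+//	go test ./tests/csi-ceph/...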
+func ExpandEnvInModulePullOverride(def *ClusterDefinition) error { + for _, m := range def.DKPParameters.Modules { + if m == nil || m.ModulePullOverride == "" { + continue + } + matches := envVarRefPattern.FindAllStringSubmatch(m.ModulePullOverride, -1) + if len(matches) == 0 { + continue + } + for _, ms := range matches { + if _, ok := os.LookupEnv(ms[1]); !ok { + return fmt.Errorf( + "module %q references env var ${%s} in modulePullOverride but it is not set", + m.Name, ms[1], + ) + } + } + m.ModulePullOverride = os.Expand(m.ModulePullOverride, os.Getenv) + } + return nil +} diff --git a/internal/config/overrides_test.go b/internal/config/overrides_test.go new file mode 100644 index 0000000..dfe79e6 --- /dev/null +++ b/internal/config/overrides_test.go @@ -0,0 +1,149 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "os" + "strings" + "testing" +) + +func TestExpandEnvInModulePullOverride_NoPlaceholder(t *testing.T) { + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "snapshot-controller", ModulePullOverride: "main"}, + {Name: "csi-ceph", ModulePullOverride: ""}, + {Name: "sds-elastic"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "main" { + t.Errorf("snapshot-controller: got %q, want %q", got, "main") + } + if got := def.DKPParameters.Modules[1].ModulePullOverride; got != "" { + t.Errorf("csi-ceph: got %q, want empty", got) + } + if got := def.DKPParameters.Modules[2].ModulePullOverride; got != "" { + t.Errorf("sds-elastic: got %q, want empty", got) + } +} + +func TestExpandEnvInModulePullOverride_Expands(t *testing.T) { + t.Setenv("MODULE_IMAGE_TAG", "pr131") + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${MODULE_IMAGE_TAG}"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "pr131" { + t.Errorf("got %q, want %q", got, "pr131") + } +} + +func TestExpandEnvInModulePullOverride_MissingEnvFails(t *testing.T) { + // Use t.Setenv to register cleanup that restores the original value (if + // any) after the test, then os.Unsetenv to actually drop it for this run. 
+ const name = "MISSING_TAG_FOR_TEST" + t.Setenv(name, "anything") + if err := os.Unsetenv(name); err != nil { + t.Fatalf("os.Unsetenv: %v", err) + } + + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "snapshot-controller", ModulePullOverride: "main"}, + {Name: "csi-ceph", ModulePullOverride: "${" + name + "}"}, + }, + }, + } + err := ExpandEnvInModulePullOverride(def) + if err == nil { + t.Fatalf("expected error for missing env, got nil") + } + if !strings.Contains(err.Error(), "csi-ceph") { + t.Errorf("error should mention offending module name, got: %v", err) + } + if !strings.Contains(err.Error(), name) { + t.Errorf("error should mention env var name %q, got: %v", name, err) + } +} + +func TestExpandEnvInModulePullOverride_PerModuleEnvs(t *testing.T) { + t.Setenv("CSI_CEPH_TAG", "pr131") + t.Setenv("SDS_ELASTIC_TAG", "mr41") + + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${CSI_CEPH_TAG}"}, + {Name: "sds-elastic", ModulePullOverride: "${SDS_ELASTIC_TAG}"}, + {Name: "snapshot-controller", ModulePullOverride: "main"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "pr131" { + t.Errorf("csi-ceph: got %q, want %q", got, "pr131") + } + if got := def.DKPParameters.Modules[1].ModulePullOverride; got != "mr41" { + t.Errorf("sds-elastic: got %q, want %q", got, "mr41") + } + if got := def.DKPParameters.Modules[2].ModulePullOverride; got != "main" { + t.Errorf("snapshot-controller: got %q, want %q", got, "main") + } +} + +func TestExpandEnvInModulePullOverride_MultiplePlaceholdersInOneString(t *testing.T) { + t.Setenv("PREFIX", "branch") + t.Setenv("NAME", "ms-crc") + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${PREFIX}-${NAME}"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "branch-ms-crc" { + t.Errorf("got %q, want %q", got, "branch-ms-crc") + } +} + +func TestExpandEnvInModulePullOverride_NilModuleSliceEntry(t *testing.T) { + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{nil, {Name: "csi-ceph", ModulePullOverride: "main"}}, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/internal/infrastructure/ssh/client.go b/internal/infrastructure/ssh/client.go index 61736d2..e3f5170 100644 --- a/internal/infrastructure/ssh/client.go +++ b/internal/infrastructure/ssh/client.go @@ -377,94 +377,20 @@ func (c *client) reconnect(ctx context.Context) error { // StartTunnel starts an SSH tunnel with port forwarding from local to remote // It returns a function to stop the tunnel and an error if the tunnel fails to start func (c *client) StartTunnel(ctx context.Context, localPort, remotePort string) (func() error, error) { - // Check context before starting - if err := ctx.Err(); err != nil { - return nil, fmt.Errorf("context error before starting tunnel: %w", err) - } - - listener, err := net.Listen("tcp", "127.0.0.1:"+localPort) - if err != nil { - return nil, fmt.Errorf("failed to listen on local port %s: %w", localPort, err) - } - - stopChan := make(chan struct{}) - - go func() { - defer 
listener.Close() - for { - // Check context and stop channel - select { - case <-ctx.Done(): - return - case <-stopChan: - return - default: - } - - // Set deadline for Accept based on context deadline if available - if deadline, ok := ctx.Deadline(); ok { - if err := listener.(*net.TCPListener).SetDeadline(deadline); err != nil { - // If setting deadline fails, continue without it - } - } - - localConn, err := listener.Accept() - if err != nil { - // Listener closed or error occurred - select { - case <-ctx.Done(): - return - case <-stopChan: - return - default: - // Continue if not stopped - continue - } + dialer := tunnelDialer{ + describe: fmt.Sprintf("%s@%s local:%s -> remote:%s", c.user, c.host, localPort, remotePort), + dial: func() (net.Conn, error) { + c.mu.Lock() + sc := c.sshClient + c.mu.Unlock() + if sc == nil { + return nil, fmt.Errorf("ssh client is not initialized") } - - go func() { - defer localConn.Close() - remoteConn, err := c.sshClient.Dial("tcp", "127.0.0.1:"+remotePort) - if err != nil { - // Connection failed, just return - the error will be visible to the client - return - } - defer remoteConn.Close() - - // Copy data bidirectionally with context support - done := make(chan struct{}, 2) - go func() { - _, _ = copyWithContext(ctx, localConn, remoteConn) - done <- struct{}{} - }() - go func() { - _, _ = copyWithContext(ctx, remoteConn, localConn) - done <- struct{}{} - }() - - // Wait for either direction to finish or context cancellation - select { - case <-ctx.Done(): - return - case <-done: - // One direction finished, wait for the other - select { - case <-ctx.Done(): - return - case <-done: - // Both directions finished - } - } - }() - } - }() - - stop := func() error { - close(stopChan) - return listener.Close() + return sc.Dial("tcp", "127.0.0.1:"+remotePort) + }, + reconnect: c.reconnect, } - - return stop, nil + return runTunnelLoop(ctx, localPort, dialer) } // Exec executes a command on the remote host with automatic retry and reconnection @@ -654,7 +580,7 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo } // Create SSH config for target host - targetConfig, _, err := createSSHConfig(targetUser, targetKeyPath) + targetConfig, targetKeyInfo, err := createSSHConfig(targetUser, targetKeyPath) if err != nil { jumpClient.Close() return nil, fmt.Errorf("failed to create SSH config for target host: %w", err) @@ -683,14 +609,22 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo targetConn, err := jumpClient.Dial("tcp", targetAddr) if err != nil { - lastErr = fmt.Errorf("failed to dial target host %s@%s through jump host: %w", targetUser, targetAddr, err) + lastErr = fmt.Errorf("failed to dial target host %q@%s through jump host %q@%s: %w", + targetUser, targetAddr, jumpUser, jumpAddr, err) continue } targetClientConn, targetChans, targetReqs, err := ssh.NewClientConn(targetConn, targetAddr, targetConfig) if err != nil { targetConn.Close() - lastErr = fmt.Errorf("failed to establish SSH connection to target host: %w", err) + lastErr = fmt.Errorf( + "failed to establish SSH connection to target host %q@%s (via jump %q@%s): %w\n"+ + " Key used: %s (algorithm: %s, fingerprint: %s)\n"+ + " Hint: verify SSH_VM_USER (current=%q) is correct for this VM image and that the key's public part is in %s@%s:~/.ssh/authorized_keys", + targetUser, targetAddr, jumpUser, jumpAddr, err, + targetKeyInfo.Path, targetKeyInfo.Algorithm, targetKeyInfo.Fingerprint, + targetUser, targetUser, targetAddr, + ) continue } @@ 
-700,7 +634,8 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo if targetClient == nil { jumpClient.Close() - return nil, fmt.Errorf("failed to connect to target host after %d attempts: %w", maxRetries, lastErr) + return nil, fmt.Errorf("failed to connect to target host %q@%s after %d attempts: %w", + targetUser, targetAddr, maxRetries, lastErr) } // Start keepalive for both connections @@ -893,17 +828,65 @@ func (c *jumpHostClient) reconnect(ctx context.Context) error { return fmt.Errorf("failed to reconnect after %d attempts: %w", config.SSHRetryCount, lastErr) } -// StartTunnel starts an SSH tunnel with port forwarding from local to remote +// StartTunnel starts an SSH tunnel with port forwarding from local to remote. +// Like the non-jump-host variant, dial errors that look like a dropped SSH +// session trigger a reconnect attempt against jump+target before the next +// retry — Wi-Fi flaps on the developer's laptop are by far the most common +// way for the tunnel to die mid-test. func (c *jumpHostClient) StartTunnel(ctx context.Context, localPort, remotePort string) (func() error, error) { - // Use the target client's StartTunnel method - // We need to access the underlying client's StartTunnel - // Since we can't directly call it, we'll implement it here - return startTunnelOnClient(ctx, c.targetClient, localPort, remotePort) + dialer := tunnelDialer{ + describe: fmt.Sprintf("%s@%s via jump %s@%s local:%s -> remote:%s", + c.targetUser, c.targetHost, c.jumpUser, c.jumpHost, localPort, remotePort), + dial: func() (net.Conn, error) { + c.mu.Lock() + tc := c.targetClient + c.mu.Unlock() + if tc == nil { + return nil, fmt.Errorf("jump-host target client is not initialized") + } + return tc.Dial("tcp", "127.0.0.1:"+remotePort) + }, + reconnect: c.reconnect, + } + return runTunnelLoop(ctx, localPort, dialer) +} + +// tunnelDialer abstracts the per-tunnel concerns that runTunnelLoop needs to +// know about: how to open a fresh remote connection through the active SSH +// session, how to re-establish that session when it dies, and a human-readable +// description for log messages. +type tunnelDialer struct { + // describe identifies the tunnel in WARN/INFO logs. It should encode user, + // host(s) and ports — enough to distinguish concurrent tunnels. + describe string + // dial opens a fresh TCP connection to the remote endpoint via the *current* + // SSH client. Implementations must read the underlying *ssh.Client under + // whatever mutex guards it (so reconnect updates are visible). + dial func() (net.Conn, error) + // reconnect tries to rebuild the broken SSH session(s). Called once per + // accepted local connection when dial fails with a connection-style error. + // May itself perform retries with backoff. + reconnect func(ctx context.Context) error } -// startTunnelOnClient starts a tunnel on a raw ssh.Client -func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, remotePort string) (func() error, error) { - // Check context before starting +// runTunnelLoop runs the accept loop for an SSH tunnel. +// +// Compared to the previous inline implementation it adds two things: +// +// 1. **Auto-reconnect on dial failure.** When sshClient.Dial returns a +// connection-style error (EOF, connection lost, broken pipe…) we kick +// dialer.reconnect and retry the dial once with the freshly rebuilt +// SSH session. 
Without this, a Wi-Fi flap on the developer's laptop +// killed the SSH session permanently, the tunnel listener stayed up +// happily accepting local connects, but every Dial through the dead +// session returned EOF — and the test process spent the entire 20-min +// readiness timeout silently retrying client-go GETs through a port +// that nobody answered. See poll.go for the related per-call deadline. +// 2. **Visible WARN log when reconnect kicks in.** Previously the failure +// was swallowed (`return`); now we emit a WARN every time the tunnel +// has to be rebuilt so users can correlate "tests slowed down" with +// "wifi flapped". +func runTunnelLoop(ctx context.Context, localPort string, dialer tunnelDialer) (func() error, error) { if err := ctx.Err(); err != nil { return nil, fmt.Errorf("context error before starting tunnel: %w", err) } @@ -918,7 +901,6 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, go func() { defer listener.Close() for { - // Check context and stop channel select { case <-ctx.Done(): return @@ -927,63 +909,26 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, default: } - // Set deadline for Accept based on context deadline if available - if deadline, ok := ctx.Deadline(); ok { - if tcpListener, ok := listener.(*net.TCPListener); ok { - if err := tcpListener.SetDeadline(deadline); err != nil { - // If setting deadline fails, continue without it - } - } + // Short Accept deadline so the loop can re-check ctx/stopChan + // promptly even when no clients are connecting; a deadline tied + // to ctx.Deadline() fired only at the very end of the test. + if tcpListener, ok := listener.(*net.TCPListener); ok { + _ = tcpListener.SetDeadline(time.Now().Add(500 * time.Millisecond)) } localConn, err := listener.Accept() if err != nil { - // Listener closed or error occurred select { case <-ctx.Done(): return case <-stopChan: return default: - // Continue if not stopped continue } } - go func() { - defer localConn.Close() - remoteConn, err := sshClient.Dial("tcp", "127.0.0.1:"+remotePort) - if err != nil { - // Connection failed, just return - the error will be visible to the client - return - } - defer remoteConn.Close() - - // Copy data bidirectionally with context support - done := make(chan struct{}, 2) - go func() { - _, _ = copyWithContext(ctx, localConn, remoteConn) - done <- struct{}{} - }() - go func() { - _, _ = copyWithContext(ctx, remoteConn, localConn) - done <- struct{}{} - }() - - // Wait for either direction to finish or context cancellation - select { - case <-ctx.Done(): - return - case <-done: - // One direction finished, wait for the other - select { - case <-ctx.Done(): - return - case <-done: - // Both directions finished - } - } - }() + go handleTunnelConnection(ctx, localConn, dialer) } }() @@ -991,10 +936,64 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, close(stopChan) return listener.Close() } - return stop, nil } +// handleTunnelConnection serves a single accepted local connection. On the +// first dial failure that looks like a dead SSH session we call +// dialer.reconnect and retry once. After that, further failures are surfaced +// to the local client by closing localConn (which causes client-go on the +// other side to see EOF and retry through the freshly opened tunnel on the +// next request). 
+func handleTunnelConnection(ctx context.Context, localConn net.Conn, dialer tunnelDialer) { + defer localConn.Close() + + remoteConn, err := dialer.dial() + if err != nil { + if !isConnectionError(err) { + // Non-connection errors (e.g. invalid address) won't be fixed by a + // reconnect — drop the local conn so the client sees the failure. + logger.Debug("SSH tunnel %s dial failed (non-retryable): %v", dialer.describe, err) + return + } + + logger.Warn("SSH tunnel %s dial failed (%v); attempting to reconnect SSH session", dialer.describe, err) + if rcErr := dialer.reconnect(ctx); rcErr != nil { + logger.Warn("SSH tunnel %s reconnect failed: %v", dialer.describe, rcErr) + return + } + logger.Info("SSH tunnel %s SSH session reconnected; retrying dial", dialer.describe) + + remoteConn, err = dialer.dial() + if err != nil { + logger.Warn("SSH tunnel %s dial still failing after reconnect: %v", dialer.describe, err) + return + } + } + defer remoteConn.Close() + + done := make(chan struct{}, 2) + go func() { + _, _ = copyWithContext(ctx, localConn, remoteConn) + done <- struct{}{} + }() + go func() { + _, _ = copyWithContext(ctx, remoteConn, localConn) + done <- struct{}{} + }() + + select { + case <-ctx.Done(): + return + case <-done: + select { + case <-ctx.Done(): + return + case <-done: + } + } +} + // Exec executes a command on the remote host with automatic retry and reconnection func (c *jumpHostClient) Exec(ctx context.Context, cmd string) (string, error) { var output string diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 86ae335..f2b1d37 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -149,6 +149,14 @@ func loadClusterConfigFromPath(configPath string) (*config.ClusterDefinition, er return nil, fmt.Errorf("failed to parse YAML config: %w", err) } + // Expand ${VAR} placeholders in modulePullOverride fields. CI uses this to + // pass a per-PR/MR image tag via a single env var (e.g. MODULE_IMAGE_TAG) + // without editing the YAML between runs. Missing envs fail fast here so we + // don't silently regress to "main" on accidentally unset variables. + if err := config.ExpandEnvInModulePullOverride(&clusterDef); err != nil { + return nil, fmt.Errorf("expand env in modulePullOverride: %w", err) + } + // Validate the configuration (using the same validation logic as internal/cluster) if len(clusterDef.Masters) == 0 { return nil, fmt.Errorf("at least one master node is required") @@ -194,6 +202,22 @@ func CreateTestCluster( ctx context.Context, yamlConfigFilename string, ) (*TestClusterResources, error) { + // Apply env-var defaults defensively so suites that don't call + // config.ValidateEnvironment() (e.g. csi-ceph e2e) still get sensible + // values for SSH_VM_USER / SSH_PRIVATE_KEY / SSH_PUBLIC_KEY / + // TEST_CLUSTER_NAMESPACE / YAML_CONFIG_FILENAME / TEST_CLUSTER_CLEANUP + // instead of empty strings that surface as obscure failures (e.g. + // user="" -> sshd "Invalid user", or "" filename -> directory read). + config.ApplyDefaults() + + // Belt-and-suspenders: function arg also has a documented default. Without + // this, an empty filename gets joined with the test-package directory and + // yields a path to the directory itself, failing later with a confusing + // "is a directory" read error. + if yamlConfigFilename == "" { + yamlConfigFilename = config.YAMLConfigFilenameDefaultValue + } + logger.Step(1, "Loading cluster configuration from %s", yamlConfigFilename) // Find the test package directory by walking the call stack. 
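Taken together, the two fallbacks above make the zero-configuration path work end to end. A hypothetical suite bootstrap relying on them (`cluster.CreateTestCluster` and the `config` defaults are real; the test itself is illustrative):

```go
package csi_ceph_test

import (
	"context"
	"testing"

	"github.com/deckhouse/storage-e2e/pkg/cluster"
)

func TestClusterBootsWithDefaults(t *testing.T) {
	// No config.ValidateEnvironment() call and an empty filename:
	// CreateTestCluster applies the env-var defaults itself and resolves ""
	// to config.YAMLConfigFilenameDefaultValue instead of joining an empty
	// string into a directory path.
	resources, err := cluster.CreateTestCluster(context.Background(), "")
	if err != nil {
		t.Fatalf("CreateTestCluster: %v", err)
	}
	_ = resources // see TestClusterResources for what is returned
}
```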
diff --git a/pkg/cluster/setup.go b/pkg/cluster/setup.go index ea4ffa5..5d43c90 100644 --- a/pkg/cluster/setup.go +++ b/pkg/cluster/setup.go @@ -542,12 +542,31 @@ echo "%s" _, _ = sshClient.Exec(ctx, chmodCmd) // Step 3: Run dhctl bootstrap command with ssh-agent - // Mount SSH_AUTH_SOCK into the container and use it for authentication - // Note: We don't use --ssh-agent-private-keys anymore, dhctl will use SSH_AUTH_SOCK - // Docker needs to run with sudo for access to docker socket + // Mount SSH_AUTH_SOCK into the container and use it for authentication. + // + // dhctl's underlying SSH config parser (`lib-connection`) eagerly opens + // `~/.ssh/id_rsa` (=/root/.ssh/id_rsa inside the container) during config + // extraction, *before* it considers SSH_AUTH_SOCK. With a passphrase- + // protected key uploaded only as cloud@/home/cloud/.ssh/id_rsa, the + // bootstrap aborts with either: + // - "open /root/.ssh/id_rsa: no such file or directory" (no mount), or + // - "stdin is not a terminal, error reading password" (mount but + // non-interactive `docker run`). + // + // `--ssh-agent-private-keys=""` (flag or env) does NOT short-circuit the + // file load in this image, neither does the `DHCTL_CLI_`-prefixed env. + // `lib-connection` exposes two purpose-built env vars for "auth via agent + // only, never touch any key file" (no CLI flags in install:main): + // FORCE_NO_PRIVATE_KEYS=true skip the id_rsa fallback + // USE_AGENT_WITH_NO_PRIVATE_KEYS=true set ForceUseSSHAgent=true so + // that HaveAuthMethods() passes + // Together they force dhctl onto the ssh-agent we already loaded with the + // unlocked key (via askpass) above, with no file-side dependencies. + // + // Docker needs to run with sudo for access to the docker socket. installImage := fmt.Sprintf("%s/install:%s", registryRepo, devBranch) bootstrapCmd := fmt.Sprintf( - "sudo -u %s bash -c 'export SSH_AUTH_SOCK=%s; sudo docker run --network=host --pull=always -v \"/home/%s/config.yml:/config.yml\" -v \"%s:/tmp/ssh-agent.sock\" -e SSH_AUTH_SOCK=/tmp/ssh-agent.sock %s dhctl bootstrap --ssh-host=%s --ssh-user=%s --config=/config.yml > %s 2>&1'", + "sudo -u %s bash -c 'export SSH_AUTH_SOCK=%s; sudo docker run --network=host --pull=always -v \"/home/%s/config.yml:/config.yml\" -v \"%s:/tmp/ssh-agent.sock\" -e SSH_AUTH_SOCK=/tmp/ssh-agent.sock -e FORCE_NO_PRIVATE_KEYS=true -e USE_AGENT_WITH_NO_PRIVATE_KEYS=true %s dhctl bootstrap --ssh-host=%s --ssh-user=%s --config=/config.yml > %s 2>&1'", config.VMSSHUser, actualAgentSocket, config.VMSSHUser, actualAgentSocket, installImage, masterIP, config.VMSSHUser, remoteLogPath, ) diff --git a/pkg/cluster/vms.go b/pkg/cluster/vms.go index 72f5cd1..8376708 100644 --- a/pkg/cluster/vms.go +++ b/pkg/cluster/vms.go @@ -487,10 +487,39 @@ func getCVMINameFromImageURL(imageURL string) string { return name } +// cloudInitAptMirror configures cloud-init to use mirror.yandex.ru as the +// Ubuntu apt mirror for both the primary archive and security pools, and +// pins apt to IPv4. Default Ubuntu mirrors (archive.ubuntu.com / +// security.ubuntu.com) round-robin across many IPs and are partially +// unreachable from some Flant infra (e.g. some egress paths block all the +// IPv6 endpoints, and most IPv4 ones time out for archive.ubuntu.com), +// which makes Step 9 (Wait for Docker) and per-node package_update very flaky +// or outright stall. mirror.yandex.ru carries main/universe/multiverse/restricted +// for the same suites and is reachable in those environments. 
+//
+// The trailing newline keeps the interpolated block flush with the rest of
+// the cloud-config: `%spackage_update: true` relies on the constant ending
+// at a line boundary of its own.
+const cloudInitAptMirror = `apt:
+  primary:
+    - arches: [default]
+      uri: http://mirror.yandex.ru/ubuntu
+  security:
+    - arches: [default]
+      uri: http://mirror.yandex.ru/ubuntu
+`
+
+// cloudInitForceIPv4Apt disables IPv6 for apt to avoid 30-second connection
+// timeouts on every package fetch when the host lacks working IPv6 egress.
+// Written via write_files so it is in effect before package_update runs.
+const cloudInitForceIPv4Apt = `  - path: /etc/apt/apt.conf.d/99force-ipv4
+    content: |
+      Acquire::ForceIPv4 "true";
+`
+
 // generateCloudInitUserData generates cloud-init user data for VM provisioning (cluster nodes)
 func generateCloudInitUserData(hostname, sshPubKey string) string {
 	return fmt.Sprintf(`#cloud-config
-package_update: true
+%spackage_update: true
 packages:
   - tmux
   - htop
@@ -515,7 +544,7 @@ users:
     ssh_authorized_keys:
       - %s
 write_files:
-  - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf
+%s  - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf
     content: |
       # Allow TCP forwarding
       AllowTcpForwarding yes
@@ -531,14 +560,14 @@ runcmd:
   - systemctl daemon-reload
   - systemctl enable --now qemu-guest-agent.service
   - echo 'source /root/.kubectl_aliases' >> /root/.bashrc
-`, sshPubKey, hostname)
+`, cloudInitAptMirror, sshPubKey, cloudInitForceIPv4Apt, hostname)
 }
 
 // generateSetupNodeCloudInit generates cloud-init user data for the setup/bootstrap node.
 // This includes Docker which is required for running the Deckhouse installer.
 func generateSetupNodeCloudInit(hostname, sshPubKey string) string {
 	return fmt.Sprintf(`#cloud-config
-package_update: true
+%spackage_update: true
 packages:
   - tmux
   - htop
@@ -560,7 +589,7 @@ users:
     ssh_authorized_keys:
       - %s
 write_files:
-  - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf
+%s  - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf
     content: |
       # Allow TCP forwarding
       AllowTcpForwarding yes
@@ -571,7 +600,7 @@ runcmd:
   - systemctl daemon-reload
   - systemctl enable --now qemu-guest-agent.service
   - systemctl enable --now docker.service
-`, sshPubKey, hostname)
+`, cloudInitAptMirror, sshPubKey, cloudInitForceIPv4Apt, hostname)
 }
 
 // RemoveAllVMs forcefully stops and deletes virtual machines, virtual disks, and virtual images.
diff --git a/pkg/kubernetes/cephblockpool.go b/pkg/kubernetes/cephblockpool.go
index 1d112e5..98dba64 100644
--- a/pkg/kubernetes/cephblockpool.go
+++ b/pkg/kubernetes/cephblockpool.go
@@ -160,43 +160,23 @@ func CreateCephBlockPool(ctx context.Context, kubeconfig *rest.Config, cfg CephB
 // WaitForCephBlockPoolReady blocks until the CephBlockPool reports
 // `status.phase == "Ready"`. Rook transitions the pool from Progressing to
 // Ready once the Ceph OSDs have accepted the new pool and its CRUSH rule.
+//
+// Per-call deadlines and loud (WARN) logging on consecutive network failures
+// are inherited from pollResourceUntilReady, so a dropped SSH tunnel surfaces
+// in seconds instead of after the parent timeout.
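+//
+// Typical call shape (namespace, pool name and budget are illustrative):
+//
+//	err := WaitForCephBlockPoolReady(ctx, cfg, "rook-ns", "test-pool", 10*time.Minute)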
func WaitForCephBlockPoolReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { - if namespace == "" || name == "" { - return fmt.Errorf("namespace and name are required") - } - - logger.Debug("Waiting for CephBlockPool %s/%s to become Ready (timeout: %v)", namespace, name, timeout) - - dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) - if err != nil { - return fmt.Errorf("failed to create dynamic client: %w", err) - } - - ctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - obj, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) - if err == nil { + return pollResourceUntilReady( + ctx, kubeconfig, CephBlockPoolGVR, namespace, name, + timeout, PollTickInterval, "CephBlockPool", + func(obj *unstructured.Unstructured) (bool, string) { phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") if phase == "Ready" { - logger.Success("CephBlockPool %s/%s is Ready", namespace, name) - return nil + return true, "phase=Ready" } - logger.Debug("CephBlockPool %s/%s phase: %q, waiting...", namespace, name, phase) - } else if !apierrors.IsNotFound(err) { - logger.Debug("Error getting CephBlockPool %s/%s: %v", namespace, name, err) - } - - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for CephBlockPool %s/%s: %w", namespace, name, ctx.Err()) - case <-ticker.C: - } - } + logger.Debug("CephBlockPool %s/%s phase: %q, waiting...", obj.GetNamespace(), obj.GetName(), phase) + return false, "" + }, + ) } // DeleteCephBlockPool deletes a CephBlockPool. Safe to call if the pool does diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go index eeb1a14..9aa87ae 100644 --- a/pkg/kubernetes/cephcluster.go +++ b/pkg/kubernetes/cephcluster.go @@ -331,46 +331,27 @@ func toInterfaceSlice(in []string) []interface{} { // // We return success once `state == "Created"`. HEALTH_ERR is reported in the // log and does not short-circuit (Rook may recover). +// +// Network errors are logged loud (WARN) after a few consecutive failures so a +// dropped SSH tunnel surfaces in seconds instead of getting buried in Debug +// output. See pollResourceUntilReady for the per-call deadline rationale. 
func WaitForCephClusterReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { - if namespace == "" || name == "" { - return fmt.Errorf("namespace and name are required") - } - - logger.Debug("Waiting for CephCluster %s/%s to reach Created state (timeout: %v)", namespace, name, timeout) - - dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) - if err != nil { - return fmt.Errorf("failed to create dynamic client: %w", err) - } - - ctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - - ticker := time.NewTicker(10 * time.Second) - defer ticker.Stop() - - for { - obj, err := dynamicClient.Resource(CephClusterGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) - if err == nil { + return pollResourceUntilReady( + ctx, kubeconfig, CephClusterGVR, namespace, name, + timeout, 10*time.Second, "CephCluster", + func(obj *unstructured.Unstructured) (bool, string) { state, _, _ := unstructured.NestedString(obj.Object, "status", "state") health, _, _ := unstructured.NestedString(obj.Object, "status", "ceph", "health") phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") if state == "Created" || phase == "Ready" { - logger.Success("CephCluster %s/%s is Created (ceph health: %s)", namespace, name, health) - return nil + return true, fmt.Sprintf("state=%s phase=%s ceph health: %s", state, phase, health) } - logger.Debug("CephCluster %s/%s state=%q phase=%q health=%q", namespace, name, state, phase, health) - } else if !apierrors.IsNotFound(err) { - logger.Debug("Error getting CephCluster %s/%s: %v", namespace, name, err) - } - - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for CephCluster %s/%s: %w", namespace, name, ctx.Err()) - case <-ticker.C: - } - } + logger.Debug("CephCluster %s/%s state=%q phase=%q health=%q", + obj.GetNamespace(), obj.GetName(), state, phase, health) + return false, "" + }, + ) } // DeleteCephCluster removes a CephCluster. Tearing down the cluster this way diff --git a/pkg/kubernetes/cephfilesystem.go b/pkg/kubernetes/cephfilesystem.go index e4d3c4a..bb185ee 100644 --- a/pkg/kubernetes/cephfilesystem.go +++ b/pkg/kubernetes/cephfilesystem.go @@ -189,47 +189,25 @@ func CreateCephFilesystem(ctx context.Context, kubeconfig *rest.Config, cfg Ceph // `status.phase == "Ready"`. As a fallback (some Rook revisions populate // `status.conditions` first) the function also accepts a Ready=True // condition. +// +// Per-call deadlines and loud (WARN) logging on consecutive network failures +// are inherited from pollResourceUntilReady. 
func WaitForCephFilesystemReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { - if namespace == "" || name == "" { - return fmt.Errorf("namespace and name are required") - } - - logger.Debug("Waiting for CephFilesystem %s/%s to become Ready (timeout: %v)", namespace, name, timeout) - - dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) - if err != nil { - return fmt.Errorf("failed to create dynamic client: %w", err) - } - - ctx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - obj, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) - if err == nil { + return pollResourceUntilReady( + ctx, kubeconfig, CephFilesystemGVR, namespace, name, + timeout, PollTickInterval, "CephFilesystem", + func(obj *unstructured.Unstructured) (bool, string) { phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") if phase == "Ready" { - logger.Success("CephFilesystem %s/%s is Ready (status.phase)", namespace, name) - return nil + return true, "status.phase" } if cephFilesystemReadyByCondition(obj.Object) { - logger.Success("CephFilesystem %s/%s is Ready (status.conditions[Ready]=True)", namespace, name) - return nil + return true, "status.conditions[Ready]=True" } - logger.Debug("CephFilesystem %s/%s phase: %q, waiting...", namespace, name, phase) - } else if !apierrors.IsNotFound(err) { - logger.Debug("Error getting CephFilesystem %s/%s: %v", namespace, name, err) - } - - select { - case <-ctx.Done(): - return fmt.Errorf("timeout waiting for CephFilesystem %s/%s: %w", namespace, name, ctx.Err()) - case <-ticker.C: - } - } + logger.Debug("CephFilesystem %s/%s phase: %q, waiting...", obj.GetNamespace(), obj.GetName(), phase) + return false, "" + }, + ) } func cephFilesystemReadyByCondition(obj map[string]interface{}) bool { diff --git a/pkg/kubernetes/poll.go b/pkg/kubernetes/poll.go new file mode 100644 index 0000000..6cd4ba4 --- /dev/null +++ b/pkg/kubernetes/poll.go @@ -0,0 +1,160 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// PollGetTimeout caps a single Get call inside readiness pollers. Without +// this cap a hung TCP connect (e.g. SSH tunnel that died after a Wi-Fi flap +// on the developer's laptop) eats the entire parent timeout silently — the +// poller appears to "hang" until the per-resource ReadyTimeout fires 15-20 +// minutes later. With a 30s cap each Get fails fast, so we surface the +// network problem early via the WARN log emitted by pollResourceUntilReady. 
+const PollGetTimeout = 30 * time.Second
+
+// PollTickInterval is the default tick interval between Get attempts when
+// waiting for a Kubernetes resource to reach a ready state.
+const PollTickInterval = 5 * time.Second
+
+// pollResourceUntilReady polls a single namespaced unstructured resource
+// until isReady reports ready == true or the parent timeout expires.
+//
+// It centralizes three behaviors that all of our Wait*Ready helpers want:
+//   - per-call deadline (PollGetTimeout) on every Get, so a dead network
+//     surfaces in seconds instead of after the readiness timeout;
+//   - WARN logs with a counter when consecutive network errors happen — silent
+//     pollers were the root cause of "test hangs forever after Wi-Fi flap";
+//   - tolerance of NotFound (the resource may not have been seen by the
+//     watch cache yet) and of `isReady=false` (still progressing).
+//
+// Parameters:
+//
+//   - kubeconfig: rest config used to construct the dynamic client.
+//   - gvr: GroupVersionResource of the resource being polled.
+//   - namespace, name: scope of the resource. Must both be non-empty.
+//   - readyTimeout: overall budget. Returns timeout error after this.
+//   - tickInterval: gap between Get attempts. Pass PollTickInterval if
+//     unsure; resources with slow reconcilers can use longer intervals.
+//   - resourceLabel: string used in log lines (e.g. "CephCluster"). Keep
+//     short — the namespace/name is appended for context.
+//   - isReady: decider over the unstructured object. Returns
+//     (ready, humanReason). If ready is true, pollResourceUntilReady
+//     prints a Success log including the reason and returns nil.
+func pollResourceUntilReady(
+	ctx context.Context,
+	kubeconfig *rest.Config,
+	gvr schema.GroupVersionResource,
+	namespace, name string,
+	readyTimeout time.Duration,
+	tickInterval time.Duration,
+	resourceLabel string,
+	isReady func(obj *unstructured.Unstructured) (ready bool, reason string),
+) error {
+	if namespace == "" || name == "" {
+		return fmt.Errorf("namespace and name are required")
+	}
+	if isReady == nil {
+		return fmt.Errorf("isReady is required")
+	}
+	if tickInterval <= 0 {
+		tickInterval = PollTickInterval
+	}
+
+	logger.Debug("Waiting for %s %s/%s to become Ready (timeout: %v)", resourceLabel, namespace, name, readyTimeout)
+
+	dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig)
+	if err != nil {
+		return fmt.Errorf("failed to create dynamic client: %w", err)
+	}
+
+	deadlineCtx, cancel := context.WithTimeout(ctx, readyTimeout)
+	defer cancel()
+
+	ticker := time.NewTicker(tickInterval)
+	defer ticker.Stop()
+
+	var consecutiveErrs int
+	for {
+		obj, err := getWithTimeout(deadlineCtx, dynamicClient, gvr, namespace, name, PollGetTimeout)
+		switch {
+		case err == nil:
+			consecutiveErrs = 0
+			if ready, reason := isReady(obj); ready {
+				if reason != "" {
+					logger.Success("%s %s/%s is Ready (%s)", resourceLabel, namespace, name, reason)
+				} else {
+					logger.Success("%s %s/%s is Ready", resourceLabel, namespace, name)
+				}
+				return nil
+			}
+		case apierrors.IsNotFound(err):
+			// Resource hasn't propagated yet. Treat as "still progressing"
+			// without warning so we don't spam logs on healthy clusters that
+			// just haven't observed the create yet.
+			consecutiveErrs = 0
+			logger.Debug("%s %s/%s not found yet", resourceLabel, namespace, name)
+		default:
+			consecutiveErrs++
+			// Quiet the first two failures (spurious 5xx, leader re-election),
+			// loud after that.
Loud == WARN at every iteration so the user + // can see the cluster connection is dying instead of waiting for + // the readyTimeout to fire. + if consecutiveErrs >= 3 { + logger.Warn( + "%s %s/%s GET failed for %d consecutive iterations: %v", + resourceLabel, namespace, name, consecutiveErrs, err, + ) + } else { + logger.Debug("Error getting %s %s/%s: %v", resourceLabel, namespace, name, err) + } + } + + select { + case <-deadlineCtx.Done(): + return fmt.Errorf("timeout waiting for %s %s/%s: %w", resourceLabel, namespace, name, deadlineCtx.Err()) + case <-ticker.C: + } + } +} + +// getWithTimeout wraps dynamicClient.Get with a per-call deadline derived +// from the parent context. The wrapper avoids leaking goroutines blocked on +// a dead TCP connection. +func getWithTimeout( + parent context.Context, + dynamicClient dynamic.Interface, + gvr schema.GroupVersionResource, + namespace, name string, + perCallTimeout time.Duration, +) (*unstructured.Unstructured, error) { + callCtx, cancel := context.WithTimeout(parent, perCallTimeout) + defer cancel() + return dynamicClient.Resource(gvr).Namespace(namespace).Get(callCtx, name, metav1.GetOptions{}) +} From d3853d70bb312cee1c7914beb39cecc88aeb8766 Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Tue, 5 May 2026 12:20:55 +0300 Subject: [PATCH 07/14] Warn loudly when SSH kubeconfig falls back to ~/.kube/config GetKubeconfig used to log a single info-level line when SSH retrieval of admin.conf failed and we silently dropped to the developer's local kubeconfig. In practice that hid a class of nasty bugs where tests were acquiring stale locks on unrelated SAN clusters or installing modules against the wrong stand because $KUBECONFIG happened to point elsewhere. Make the fallback obvious: * Tag every kubeconfig source path with a short label (SSH(...), KUBE_CONFIG_PATH=..., LOCAL_FALLBACK(...)). * Promote the fallback message to logger.Warn, include the resolved current-context and cluster server URL, and tell the user how to fail-fast (unset KUBECONFIG, drop ~/.kube/config) if that behaviour is undesirable. * Always print a final "Loaded kubeconfig (source=..., current-context=..., server=...)" line so the actual cluster is visible in test logs regardless of which resolution path fired. The new kubeconfigContextSummary helper parses the serialized kubeconfig through clientcmd.Load and degrades to "" on any error so the surrounding log line stays safe to print. Signed-off-by: Aleksandr Zimin --- internal/cluster/cluster.go | 70 +++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index af47393..62d286c 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -226,7 +226,15 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien kubeconfigPath := filepath.Join(outputDir, fmt.Sprintf("kubeconfig-%s.yml", masterIP)) - var kubeconfigContent []byte + var ( + kubeconfigContent []byte + // kubeconfigSource is a short, human-readable tag identifying where the + // kubeconfig came from. It's printed at the end of GetKubeconfig so it + // is always obvious in test logs which cluster we're actually about to + // hit — important after diagnosing wrong-cluster bugs that look like + // "stale lock" or "unexpected modules". + kubeconfigSource string + ) // Read kubeconfig via SSH: prefer super-admin.conf when present (see getKubeconfigRemoteShell). 
kubeconfigContentStr, sshErr := sshClient.Exec(ctx, getKubeconfigRemoteShell) @@ -234,6 +242,7 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien case sshErr == nil: // SSH succeeded - use the content from SSH kubeconfigContent = []byte(kubeconfigContentStr) + kubeconfigSource = fmt.Sprintf("SSH(%s@%s:/etc/kubernetes/{super-admin,admin}.conf)", user, masterIP) case config.KubeConfigPath != "": // SSH retrieval failed (likely due to sudo password requirement) and the @@ -247,23 +256,45 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, readErr) } kubeconfigContent = readContent + kubeconfigSource = fmt.Sprintf("KUBE_CONFIG_PATH=%s", resolvedPath) default: // SSH failed and no explicit KUBE_CONFIG_PATH. Fall back to kubectl's // standard resolution (KUBECONFIG env, otherwise ~/.kube/config) so // that a developer whose `kubectl` already targets the right base // cluster doesn't have to set anything else. + // + // This branch is *very loud* on purpose: silent fallback to the + // developer's personal ~/.kube/config has historically caused tests + // to acquire stale locks on unrelated SAN clusters or deploy modules + // against the wrong stand. We make sure both the WARN line and the + // final source-stamp surface what just happened. fallbackContent, fallbackPath, fallbackErr := loadDefaultKubeconfig() - if fallbackErr == nil { - logger.Info("SSH kubeconfig retrieval failed (%v); falling back to local kubeconfig at %s", sshErr, fallbackPath) - kubeconfigContent = fallbackContent - } else { + if fallbackErr != nil { return nil, "", fmt.Errorf("failed to read kubeconfig from master (this may occur if sudo requires a password) "+ "and the local kubectl-default kubeconfig fallback also failed (%v). "+ "Set KUBE_CONFIG_PATH to a working kubeconfig, or ensure $KUBECONFIG / ~/.kube/config points at the base cluster. "+ "Original SSH error: %w", fallbackErr, sshErr) } - } + fbCtx, fbServer := kubeconfigContextSummary(fallbackContent) + logger.Warn( + "SSH kubeconfig retrieval from %s@%s failed (%v); falling back to LOCAL kubeconfig at %s "+ + "(current-context=%q, server=%q). "+ + "This is almost certainly NOT the cluster you intended to test against — check SSH_HOST/SSH_USER, "+ + "or set KUBE_CONFIG_PATH to a specific kubeconfig file. "+ + "To fail fast instead of silently falling back, unset $KUBECONFIG and remove ~/.kube/config", + user, masterIP, sshErr, fallbackPath, fbCtx, fbServer, + ) + kubeconfigContent = fallbackContent + kubeconfigSource = fmt.Sprintf("LOCAL_FALLBACK(%s)", fallbackPath) + } + + // Always stamp the kubeconfig source + the resulting current-context/server + // in the log. With this single line a developer reading the output knows + // for sure which cluster the test is about to talk to, regardless of which + // of the three resolution paths fired above. 
+ finalCtx, finalServer := kubeconfigContextSummary(kubeconfigContent) + logger.Info("Loaded kubeconfig (source=%s, current-context=%q, server=%q)", kubeconfigSource, finalCtx, finalServer) // Write kubeconfig content to file (always write a working copy, regardless of source) kubeconfigFile, err := os.Create(kubeconfigPath) @@ -369,6 +400,33 @@ func UpdateKubeconfigPort(kubeconfigPath string, localPort int) error { return nil } +// kubeconfigContextSummary parses a serialized kubeconfig and returns its +// current-context name and the matching cluster's `server:` URL. Used purely +// for human-readable log lines that identify which cluster the test is about +// to talk to. On any parse failure the helper returns "" / "" +// rather than an error: failing here would defeat its only purpose, which is +// to make the surrounding log message safer to print under partial failures. +func kubeconfigContextSummary(content []byte) (currentContext, server string) { + currentContext = "" + server = "" + if len(content) == 0 { + return + } + cfg, err := clientcmd.Load(content) + if err != nil || cfg == nil { + return + } + if cfg.CurrentContext != "" { + currentContext = cfg.CurrentContext + } + if ctx, ok := cfg.Contexts[cfg.CurrentContext]; ok && ctx != nil { + if cl, ok := cfg.Clusters[ctx.Cluster]; ok && cl != nil && cl.Server != "" { + server = cl.Server + } + } + return +} + // loadDefaultKubeconfig replicates kubectl's standard kubeconfig resolution // (KUBECONFIG env, otherwise ~/.kube/config; multiple files in KUBECONFIG are // merged) and returns the serialized merged config plus a human-readable From b810de2e5ffa0a723475dc53b948e0f7f453cd88 Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Tue, 5 May 2026 13:08:20 +0300 Subject: [PATCH 08/14] Wait for Ceph CRs to disappear during teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TeardownCephStorageClass now waits for each CR to be GC'd before deleting its parent. Without that synchronization the parent CephCluster could be deleted while a child CephBlockPool / CephFilesystem is still alive, leaving Rook stuck with DeletionIsBlocked / ObjectHasDependents and the cluster in phase=Deleting indefinitely. Adds: - pollResourceUntilGone helper with periodic deletionTimestamp / finalizers progress logging, so a stuck finalizer surfaces immediately instead of after a silent timeout. - WaitFor*Gone helpers for CephCluster, CephBlockPool, CephFilesystem, CephClusterAuthentication, CephClusterConnection, CephStorageClass with sensible per-CR default budgets. - errIfTerminating guard in every Create* helper so an Ensure* call finds a Terminating CR and fails fast instead of issuing a silent no-op Update and trapping WaitFor*Ready for 15-20m. - pollResourceUntilReady fail-fast on deletionTimestamp != nil for the same reason. Fail-fast policy on Wait*Gone timeouts: errors are aggregated and returned, no auto-strip of finalizers — that would mask real Rook bugs. Operator must investigate the cluster manually before re-running. 
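Intended teardown shape (the caller below is hypothetical; the Delete*/WaitFor*Gone
helpers are the ones this commit adds, and timeout=0 is assumed to select each
helper's default budget, as documented for WaitForCephClusterGone):

    func teardownCephCRs(ctx context.Context, cfg *rest.Config, ns, pool, cl string) error {
        // Children before parents: a live CephBlockPool blocks the parent
        // CephCluster's finalizer with ObjectHasDependents.
        if err := kubernetes.DeleteCephBlockPool(ctx, cfg, ns, pool); err != nil {
            return err
        }
        if err := kubernetes.WaitForCephBlockPoolGone(ctx, cfg, ns, pool, 0); err != nil {
            return err // fail fast: never auto-strip finalizers here
        }
        if err := kubernetes.DeleteCephCluster(ctx, cfg, ns, cl); err != nil {
            return err
        }
        return kubernetes.WaitForCephClusterGone(ctx, cfg, ns, cl, 0)
    }

(Imports: context, k8s.io/client-go/rest, and this repo's pkg/kubernetes.)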
Signed-off-by: Aleksandr Zimin --- docs/FUNCTIONS_GLOSSARY.md | 38 +++-- pkg/kubernetes/cephblockpool.go | 29 +++- pkg/kubernetes/cephcluster.go | 35 +++++ pkg/kubernetes/cephclusterconnection.go | 44 +++++- pkg/kubernetes/cephfilesystem.go | 28 +++- pkg/kubernetes/cephstorageclass.go | 24 ++- pkg/kubernetes/poll.go | 201 ++++++++++++++++++++++-- pkg/testkit/ceph.go | 35 +++++ 8 files changed, 402 insertions(+), 32 deletions(-) diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md index 73ab6a8..7aba661 100644 --- a/docs/FUNCTIONS_GLOSSARY.md +++ b/docs/FUNCTIONS_GLOSSARY.md @@ -248,43 +248,49 @@ All exported functions available in the `pkg/` directory, grouped by resource. `pkg/kubernetes/cephcluster.go` -- `CreateCephCluster(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephCluster` CR using `CephClusterConfig` (image, mon/mgr counts, network provider, OSD storage class / count / size, data-dir host path, etc.). Idempotent. -- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. -- `DeleteCephCluster(ctx, kubeconfig, namespace, name)` — Deletes the CR; NotFound is treated as success. Does NOT garbage-collect OSD data on host disks. +- `CreateCephCluster(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephCluster` CR using `CephClusterConfig` (image, mon/mgr counts, network provider, OSD storage class / count / size, data-dir host path, etc.). Idempotent. **Fail-fast:** if an existing CR has `metadata.deletionTimestamp != nil`, returns an error instead of trying to update a Terminating object (which would silently no-op and trap the next `WaitForCephClusterReady` for 15-20 minutes). +- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. **Fail-fast** when the CR comes back with `deletionTimestamp != nil` — there's no point waiting for Ready on a Terminating object. +- `DeleteCephCluster(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Does NOT garbage-collect OSD data on host disks. Pair with `WaitForCephClusterGone` if the next step depends on the CR being fully GC'd (e.g. before re-creating the cluster, or to detect a stuck `cephcluster.ceph.rook.io` finalizer early). +- `WaitForCephClusterGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR returns NotFound (default `CephClusterGoneTimeout` = 10m when timeout is 0). Logs deletionTimestamp / finalizers progress periodically, so a stuck finalizer (typical after a teardown that left dependents alive — see `DeletionIsBlocked`) is visible immediately instead of after a silent timeout. Fail-fast on timeout: does NOT auto-strip finalizers — investigate the cluster manually before re-running. 
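+
+For example, an `Ensure`-style caller can recover from the Create fail-fast by waiting out the GC and re-creating; a sketch only (matching on the error text is illustrative, not a stable contract, and `ns`/`name`/`clusterCfg` are hypothetical):
+
+```go
+err := kubernetes.CreateCephCluster(ctx, cfg, clusterCfg)
+if err != nil && strings.Contains(err.Error(), "is being deleted") {
+	// A previous teardown didn't finish; let the finalizer unwind, then retry.
+	if err = kubernetes.WaitForCephClusterGone(ctx, cfg, ns, name, 0); err == nil {
+		err = kubernetes.CreateCephCluster(ctx, cfg, clusterCfg)
+	}
+}
+```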
## CephBlockPool (Rook) `pkg/kubernetes/cephblockpool.go` -- `CreateCephBlockPool(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephBlockPool` from `CephBlockPoolConfig` (replicated with optional `requireSafeReplicaSize` override, or erasure-coded with `dataChunks`/`codingChunks`; `failureDomain`). -- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. -- `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Idempotent delete. +- `CreateCephBlockPool(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephBlockPool` from `CephBlockPoolConfig` (replicated with optional `requireSafeReplicaSize` override, or erasure-coded with `dataChunks`/`codingChunks`; `failureDomain`). **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. Fail-fast on `deletionTimestamp != nil`. +- `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Pair with `WaitForCephBlockPoolGone` to make sure the parent CephCluster's deletion isn't blocked by `ObjectHasDependents`. +- `WaitForCephBlockPoolGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR is GC'd (default `CephBlockPoolGoneTimeout` = 5m). Logs progress periodically. ## CephFilesystem (Rook) `pkg/kubernetes/cephfilesystem.go` -- `CreateCephFilesystem(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephFilesystem` from `CephFilesystemConfig` (one replicated metadata pool + one replicated data pool, configurable `failureDomain`, `MetadataServerActiveCount`, optional `RequireSafeReplicaSize`). Idempotent. -- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. -- `DeleteCephFilesystem(ctx, kubeconfig, namespace, name)` — Idempotent delete. +- `CreateCephFilesystem(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephFilesystem` from `CephFilesystemConfig` (one replicated metadata pool + one replicated data pool, configurable `failureDomain`, `MetadataServerActiveCount`, optional `RequireSafeReplicaSize`). Idempotent. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. Fail-fast on `deletionTimestamp != nil`. +- `DeleteCephFilesystem(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. 
Pair with `WaitForCephFilesystemGone` to make sure the parent CephCluster's deletion isn't blocked by `ObjectHasDependents`.
+- `WaitForCephFilesystemGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR is GC'd (default `CephFilesystemGoneTimeout` = 5m). Logs progress periodically.
 - `CephFSDataPoolFullName(fsName, dataPoolName)` — Returns the full Ceph pool name (`<fsName>-<dataPoolName>`) that should be passed to `CephStorageClass.spec.cephFS.pool`.
 
 ## CephClusterConnection / CephClusterAuthentication (csi-ceph)
 
 `pkg/kubernetes/cephclusterconnection.go`
 
-- `CreateCephClusterAuthentication(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterAuthentication` CR (`userID` + `userKey`) used by csi-ceph to log in to Ceph.
-- `DeleteCephClusterAuthentication(ctx, kubeconfig, name)` — Idempotent delete.
-- `CreateCephClusterConnection(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterConnection` CR (`clusterID == fsid`, `monitors`, `userID`, `userKey`). `clusterID` is immutable: existing-resource updates leave it unchanged and only sync monitors/user.
-- `DeleteCephClusterConnection(ctx, kubeconfig, name)` — Idempotent delete.
+- `CreateCephClusterAuthentication(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterAuthentication` CR (`userID` + `userKey`) used by csi-ceph to log in to Ceph. **Fail-fast** when the existing CR has `deletionTimestamp != nil`.
+- `DeleteCephClusterAuthentication(ctx, kubeconfig, name)` — Fire-and-forget delete; NotFound is treated as success.
+- `WaitForCephClusterAuthenticationGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephClusterAuthenticationGoneTimeout` = 1m).
+- `CreateCephClusterConnection(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterConnection` CR (`clusterID == fsid`, `monitors`, `userID`, `userKey`). `clusterID` is immutable: existing-resource updates leave it unchanged and only sync monitors/user. **Fail-fast** when the existing CR has `deletionTimestamp != nil`.
+- `DeleteCephClusterConnection(ctx, kubeconfig, name)` — Fire-and-forget delete; NotFound is treated as success.
+- `WaitForCephClusterConnectionGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephClusterConnectionGoneTimeout` = 1m).
 - `WaitForCephClusterConnectionCreated(ctx, kubeconfig, name, timeout)` — Polls until csi-ceph reports `status.phase == "Created"` (credentials + monitors validated against the live Ceph cluster).
 
 ## CephStorageClass (csi-ceph)
 
 `pkg/kubernetes/cephstorageclass.go`
 
-- `CreateCephStorageClass(ctx, kubeconfig, cfg)` — Creates or updates a csi-ceph `CephStorageClass` CR (RBD by default; CephFS when `Type == "CephFS"` and `CephFSName` / `CephFSPool` are set). The csi-ceph controller provisions a corresponding core `storage.k8s.io/v1 StorageClass` as a side effect.
-- `DeleteCephStorageClass(ctx, kubeconfig, name)` — Idempotent delete; the controller removes the backing StorageClass.
+- `CreateCephStorageClass(ctx, kubeconfig, cfg)` — Creates or updates a csi-ceph `CephStorageClass` CR (RBD by default; CephFS when `Type == "CephFS"` and `CephFSName` / `CephFSPool` are set). The csi-ceph controller provisions a corresponding core `storage.k8s.io/v1 StorageClass` as a side effect. **Fail-fast** when the existing CR has `deletionTimestamp != nil`.
+- `DeleteCephStorageClass(ctx, kubeconfig, name)` — Fire-and-forget delete; the controller removes the backing StorageClass.
+- `WaitForCephStorageClassGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephStorageClassGoneTimeout` = 1m).
 - `WaitForCephStorageClassCreated(ctx, kubeconfig, name, timeout)` — Polls until `status.phase == "Created"`.
 
 ## Default StorageClass (Testkit)
@@ -300,7 +306,7 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 
 - `EnsureCephStorageClass(ctx, kubeconfig, cfg)` — High-level end-to-end helper that turns an empty test cluster into one with a working csi-ceph `StorageClass`. Steps: (1) enable `sds-node-configurator`, `sds-elastic`, `csi-ceph` modules and wait Ready; (2) optionally call `EnsureDefaultStorageClass` to auto-provision a sds-local-volume SC for OSDs when `OSDStorageClass` is empty; (3) seed `rook-config-override` with `GlobalCephConfigOverrides` (e.g. `ms_crc_data=false`); (4) create Rook `CephCluster` and wait Created; (5) create the backing pool primitive — `CephBlockPool` (when `Type == "RBD"`, default) or `CephFilesystem` (when `Type == "CephFS"`) — and wait Ready; (6) read fsid/monitors/admin-key from Rook-managed secrets; (7) wire csi-ceph by creating `CephClusterAuthentication` + `CephClusterConnection`; (8) create the matching `CephStorageClass` (RBD pool or `<fsName>-<dataPoolName>` for CephFS) and wait for the backing core StorageClass. Idempotent; returns the resulting StorageClass name.
 - `EnsureDefaultCephStorageClass(ctx, kubeconfig, cfg)` — `EnsureCephStorageClass` + `SetGlobalDefaultStorageClass` so new PVCs without an explicit `storageClassName` use the provisioned Ceph (RBD or CephFS) class.
-- `TeardownCephStorageClass(ctx, kubeconfig, cfg)` — Reverse of `EnsureCephStorageClass`. Deletes the `CephStorageClass`, `CephClusterConnection`, `CephClusterAuthentication`, and the `CephBlockPool` / `CephFilesystem` matching `cfg.Type`. Also removes the `CephCluster` and `rook-config-override` ConfigMap unless `SkipClusterTeardown` is set (use that flag when several StorageClasses share one `CephCluster` and only the last teardown should drop the cluster). NotFound is treated as success; the first error is returned but later deletions are still attempted.
+- `TeardownCephStorageClass(ctx, kubeconfig, cfg)` — Reverse of `EnsureCephStorageClass`. After every Delete it now waits for the CR to be fully GC'd via the matching `WaitForXxxGone` helper. Order is: `CephStorageClass` → `CephClusterConnection` → `CephClusterAuthentication` → (`CephBlockPool` or `CephFilesystem` per `cfg.Type`) → `CephCluster` (unless `SkipClusterTeardown`) → `rook-config-override` ConfigMap. Without those waits the parent `CephCluster` would be deleted before its dependents are gone, Rook would record `DeletionIsBlocked / ObjectHasDependents`, and the next test run would either find a stuck Terminating CR or hang in `WaitForCephClusterReady`. Fail-fast on a Wait*Gone timeout: errors are aggregated and returned, no auto-strip of finalizers — investigate the cluster manually before re-running. NotFound is still treated as success; subsequent deletions are still attempted on partial failures.
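+
+A typical suite wires these as a matched pair. A minimal sketch, assuming a plain `testing.T` harness (illustrative; the real suites use Ginkgo) and a hypothetical `cephCfg` value — see `pkg/testkit/ceph.go` for the actual config type:
+
+```go
+scName, err := testkit.EnsureCephStorageClass(ctx, cfg, cephCfg)
+if err != nil {
+	t.Fatal(err)
+}
+t.Logf("provisioned StorageClass %s", scName)
+defer func() {
+	// Blocks on every Wait*Gone; a timeout surfaces as an aggregated error
+	// instead of silently leaving Terminating CRs for the next run.
+	if terr := testkit.TeardownCephStorageClass(ctx, cfg, cephCfg); terr != nil {
+		t.Error(terr)
+	}
+}()
+```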
## Ceph Cluster (Testkit) — no csi-ceph wiring diff --git a/pkg/kubernetes/cephblockpool.go b/pkg/kubernetes/cephblockpool.go index 98dba64..8ad2dfc 100644 --- a/pkg/kubernetes/cephblockpool.go +++ b/pkg/kubernetes/cephblockpool.go @@ -150,6 +150,9 @@ func CreateCephBlockPool(ctx context.Context, kubeconfig *rest.Config, cfg CephB if err != nil { return fmt.Errorf("failed to fetch existing CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) } + if err := errIfTerminating(existing, "CephBlockPool", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } existing.Object["spec"] = spec if _, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("failed to update CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) @@ -180,7 +183,11 @@ func WaitForCephBlockPoolReady(ctx context.Context, kubeconfig *rest.Config, nam } // DeleteCephBlockPool deletes a CephBlockPool. Safe to call if the pool does -// not exist. +// not exist. NOTE: this is fire-and-forget — the API call returns as soon as +// the apiserver accepts the request, but Rook may still be running its +// finalizer (`cephblockpool.ceph.rook.io`) for a few minutes afterwards. If +// you want to be certain the CR is fully gone before continuing, follow up +// with WaitForCephBlockPoolGone. func DeleteCephBlockPool(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -196,3 +203,23 @@ func DeleteCephBlockPool(ctx context.Context, kubeconfig *rest.Config, namespace logger.Info("Deleted CephBlockPool %s/%s", namespace, name) return nil } + +// CephBlockPoolGoneTimeout is the default budget for WaitForCephBlockPoolGone. +// Rook removes the underlying RBD pool from Ceph before lifting the +// finalizer; with one OSD the pool delete normally completes in seconds but +// can take a few minutes if the cluster is unhealthy. +const CephBlockPoolGoneTimeout = 5 * time.Minute + +// WaitForCephBlockPoolGone polls until the CephBlockPool is fully GC'd by +// Kubernetes (GET returns NotFound). Use this after DeleteCephBlockPool to +// be sure the parent CephCluster won't be blocked by `ObjectHasDependents` +// when it gets deleted next. 
+func WaitForCephBlockPoolGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephBlockPoolGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephBlockPoolGVR, namespace, name, + timeout, PollTickInterval, "CephBlockPool", + ) +} diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go index 9aa87ae..501d8d8 100644 --- a/pkg/kubernetes/cephcluster.go +++ b/pkg/kubernetes/cephcluster.go @@ -193,6 +193,9 @@ func CreateCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg CephClu if err != nil { return fmt.Errorf("failed to fetch existing CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) } + if err := errIfTerminating(existing, "CephCluster", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } existing.Object["spec"] = spec if _, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("failed to update CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) @@ -359,6 +362,16 @@ func WaitForCephClusterReady(ctx context.Context, kubeconfig *rest.Config, names // `dataDirHostPath` and operator-managed PVCs will not be garbage-collected // automatically. The operation is still idempotent: a NotFound error is // swallowed. +// +// NOTE: this is fire-and-forget. The apiserver returns success as soon as it +// records the delete intent; Rook then runs its `cephcluster.ceph.rook.io` +// finalizer for several minutes, removing pools, mon/mgr/osd pods, and so +// on. If any dependent CR (CephBlockPool, CephFilesystem, ...) is still +// alive, Rook records `DeletionIsBlocked / ObjectHasDependents` and the CR +// stays in `phase=Deleting` indefinitely. Always tear down dependents first +// (and call WaitForCephBlockPoolGone / WaitForCephFilesystemGone on them) +// before invoking DeleteCephCluster, then follow up with +// WaitForCephClusterGone. func DeleteCephCluster(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -374,3 +387,25 @@ func DeleteCephCluster(ctx context.Context, kubeconfig *rest.Config, namespace, logger.Info("Deleted CephCluster %s/%s", namespace, name) return nil } + +// CephClusterGoneTimeout is the default budget for WaitForCephClusterGone. +// Rook needs to drain mon/mgr/osd pods, remove the CRUSH map, and unset +// finalizers — easily 5+ minutes on a single-OSD cluster, longer on +// degraded ones. +const CephClusterGoneTimeout = 10 * time.Minute + +// WaitForCephClusterGone polls until the CephCluster is fully GC'd by +// Kubernetes (GET returns NotFound). The poller logs the +// deletionTimestamp/finalizers progress periodically so a stuck finalizer +// (typical e2e failure: orphan dependent CR, broken Ceph health) is +// immediately visible in the test log instead of being hidden behind a +// silent timeout. 
+func WaitForCephClusterGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterGVR, namespace, name, + timeout, PollTickInterval, "CephCluster", + ) +} diff --git a/pkg/kubernetes/cephclusterconnection.go b/pkg/kubernetes/cephclusterconnection.go index 3110cfb..f8117db 100644 --- a/pkg/kubernetes/cephclusterconnection.go +++ b/pkg/kubernetes/cephclusterconnection.go @@ -103,6 +103,9 @@ func CreateCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Confi if err != nil { return fmt.Errorf("failed to fetch CephClusterAuthentication %s: %w", cfg.Name, err) } + if err := errIfTerminating(existing, "CephClusterAuthentication", cfg.Name); err != nil { + return err + } existing.Object["spec"] = obj.Object["spec"] if _, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("failed to update CephClusterAuthentication %s: %w", cfg.Name, err) @@ -111,7 +114,8 @@ func CreateCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Confi } // DeleteCephClusterAuthentication removes a CephClusterAuthentication. -// NotFound is treated as success. +// NotFound is treated as success. Pair with WaitForCephClusterAuthenticationGone +// when teardown order matters. func DeleteCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, name string) error { dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -127,6 +131,22 @@ func DeleteCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Confi return nil } +// CephClusterAuthenticationGoneTimeout is the default budget for +// WaitForCephClusterAuthenticationGone. The CR has no heavy finalizer. +const CephClusterAuthenticationGoneTimeout = 1 * time.Minute + +// WaitForCephClusterAuthenticationGone polls until the CephClusterAuthentication +// is fully GC'd by Kubernetes (GET returns NotFound). +func WaitForCephClusterAuthenticationGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterAuthenticationGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterAuthenticationGVR, "", name, + timeout, PollTickInterval, "CephClusterAuthentication", + ) +} + // CephClusterConnectionConfig describes a csi-ceph CephClusterConnection CR. // Its spec.clusterID (== Ceph fsid) is immutable once created. type CephClusterConnectionConfig struct { @@ -196,6 +216,9 @@ func CreateCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, c if err != nil { return fmt.Errorf("failed to fetch CephClusterConnection %s: %w", cfg.Name, err) } + if err := errIfTerminating(existing, "CephClusterConnection", cfg.Name); err != nil { + return err + } if err := unstructured.SetNestedSlice(existing.Object, monitors, "spec", "monitors"); err != nil { return fmt.Errorf("set monitors: %w", err) } @@ -212,7 +235,8 @@ func CreateCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, c } // DeleteCephClusterConnection removes a CephClusterConnection. -// NotFound is treated as success. +// NotFound is treated as success. Pair with WaitForCephClusterConnectionGone +// when teardown order matters. 
func DeleteCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, name string) error { dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -228,6 +252,22 @@ func DeleteCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, n return nil } +// CephClusterConnectionGoneTimeout is the default budget for +// WaitForCephClusterConnectionGone. The CR has no heavy finalizer. +const CephClusterConnectionGoneTimeout = 1 * time.Minute + +// WaitForCephClusterConnectionGone polls until the CephClusterConnection is +// fully GC'd by Kubernetes (GET returns NotFound). +func WaitForCephClusterConnectionGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterConnectionGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterConnectionGVR, "", name, + timeout, PollTickInterval, "CephClusterConnection", + ) +} + // WaitForCephClusterConnectionCreated polls until the CephClusterConnection // status reports phase=Created. csi-ceph's controller flips the status from // Pending to Created once it has verified the supplied fsid / monitors / diff --git a/pkg/kubernetes/cephfilesystem.go b/pkg/kubernetes/cephfilesystem.go index bb185ee..91fab14 100644 --- a/pkg/kubernetes/cephfilesystem.go +++ b/pkg/kubernetes/cephfilesystem.go @@ -178,6 +178,9 @@ func CreateCephFilesystem(ctx context.Context, kubeconfig *rest.Config, cfg Ceph if err != nil { return fmt.Errorf("failed to fetch existing CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) } + if err := errIfTerminating(existing, "CephFilesystem", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } existing.Object["spec"] = spec if _, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("failed to update CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) @@ -230,7 +233,11 @@ func cephFilesystemReadyByCondition(obj map[string]interface{}) bool { } // DeleteCephFilesystem deletes a CephFilesystem. Safe to call if the -// filesystem does not exist. +// filesystem does not exist. NOTE: fire-and-forget — Rook's +// `cephfilesystem.ceph.rook.io` finalizer takes time to detach the MDS +// daemons and remove the metadata/data pools. Pair with +// WaitForCephFilesystemGone if you need to know the CR has actually been +// GC'd before doing something else (e.g. deleting the parent CephCluster). func DeleteCephFilesystem(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -246,3 +253,22 @@ func DeleteCephFilesystem(ctx context.Context, kubeconfig *rest.Config, namespac logger.Info("Deleted CephFilesystem %s/%s", namespace, name) return nil } + +// CephFilesystemGoneTimeout is the default budget for WaitForCephFilesystemGone. +// MDS shutdown + pool removal usually settles in 1-2 minutes; we allow more +// to absorb operator restarts and slow Ceph mons. +const CephFilesystemGoneTimeout = 5 * time.Minute + +// WaitForCephFilesystemGone polls until the CephFilesystem is fully GC'd by +// Kubernetes (GET returns NotFound). Use this after DeleteCephFilesystem to +// be sure the parent CephCluster's deletion won't be blocked by +// `ObjectHasDependents`. 
+func WaitForCephFilesystemGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephFilesystemGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephFilesystemGVR, namespace, name, + timeout, PollTickInterval, "CephFilesystem", + ) +} diff --git a/pkg/kubernetes/cephstorageclass.go b/pkg/kubernetes/cephstorageclass.go index 6bf256c..942dd49 100644 --- a/pkg/kubernetes/cephstorageclass.go +++ b/pkg/kubernetes/cephstorageclass.go @@ -161,6 +161,9 @@ func CreateCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce if err != nil { return fmt.Errorf("failed to fetch CephStorageClass %s: %w", cfg.Name, err) } + if err := errIfTerminating(existing, "CephStorageClass", cfg.Name); err != nil { + return err + } existing.Object["spec"] = spec if _, err := dynamicClient.Resource(CephStorageClassGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("failed to update CephStorageClass %s: %w", cfg.Name, err) @@ -170,7 +173,8 @@ func CreateCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce // DeleteCephStorageClass removes a CephStorageClass. NotFound is treated as // success. The underlying k8s StorageClass is removed by the csi-ceph -// controller as a side effect. +// controller as a side effect. Use WaitForCephStorageClassGone to confirm +// the CR is fully GC'd. func DeleteCephStorageClass(ctx context.Context, kubeconfig *rest.Config, name string) error { dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -186,6 +190,24 @@ func DeleteCephStorageClass(ctx context.Context, kubeconfig *rest.Config, name s return nil } +// CephStorageClassGoneTimeout is the default budget for +// WaitForCephStorageClassGone. CephStorageClass has no heavyweight finalizer +// (csi-ceph just deletes the backing k8s StorageClass), so this typically +// completes in seconds. +const CephStorageClassGoneTimeout = 1 * time.Minute + +// WaitForCephStorageClassGone polls until the CephStorageClass is fully GC'd +// by Kubernetes (GET returns NotFound). +func WaitForCephStorageClassGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephStorageClassGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephStorageClassGVR, "", name, + timeout, PollTickInterval, "CephStorageClass", + ) +} + // WaitForCephStorageClassCreated polls until the CephStorageClass status // reports phase=Created (the csi-ceph controller flips this once the backing // k8s StorageClass has been provisioned). 
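
(For orientation between the per-CR helpers above and the poller below, the
full CR lifecycle a test exercises; a sketch only, `scCfg` and its fields are
hypothetical:)

    if err := kubernetes.CreateCephStorageClass(ctx, cfg, scCfg); err != nil {
        return err
    }
    // csi-ceph flips status.phase to Created once the backing k8s
    // StorageClass has been provisioned.
    if err := kubernetes.WaitForCephStorageClassCreated(ctx, cfg, scCfg.Name, 2*time.Minute); err != nil {
        return err
    }
    // ... test body ...
    if err := kubernetes.DeleteCephStorageClass(ctx, cfg, scCfg.Name); err != nil {
        return err
    }
    return kubernetes.WaitForCephStorageClassGone(ctx, cfg, scCfg.Name, 0) // 0 selects the 1m default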
diff --git a/pkg/kubernetes/poll.go b/pkg/kubernetes/poll.go index 6cd4ba4..4fc833f 100644 --- a/pkg/kubernetes/poll.go +++ b/pkg/kubernetes/poll.go @@ -77,8 +77,8 @@ func pollResourceUntilReady( resourceLabel string, isReady func(obj *unstructured.Unstructured) (ready bool, reason string), ) error { - if namespace == "" || name == "" { - return fmt.Errorf("namespace and name are required") + if name == "" { + return fmt.Errorf("name is required") } if isReady == nil { return fmt.Errorf("isReady is required") @@ -87,7 +87,8 @@ func pollResourceUntilReady( tickInterval = PollTickInterval } - logger.Debug("Waiting for %s %s/%s to become Ready (timeout: %v)", resourceLabel, namespace, name, readyTimeout) + ref := formatRef(namespace, name) + logger.Debug("Waiting for %s %s to become Ready (timeout: %v)", resourceLabel, ref, readyTimeout) dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) if err != nil { @@ -106,11 +107,26 @@ func pollResourceUntilReady( switch { case err == nil: consecutiveErrs = 0 + // Refuse to wait for Ready on a Terminating object. Without this + // short-circuit a stale `Deleting` CR (e.g. CephCluster left over + // by a previous run that didn't finish teardown) would keep us + // polling for the full readyTimeout: phase=Deleting never matches + // any "Ready" condition. Failing fast here gives the operator a + // chance to clean up (or strip finalizers) instead of hiding the + // real state of the cluster behind a 15-20 minute timeout. + if dt := obj.GetDeletionTimestamp(); dt != nil { + return fmt.Errorf( + "%s %s is being deleted (deletionTimestamp=%s, finalizers=%v); "+ + "refusing to wait for Ready on a Terminating object", + resourceLabel, ref, + dt.Format(time.RFC3339), obj.GetFinalizers(), + ) + } if ready, reason := isReady(obj); ready { if reason != "" { - logger.Success("%s %s/%s is Ready (%s)", resourceLabel, namespace, name, reason) + logger.Success("%s %s is Ready (%s)", resourceLabel, ref, reason) } else { - logger.Success("%s %s/%s is Ready", resourceLabel, namespace, name) + logger.Success("%s %s is Ready", resourceLabel, ref) } return nil } @@ -119,7 +135,7 @@ func pollResourceUntilReady( // without warning so we don't spam logs on healthy clusters that // just haven't observed the create yet. consecutiveErrs = 0 - logger.Debug("%s %s/%s not found yet", resourceLabel, namespace, name) + logger.Debug("%s %s not found yet", resourceLabel, ref) default: consecutiveErrs++ // Quiet the first two failures (spurious 5xx, leader re-election), @@ -128,25 +144,185 @@ func pollResourceUntilReady( // the readyTimeout to fire. if consecutiveErrs >= 3 { logger.Warn( - "%s %s/%s GET failed for %d consecutive iterations: %v", - resourceLabel, namespace, name, consecutiveErrs, err, + "%s %s GET failed for %d consecutive iterations: %v", + resourceLabel, ref, consecutiveErrs, err, ) } else { - logger.Debug("Error getting %s %s/%s: %v", resourceLabel, namespace, name, err) + logger.Debug("Error getting %s %s: %v", resourceLabel, ref, err) } } select { case <-deadlineCtx.Done(): - return fmt.Errorf("timeout waiting for %s %s/%s: %w", resourceLabel, namespace, name, deadlineCtx.Err()) + return fmt.Errorf("timeout waiting for %s %s: %w", resourceLabel, ref, deadlineCtx.Err()) case <-ticker.C: } } } +// PollGoneProgressEvery controls how often pollResourceUntilGone emits a +// progress INFO line while the resource is still alive. 
We don't want a log +// per tick (chatty) but we also don't want long stretches of silence when a +// finalizer is stuck for minutes — every ~30s strikes a balance. +const PollGoneProgressEvery = 30 * time.Second + +// pollResourceUntilGone polls a single namespaced unstructured resource +// until a GET returns NotFound (i.e. the API server has GC'd the object) or +// the parent timeout expires. +// +// Mirrors pollResourceUntilReady but with inverted success criterion. Three +// behaviors worth calling out: +// - per-call deadline (PollGetTimeout) on every Get; +// - WARN logs after a few consecutive non-NotFound errors so a dropped +// SSH tunnel surfaces in seconds rather than at the timeout; +// - periodic INFO progress log including the object's deletionTimestamp +// and finalizers — that's exactly the diagnostic info you need to know +// why Rook hasn't finished tearing the resource down. We avoid logging +// this on every tick (chatty) and instead emit at most once per +// PollGoneProgressEvery. +func pollResourceUntilGone( + ctx context.Context, + kubeconfig *rest.Config, + gvr schema.GroupVersionResource, + namespace, name string, + goneTimeout time.Duration, + tickInterval time.Duration, + resourceLabel string, +) error { + if name == "" { + return fmt.Errorf("name is required") + } + if tickInterval <= 0 { + tickInterval = PollTickInterval + } + + ref := formatRef(namespace, name) + logger.Debug("Waiting for %s %s to be gone (timeout: %v)", resourceLabel, ref, goneTimeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadlineCtx, cancel := context.WithTimeout(ctx, goneTimeout) + defer cancel() + + ticker := time.NewTicker(tickInterval) + defer ticker.Stop() + + var ( + consecutiveErrs int + lastProgress time.Time + lastFinalizers []string + lastDeletionTS string + ) + for { + obj, err := getWithTimeout(deadlineCtx, dynamicClient, gvr, namespace, name, PollGetTimeout) + switch { + case apierrors.IsNotFound(err): + logger.Success("%s %s is gone", resourceLabel, ref) + return nil + case err == nil: + consecutiveErrs = 0 + finalizers := obj.GetFinalizers() + deletionTS := "" + if dt := obj.GetDeletionTimestamp(); dt != nil { + deletionTS = dt.Format(time.RFC3339) + } + // Surface progress periodically OR whenever the visible state + // changes (finalizers list shrunk, deletionTimestamp finally + // appeared after a Delete request was missed, ...). + stateChanged := deletionTS != lastDeletionTS || !sameFinalizers(finalizers, lastFinalizers) + if stateChanged || time.Since(lastProgress) >= PollGoneProgressEvery { + if deletionTS == "" { + logger.Info("%s %s still alive (no deletionTimestamp yet, finalizers=%v)", + resourceLabel, ref, finalizers) + } else { + logger.Info("%s %s still terminating (deletionTimestamp=%s, finalizers=%v)", + resourceLabel, ref, deletionTS, finalizers) + } + lastProgress = time.Now() + lastFinalizers = append(lastFinalizers[:0], finalizers...) 
+ lastDeletionTS = deletionTS + } + default: + consecutiveErrs++ + if consecutiveErrs >= 3 { + logger.Warn( + "%s %s GET failed for %d consecutive iterations: %v", + resourceLabel, ref, consecutiveErrs, err, + ) + } else { + logger.Debug("Error getting %s %s: %v", resourceLabel, ref, err) + } + } + + select { + case <-deadlineCtx.Done(): + // Surface the last observed state in the timeout error so the + // caller (and the dev reading the test log) can immediately tell + // whether they're stuck on a finalizer, on a missing + // deletionTimestamp, or on a network issue. + lastSeen := "no observation yet" + if lastDeletionTS != "" || len(lastFinalizers) > 0 { + lastSeen = fmt.Sprintf("deletionTimestamp=%q, finalizers=%v", lastDeletionTS, lastFinalizers) + } + return fmt.Errorf("timeout waiting for %s %s to be gone (%s): %w", + resourceLabel, ref, lastSeen, deadlineCtx.Err()) + case <-ticker.C: + } + } +} + +// formatRef renders a resource reference as either "name" (cluster-scoped) +// or "namespace/name" (namespaced) for log lines and error messages. +func formatRef(namespace, name string) string { + if namespace == "" { + return name + } + return namespace + "/" + name +} + +// errIfTerminating returns a descriptive error if obj has a non-nil +// metadata.deletionTimestamp. Used by Create* helpers to fail-fast in the +// IsAlreadyExists branch when an existing CR is in `Terminating` state — +// updating its spec would be a no-op (the controller is busy unwinding the +// finalizer), and a follow-up Wait*Ready would hang forever because phase +// transitions never reach a Ready state on a Terminating object. +// +// `kind` is the human-readable kind ("CephCluster") and `ref` is the +// formatted "[namespace/]name" identifier. +func errIfTerminating(obj *unstructured.Unstructured, kind, ref string) error { + dt := obj.GetDeletionTimestamp() + if dt == nil { + return nil + } + return fmt.Errorf( + "%s %s exists but is being deleted (deletionTimestamp=%s, finalizers=%v); "+ + "wait for it to disappear or remove finalizers manually before re-running", + kind, ref, dt.Format(time.RFC3339), obj.GetFinalizers(), + ) +} + +// sameFinalizers returns true when both slices contain the same strings in +// the same order. Used by pollResourceUntilGone to decide if the visible +// state has changed. +func sameFinalizers(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + // getWithTimeout wraps dynamicClient.Get with a per-call deadline derived // from the parent context. The wrapper avoids leaking goroutines blocked on -// a dead TCP connection. +// a dead TCP connection. An empty namespace selects the cluster-scoped +// path (used by csi-ceph CRs like CephClusterConnection). 
func getWithTimeout( parent context.Context, dynamicClient dynamic.Interface, @@ -156,5 +332,8 @@ func getWithTimeout( ) (*unstructured.Unstructured, error) { callCtx, cancel := context.WithTimeout(parent, perCallTimeout) defer cancel() + if namespace == "" { + return dynamicClient.Resource(gvr).Get(callCtx, name, metav1.GetOptions{}) + } return dynamicClient.Resource(gvr).Namespace(namespace).Get(callCtx, name, metav1.GetOptions{}) } diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go index 9c6a08a..7427967 100644 --- a/pkg/testkit/ceph.go +++ b/pkg/testkit/ceph.go @@ -477,6 +477,21 @@ func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg Ce // call on partial state (missing resources are skipped — the first error is // returned but subsequent deletions are still attempted). // +// Each Delete is followed by a Wait*Gone that waits for the apiserver to +// actually GC the CR. Without this synchronization the next test run (in +// alwaysUseExisting mode, or a fresh bootstrap that re-creates the same +// namespace) would race against Rook's finalizer and either: +// - find the CR still in Terminating and try to update its spec (no-op +// while the controller unwinds the finalizer); +// - delete the parent CephCluster while a child CephBlockPool / +// CephFilesystem is still alive — Rook then sets `DeletionIsBlocked / +// ObjectHasDependents` and the CephCluster sticks in `phase=Deleting` +// forever. +// +// On a Wait*Gone timeout we DO NOT auto-strip finalizers: the failure is +// surfaced as an aggregated error so the operator can investigate the +// cluster (typical reasons: HEALTH_ERR Ceph, stuck OSD prepare, dead mgr). +// // It deliberately does NOT disable the Deckhouse modules: they may be owned // by the cluster admin, and re-bootstrapping is cheaper than a full // module-disable → module-enable cycle. @@ -495,17 +510,37 @@ func TeardownCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg } logger.Info("Tearing down csi-ceph StorageClass %q (type=%s)", cfg.StorageClassName, cfg.Type) + + // 1. CephStorageClass: leaf, no finalizer dependency on the rest. note(kubernetes.DeleteCephStorageClass(ctx, kubeconfig, cfg.StorageClassName), "delete CephStorageClass") + note(kubernetes.WaitForCephStorageClassGone(ctx, kubeconfig, cfg.StorageClassName, 0), "wait CephStorageClass gone") + + // 2. CephClusterConnection / CephClusterAuthentication: csi-ceph CRs. + // Order between conn and auth doesn't matter — neither depends on the + // other. note(kubernetes.DeleteCephClusterConnection(ctx, kubeconfig, cfg.ClusterConnectionName), "delete CephClusterConnection") + note(kubernetes.WaitForCephClusterConnectionGone(ctx, kubeconfig, cfg.ClusterConnectionName, 0), "wait CephClusterConnection gone") + note(kubernetes.DeleteCephClusterAuthentication(ctx, kubeconfig, cfg.ClusterAuthenticationName), "delete CephClusterAuthentication") + note(kubernetes.WaitForCephClusterAuthenticationGone(ctx, kubeconfig, cfg.ClusterAuthenticationName, 0), "wait CephClusterAuthentication gone") + + // 3. Pool / Filesystem: must be fully gone before deleting CephCluster, + // otherwise Rook records DeletionIsBlocked / ObjectHasDependents. 
 	switch cfg.Type {
 	case kubernetes.CephStorageClassTypeCephFS:
 		note(kubernetes.DeleteCephFilesystem(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName), "delete CephFilesystem")
+		note(kubernetes.WaitForCephFilesystemGone(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName, 0), "wait CephFilesystem gone")
 	default:
 		note(kubernetes.DeleteCephBlockPool(ctx, kubeconfig, cfg.Namespace, cfg.PoolName), "delete CephBlockPool")
+		note(kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, 0), "wait CephBlockPool gone")
 	}
+
+	// 4. CephCluster: only when this teardown call owns it (the other
+	// TeardownCephStorageClass call shares the same Rook cluster — see
+	// SkipClusterTeardown doc-comment).
 	if !cfg.SkipClusterTeardown {
 		note(kubernetes.DeleteCephCluster(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName), "delete CephCluster")
+		note(kubernetes.WaitForCephClusterGone(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, 0), "wait CephCluster gone")
 		note(kubernetes.DeleteRookConfigOverride(ctx, kubeconfig, cfg.Namespace), "delete rook-config-override")
 	} else {
 		logger.Info("Skipping CephCluster + rook-config-override teardown (SkipClusterTeardown=true)")

From 4bd9a28496c7b5b3621cfc3dcc5ca69ad8033085 Mon Sep 17 00:00:00 2001
From: Aleksandr Zimin
Date: Tue, 5 May 2026 13:53:32 +0300
Subject: [PATCH 09/14] Add pod-exec testkit primitives (with distroless support via ephemeral containers)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

storage-e2e had no pod-exec helpers at all (pkg/kubernetes/pod.go only
covers WaitFor*Ready). Each downstream test suite was forced to roll
its own — see csi-ceph/e2e/tests/e2e_shared_test.go::execInPod which
wraps remotecommand.NewSPDYExecutor and only works on containers that
have cat (i.e. test probe pods, not the actual distroless
csi-controllers). This commit lifts pod exec into the shared testkit
so any module's e2e suite can reuse it.

New file: pkg/kubernetes/pod_exec.go

- ExecInPod(ctx, kubeconfig, ns, pod, container, cmd) (stdout, stderr
  string, error). General SPDY exec on /pods/<pod>/exec. Returns
  stdout/stderr SEPARATELY (the csi-ceph copy concatenates them and
  loses signal).
- ReadFileFromPod(...) — ExecInPod + cat <path>. For containers that
  ship a real userland.
- ReadFileFromDistrolessPod(..., opts ReadFileOptions) — adds a
  short-lived ephemeral container with TargetContainerName set, polls
  until it goes Running, then cat /proc/1/root<path>.

The distroless path leans on Kubernetes Ephemeral Containers (GA since
1.25). They're added through the dedicated
/pods/<pod>/ephemeralcontainers subresource — NOT via the regular pod
PUT/PATCH path, which is why the apiserver explicitly allows this
mutation on a running pod and existing containers do NOT restart.
metadata.generation, spec.containers, pod sandbox UID and
ReplicaSet/DaemonSet observation all stay intact, so e2e suites that
subsequently assert on checksum/... annotations or rollout state see
a clean signal — the FS read does not contaminate it.

Caveat documented in the doc-comment: ephemeral containers cannot be
removed once added; sleep 60 lets the cat process exit on its own.
For long-running test suites the entry just stays as Terminated in
pod.status.ephemeralContainerStatuses until the next rollout recycles
the pod.

docs/FUNCTIONS_GLOSSARY.md gets a new entry under the Pod subsection
listing the three primitives with selection guidance (which to pick
for distroless vs. shell-bearing containers).
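
Example of the intended usage from a suite (a sketch; the namespace,
pod and container names are hypothetical):

    // Read the live ceph.conf out of a distroless csi controller.
    conf, err := kubernetes.ReadFileFromDistrolessPod(
        ctx, restCfg,
        "d8-csi-ceph", podName, "controller",
        "/etc/ceph/ceph.conf",
        kubernetes.ReadFileOptions{}, // zero value: busybox:1.36, 60 s startup budget
    )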
Signed-off-by: Aleksandr Zimin
---
 docs/FUNCTIONS_GLOSSARY.md |   6 +
 go.mod                     |   3 +
 go.sum                     |   5 +
 pkg/kubernetes/pod_exec.go | 294 +++++++++++++++++++++++++++++++++++++
 4 files changed, 308 insertions(+)
 create mode 100644 pkg/kubernetes/pod_exec.go

diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md
index 7aba661..de233ce 100644
--- a/docs/FUNCTIONS_GLOSSARY.md
+++ b/docs/FUNCTIONS_GLOSSARY.md
@@ -145,6 +145,12 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - `WaitForAllPodsReadyInNamespace(ctx, kubeconfig, namespace, timeout)` — Waits for all pods in a namespace to be in Ready condition.
 - `WaitForPodsStatus(ctx, clientset, namespace, labelSelector, status, expectedCount, maxAttempts, interval)` — Waits for pods matching a label selector to reach a specific status (Running, Completed, etc.).
 
+`pkg/kubernetes/pod_exec.go`
+
+- `ExecInPod(ctx, kubeconfig, namespace, pod, container, cmd) (stdout, stderr, err)` — Runs a command inside a container via the apiserver's `pods/exec` subresource (SPDY). Returns stdout and stderr separately; the container must ship every binary referenced by `cmd`. Use this when the container has a usable shell/userland.
+- `ReadFileFromPod(ctx, kubeconfig, namespace, pod, container, path)` — `ExecInPod` + `cat <path>`. Convenience wrapper for non-distroless images.
+- `ReadFileFromDistrolessPod(ctx, kubeconfig, namespace, pod, targetContainer, path, opts)` — Reads a file from a distroless / scratch container that ships no `cat`/`sh`/`tar`. Injects a short-lived ephemeral container (image from `opts.DebugImage`, defaults to `DefaultDebugImage = "busybox:1.36"`) with `targetContainerName=targetContainer`, polls until it goes Running (`opts.StartupTimeout`, defaults to 60s), then `cat /proc/1/root<path>` — `/proc/1/root` is the kernel-exposed FS root of PID 1 in the target container, which the ephemeral container can see thanks to the shared PID namespace. Adding the ephemeral container goes through the dedicated `/pods/<pod>/ephemeralcontainers` subresource, so existing containers and the pod sandbox are NOT restarted, `metadata.generation` is not bumped, and ReplicaSet/DaemonSet observation is unaffected — downstream rollout / `checksum/...` annotation assertions still see a clean signal. Caveat: ephemeral containers cannot be removed once added, but each call generates a unique name and the `sleep 60` command exits on its own; entries pile up in `pod.status.ephemeralContainerStatuses` until the next pod recycle.
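+
+Selection rule of thumb, as a sketch (the pod and container names are hypothetical):
+
+```go
+// Shell-bearing image: plain exec is enough.
+hostname, _, err := kubernetes.ExecInPod(ctx, cfg, ns, probePod, "probe",
+	[]string{"cat", "/etc/hostname"})
+
+// Distroless image: no cat on board, so go through an ephemeral container.
+cephConf, err2 := kubernetes.ReadFileFromDistrolessPod(ctx, cfg, ns, ctrlPod,
+	"csi-controller", "/etc/ceph/ceph.conf", kubernetes.ReadFileOptions{})
+```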
+ ## PVC (PersistentVolumeClaim) `pkg/kubernetes/pvc.go` diff --git a/go.mod b/go.mod index e8935c3..da7465b 100644 --- a/go.mod +++ b/go.mod @@ -35,13 +35,16 @@ require ( github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kr/fs v0.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/openshift/api v0.0.0-20230503133300-8bbcb7ca7183 // indirect github.com/openshift/custom-resource-status v1.1.2 // indirect github.com/pkg/errors v0.9.1 // indirect diff --git a/go.sum b/go.sum index eb9ac25..ba4b41a 100644 --- a/go.sum +++ b/go.sum @@ -107,6 +107,8 @@ github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2c github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -133,6 +135,8 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= +github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -143,6 +147,7 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWu github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f 
h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
 github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
 github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
 github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
diff --git a/pkg/kubernetes/pod_exec.go b/pkg/kubernetes/pod_exec.go
new file mode 100644
index 0000000..dcd27d1
--- /dev/null
+++ b/pkg/kubernetes/pod_exec.go
@@ -0,0 +1,294 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"bytes"
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/scheme"
+	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/remotecommand"
+)
+
+// DefaultDebugImage is the image ReadFileFromDistrolessPod injects as the
+// short-lived ephemeral container. busybox ships cat, sleep and a
+// minimal sh — exactly the toolset we need to read /proc/1/root<path>
+// in the target container's filesystem. Tests against an air-gapped
+// registry can override this via ReadFileOptions.DebugImage.
+const DefaultDebugImage = "busybox:1.36"
+
+// DefaultEphemeralStartupTimeout caps the wait for the injected
+// ephemeral container to transition into Running. Image pull from a
+// warm registry usually takes a couple of seconds; 60 s is a generous
+// upper bound that still surfaces ImagePullBackOff/ErrImagePull early.
+const DefaultEphemeralStartupTimeout = 60 * time.Second
+
+// ephemeralPollInterval is how often we re-Get the pod when waiting for
+// the ephemeral container to start. 500 ms is a deliberate compromise:
+// fast enough that the typical 1-3 s pull is observed promptly, slow
+// enough that we don't hammer the apiserver.
+const ephemeralPollInterval = 500 * time.Millisecond
+
+// ReadFileOptions tunes ReadFileFromDistrolessPod.
+type ReadFileOptions struct {
+	// DebugImage overrides the ephemeral container image. Defaults to
+	// DefaultDebugImage. Use this on air-gapped clusters to point at an
+	// internal mirror.
+	DebugImage string
+	// StartupTimeout caps the wait for the ephemeral container to reach
+	// state.Running. Defaults to DefaultEphemeralStartupTimeout.
+	StartupTimeout time.Duration
+}
+
+// ExecInPod runs cmd inside container of pod namespace/pod via the
+// apiserver's pods/exec subresource and returns stdout and stderr
+// separately, plus any transport- or exec-level error.
+//
+// The container must ship every binary referenced by cmd; ExecInPod does
+// NOT inject any helper. For distroless containers without cat / sh,
+// see ReadFileFromDistrolessPod.
+func ExecInPod(
+	ctx context.Context,
+	kubeconfig *rest.Config,
+	namespace, pod, container string,
+	cmd []string,
+) (stdout, stderr string, err error) {
+	clientset, err := NewClientsetWithRetry(ctx, kubeconfig)
+	if err != nil {
+		return "", "", fmt.Errorf("create clientset: %w", err)
+	}
+
+	req := clientset.CoreV1().RESTClient().Post().
+		Resource("pods").
+		Name(pod).
+		Namespace(namespace).
+		SubResource("exec").
+		VersionedParams(&corev1.PodExecOptions{
+			Container: container,
+			Command:   cmd,
+			Stdout:    true,
+			Stderr:    true,
+		}, scheme.ParameterCodec)
+
+	executor, err := remotecommand.NewSPDYExecutor(kubeconfig, "POST", req.URL())
+	if err != nil {
+		return "", "", fmt.Errorf("create SPDY executor for %s/%s[%s]: %w",
+			namespace, pod, container, err)
+	}
+
+	var stdoutBuf, stderrBuf bytes.Buffer
+	err = executor.StreamWithContext(ctx, remotecommand.StreamOptions{
+		Stdout: &stdoutBuf,
+		Stderr: &stderrBuf,
+	})
+	stdout = stdoutBuf.String()
+	stderr = stderrBuf.String()
+	if err != nil {
+		return stdout, stderr, fmt.Errorf("exec %v in %s/%s[%s]: %w (stderr=%q)",
+			cmd, namespace, pod, container, err, stderr)
+	}
+	return stdout, stderr, nil
+}
+
+// ReadFileFromPod cat's `path` from inside `container` of pod
+// `namespace/pod`. Equivalent to `kubectl exec <pod> -c <container> --
+// cat <path>`, with stderr surfaced as part of the error if non-empty.
+//
+// Requires the container image to ship cat. For distroless / scratch
+// images, use ReadFileFromDistrolessPod.
+func ReadFileFromPod(
+	ctx context.Context,
+	kubeconfig *rest.Config,
+	namespace, pod, container, path string,
+) (string, error) {
+	stdout, stderr, err := ExecInPod(ctx, kubeconfig, namespace, pod, container, []string{"cat", path})
+	if err != nil {
+		return stdout, err
+	}
+	if stderr != "" {
+		return stdout, fmt.Errorf("cat %s in %s/%s[%s] reported stderr: %s",
+			path, namespace, pod, container, stderr)
+	}
+	return stdout, nil
+}
+
+// ReadFileFromDistrolessPod reads `path` from inside `targetContainer`
+// of pod `namespace/pod` even when targetContainer ships no shell, no
+// cat and no tar — i.e. a distroless or scratch image like
+// csi-controller. It does so by injecting a short-lived ephemeral
+// container (TargetContainerName=targetContainer, which gives it a
+// shared PID namespace with the target) and then catting
+// /proc/1/root<path>. /proc/1 is PID 1 inside the target container's
+// PID namespace, and /proc/<pid>/root is the well-known kernel-exposed
+// view of that process's filesystem root.
+//
+// Why this does NOT restart the target pod or any of its containers:
+//
+// - Ephemeral containers are added through the dedicated
+//   /pods/<pod>/ephemeralcontainers subresource (UpdateEphemeralContainers
+//   in client-go). The apiserver explicitly allows this mutation on a
+//   running pod; the ordinary pod PUT/PATCH path that would trigger
+//   re-creation is bypassed entirely. Without this dedicated path,
+//   adding a container to a live pod would be flat-out forbidden.
+// - metadata.generation, spec.containers, the pod sandbox UID and the
+//   ReplicaSet/DaemonSet observation all stay intact. The kubelet
+//   simply launches the new container in the existing pod sandbox
+//   without disturbing existing containers. Workload-controller
+//   rollouts and pod-template `checksum/...` annotations are not
+//   affected, so e2e suites that subsequently assert on rollout
+//   state see a clean signal — the FS read does not contaminate it.
+// - Ephemeral containers are forbidden from declaring ports, probes, +// lifecycle hooks or resources, which guarantees the inject is a +// cheap no-op for the pod's lifecycle. +// +// Caveat: ephemeral containers cannot be removed once added. The cat +// process exits with the container after `sleep 60`, but the entry +// remains in pod.spec.ephemeralContainers and +// pod.status.ephemeralContainerStatuses (state=Terminated). For +// long-running suites those entries simply pile up until the next pod +// recycle. Each invocation here generates a unique container name, so +// repeat calls against the same pod are safe. +func ReadFileFromDistrolessPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, targetContainer, path string, + opts ReadFileOptions, +) (string, error) { + if opts.DebugImage == "" { + opts.DebugImage = DefaultDebugImage + } + if opts.StartupTimeout <= 0 { + opts.StartupTimeout = DefaultEphemeralStartupTimeout + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return "", fmt.Errorf("create clientset: %w", err) + } + pods := clientset.CoreV1().Pods(namespace) + + ecName, err := randomEphemeralName("filereader-") + if err != nil { + return "", fmt.Errorf("generate ephemeral container name: %w", err) + } + + livePod, err := pods.Get(ctx, pod, metav1.GetOptions{}) + if err != nil { + return "", fmt.Errorf("get pod %s/%s: %w", namespace, pod, err) + } + livePod.Spec.EphemeralContainers = append(livePod.Spec.EphemeralContainers, corev1.EphemeralContainer{ + EphemeralContainerCommon: corev1.EphemeralContainerCommon{ + Name: ecName, + Image: opts.DebugImage, + Command: []string{"sleep", "60"}, + ImagePullPolicy: corev1.PullIfNotPresent, + TerminationMessagePolicy: corev1.TerminationMessageReadFile, + }, + TargetContainerName: targetContainer, + }) + if _, err := pods.UpdateEphemeralContainers(ctx, pod, livePod, metav1.UpdateOptions{}); err != nil { + return "", fmt.Errorf("inject ephemeral container %q into %s/%s: %w", + ecName, namespace, pod, err) + } + + if err := waitEphemeralContainerRunning(ctx, pods, pod, ecName, opts.StartupTimeout); err != nil { + return "", err + } + + stdout, stderr, err := ExecInPod(ctx, kubeconfig, namespace, pod, ecName, []string{"cat", "/proc/1/root" + path}) + if err != nil { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: %w", + path, namespace, pod, targetContainer, ecName, err) + } + if stderr != "" { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: stderr=%s", + path, namespace, pod, targetContainer, ecName, stderr) + } + return stdout, nil +} + +// waitEphemeralContainerRunning polls pod.status.ephemeralContainerStatuses +// until the container with name ecName reports state.Running != nil. +// Returns immediately on Terminated / hard pull failures so tests don't +// have to sit through the full timeout when the debug image is +// unreachable. 
+func waitEphemeralContainerRunning( + ctx context.Context, + pods typedcorev1.PodInterface, + podName, ecName string, + timeout time.Duration, +) error { + deadlineCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + ticker := time.NewTicker(ephemeralPollInterval) + defer ticker.Stop() + + for { + p, getErr := pods.Get(deadlineCtx, podName, metav1.GetOptions{}) + switch { + case apierrors.IsNotFound(getErr): + return fmt.Errorf("pod %s disappeared while waiting for ephemeral container %q", + podName, ecName) + case getErr == nil: + for _, st := range p.Status.EphemeralContainerStatuses { + if st.Name != ecName { + continue + } + if st.State.Running != nil { + return nil + } + if st.State.Terminated != nil { + return fmt.Errorf("ephemeral container %q in pod %s terminated before exec: reason=%s exitCode=%d", + ecName, podName, + st.State.Terminated.Reason, st.State.Terminated.ExitCode) + } + if w := st.State.Waiting; w != nil && (w.Reason == "ImagePullBackOff" || w.Reason == "ErrImagePull") { + return fmt.Errorf("ephemeral container %q in pod %s cannot start: %s: %s", + ecName, podName, w.Reason, w.Message) + } + } + } + + select { + case <-deadlineCtx.Done(): + return fmt.Errorf("timeout (%s) waiting for ephemeral container %q in pod %s to be Running", + timeout, ecName, podName) + case <-ticker.C: + } + } +} + +// randomEphemeralName returns prefix + 8 hex chars from crypto/rand. +// Sufficient entropy for uniqueness across a single test run; we don't +// need cryptographic strength but crypto/rand keeps us out of math/rand +// seeding pitfalls. +func randomEphemeralName(prefix string) (string, error) { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + return "", err + } + return prefix + hex.EncodeToString(b[:]), nil +} From 235ccb84ddddbbeabc54a635a07d4b48786afa94 Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Tue, 5 May 2026 17:03:24 +0300 Subject: [PATCH 10/14] Add DistrolessReader for cheap repeated reads from distroless pods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ReadFileFromDistrolessPod was designed as one-shot: every call injects a fresh ephemeral container, waits for the kubelet to launch it, runs cat once, and exits. That's fine for diagnostics, but makes Eventually-style polling loops painfully slow — each iteration pays the full ephemeral-container cold-start cost (~10-20 s for kubelet to launch a new container in the existing pod sandbox), so a "predicate matches in 30 s" case can spend 2+ minutes inside the loop. A real trace from the msCrcData matrix shows ~127 s for an rbd FS-poll that should have settled in well under a minute. This commit splits the helper into a session API: - OpenDistrolessReader(...) injects ONE ephemeral container with a long sleep (default 30 minutes via opts.SessionTTL), waits for it to go Running, and returns a DistrolessReader bound to that ephemeral container. - DistrolessReader.ReadFile(ctx, path) is just a pods/exec round-trip into the already-running ephemeral container — sub-second. - ReadFileFromDistrolessPod is now a thin wrapper (open + read) for one-shot callers. Behaviour is unchanged from their perspective, but ReadFileOptions grows a SessionTTL field used by the session path. Reader API is what callers running poll loops should use; the single-shot helper stays for the one-shot diagnostics case. 
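To make the intended usage concrete, a poll loop under the session API
looks roughly like this (a sketch only; pod/container names and the
match condition are invented for the example, not lifted from a suite):

    reader, err := kubernetes.OpenDistrolessReader(ctx, cfg,
        "d8-sds-elastic", csiPodName, "csi-rbdplugin",
        kubernetes.ReadFileOptions{})
    if err != nil {
        return err
    }
    for { // each iteration is now a plain pods/exec round-trip
        conf, err := reader.ReadFile(ctx, "/etc/ceph/ceph.conf")
        if err == nil && strings.Contains(conf, "ms_crc_data = false") {
            break
        }
        time.Sleep(2 * time.Second)
    }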
The reader cannot outlive its target pod — there's no Close() because
Kubernetes does not allow removing an ephemeral container, and a pod
recycle (rollout) drops the entry along with the rest of the pod
status. Callers that need fresh sessions across pod identities should
re-open against the new pod (DistrolessReader.PodName helps detect
this).

Signed-off-by: Aleksandr Zimin
---
 pkg/kubernetes/pod_exec.go | 132 +++++++++++++++++++++++++++++++------
 1 file changed, 113 insertions(+), 19 deletions(-)

diff --git a/pkg/kubernetes/pod_exec.go b/pkg/kubernetes/pod_exec.go
index dcd27d1..92297a0 100644
--- a/pkg/kubernetes/pod_exec.go
+++ b/pkg/kubernetes/pod_exec.go
@@ -46,13 +46,20 @@ const DefaultDebugImage = "busybox:1.36"
 // upper bound that still surfaces ImagePullBackOff/ErrImagePull early.
 const DefaultEphemeralStartupTimeout = 60 * time.Second
 
+// DefaultDistrolessSessionTTL is the lifetime of the `sleep` process
+// inside the injected ephemeral container when used as a long-lived
+// reader session (OpenDistrolessReader / DistrolessReader.ReadFile).
+// 30 minutes comfortably outlasts any single test cell while still
+// guaranteeing eventual self-cleanup if the caller crashes.
+const DefaultDistrolessSessionTTL = 30 * time.Minute
+
 // ephemeralPollInterval is how often we re-Get the pod when waiting for
 // the ephemeral container to start. 500 ms is a deliberate compromise:
 // fast enough that the typical 1-3 s pull is observed promptly, slow
 // enough that we don't hammer the apiserver.
 const ephemeralPollInterval = 500 * time.Millisecond
 
-// ReadFileOptions tunes ReadFileFromDistrolessPod.
+// ReadFileOptions tunes ReadFileFromDistrolessPod and OpenDistrolessReader.
 type ReadFileOptions struct {
 	// DebugImage overrides the ephemeral container image. Defaults to
 	// DefaultDebugImage. Use this on air-gapped clusters to point at an
@@ -61,6 +68,12 @@ type ReadFileOptions struct {
 	// StartupTimeout caps the wait for the ephemeral container to reach
 	// state.Running. Defaults to DefaultEphemeralStartupTimeout.
 	StartupTimeout time.Duration
+	// SessionTTL controls how long the injected ephemeral container's
+	// `sleep` process stays alive. Defaults to DefaultDistrolessSessionTTL.
+	// Used by OpenDistrolessReader; ReadFileFromDistrolessPod does not
+	// rely on this value (the entry's status flip after the cat exits
+	// has no effect on the pod).
+	SessionTTL time.Duration
 }
 
 // ExecInPod runs cmd inside container of pod namespace/pod via the
@@ -165,69 +178,150 @@ func ReadFileFromPod(
 // cheap no-op for the pod's lifecycle.
 //
 // Caveat: ephemeral containers cannot be removed once added. The cat
-// process exits with the container after `sleep 60`, but the entry
-// remains in pod.spec.ephemeralContainers and
+// process exits with the container after `sleep`, but the entry remains
+// in pod.spec.ephemeralContainers and
 // pod.status.ephemeralContainerStatuses (state=Terminated). For
 // long-running suites those entries simply pile up until the next pod
 // recycle. Each invocation here generates a unique container name, so
 // repeat calls against the same pod are safe.
+//
+// For polling loops or any scenario that reads the same pod multiple
+// times, prefer OpenDistrolessReader: each ReadFileFromDistrolessPod
+// call pays the full ephemeral-container cold-start cost (~10–20 s for
+// kubelet to launch a new container in the existing pod sandbox), and
+// that cost dominates the runtime of an Eventually-style poll.
func ReadFileFromDistrolessPod( ctx context.Context, kubeconfig *rest.Config, namespace, pod, targetContainer, path string, opts ReadFileOptions, ) (string, error) { + r, err := OpenDistrolessReader(ctx, kubeconfig, namespace, pod, targetContainer, opts) + if err != nil { + return "", err + } + return r.ReadFile(ctx, path) +} + +// DistrolessReader is a long-lived ephemeral-container reader session +// against a single distroless pod. Open one with OpenDistrolessReader, +// then call ReadFile as many times as you need — each ReadFile is just +// an exec into the already-running ephemeral container (cheap), so a +// polling loop pays the ephemeral-container cold start ONCE instead of +// per-iteration. +// +// The session expires when the ephemeral container's `sleep` +// (opts.SessionTTL, default DefaultDistrolessSessionTTL) elapses; there +// is no Close — Kubernetes does not allow removing an ephemeral +// container — but the inert "Terminated" status entry has no effect on +// the pod. Callers that need fresh sessions across pod identities +// (e.g. after a workload rollout) should re-open against the new pod. +type DistrolessReader struct { + kubeconfig *rest.Config + namespace string + podName string + targetContainer string + ephemeralName string +} + +// PodName returns the name of the pod this reader is bound to. Useful +// for callers that need to detect rollouts (the pod name changes when +// the workload-controller recycles the pod) and re-open the session. +func (r *DistrolessReader) PodName() string { return r.podName } + +// EphemeralName returns the auto-generated name of the injected +// ephemeral container, mostly for logging. +func (r *DistrolessReader) EphemeralName() string { return r.ephemeralName } + +// ReadFile cat's `path` from inside the target container's filesystem +// (resolved through the ephemeral container's view of /proc/1/root). +// Cheap — just a pods/exec round-trip; no apiserver mutations. +func (r *DistrolessReader) ReadFile(ctx context.Context, path string) (string, error) { + stdout, stderr, err := ExecInPod(ctx, r.kubeconfig, r.namespace, r.podName, r.ephemeralName, + []string{"cat", "/proc/1/root" + path}) + if err != nil { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: %w", + path, r.namespace, r.podName, r.targetContainer, r.ephemeralName, err) + } + if stderr != "" { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: stderr=%s", + path, r.namespace, r.podName, r.targetContainer, r.ephemeralName, stderr) + } + return stdout, nil +} + +// OpenDistrolessReader injects a long-lived ephemeral container into +// the target pod and waits for it to become Running. The returned +// DistrolessReader can then be used for arbitrarily many cheap +// ReadFile calls until opts.SessionTTL elapses (default 30 minutes). +// +// Failure modes (returned as errors): pod not found, ephemeral +// container terminates before Running, image pull failure, startup +// timeout. On any of these no usable reader is returned. +// +// See ReadFileFromDistrolessPod for the rationale on why this does +// not restart the target pod or any of its existing containers. 
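+//
+//	// Illustrative sketch only; names are invented for the example:
+//	r, err := OpenDistrolessReader(ctx, cfg, ns, podName, "csi-rbdplugin",
+//		ReadFileOptions{SessionTTL: 10 * time.Minute})
+//	// After a rollout the live pod's name no longer matches r.PodName();
+//	// re-open the reader against the new pod.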
+func OpenDistrolessReader( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, targetContainer string, + opts ReadFileOptions, +) (*DistrolessReader, error) { if opts.DebugImage == "" { opts.DebugImage = DefaultDebugImage } if opts.StartupTimeout <= 0 { opts.StartupTimeout = DefaultEphemeralStartupTimeout } + if opts.SessionTTL <= 0 { + opts.SessionTTL = DefaultDistrolessSessionTTL + } clientset, err := NewClientsetWithRetry(ctx, kubeconfig) if err != nil { - return "", fmt.Errorf("create clientset: %w", err) + return nil, fmt.Errorf("create clientset: %w", err) } pods := clientset.CoreV1().Pods(namespace) ecName, err := randomEphemeralName("filereader-") if err != nil { - return "", fmt.Errorf("generate ephemeral container name: %w", err) + return nil, fmt.Errorf("generate ephemeral container name: %w", err) } livePod, err := pods.Get(ctx, pod, metav1.GetOptions{}) if err != nil { - return "", fmt.Errorf("get pod %s/%s: %w", namespace, pod, err) + return nil, fmt.Errorf("get pod %s/%s: %w", namespace, pod, err) + } + sleepSeconds := int64(opts.SessionTTL.Seconds()) + if sleepSeconds < 1 { + sleepSeconds = 1 } livePod.Spec.EphemeralContainers = append(livePod.Spec.EphemeralContainers, corev1.EphemeralContainer{ EphemeralContainerCommon: corev1.EphemeralContainerCommon{ Name: ecName, Image: opts.DebugImage, - Command: []string{"sleep", "60"}, + Command: []string{"sleep", fmt.Sprintf("%d", sleepSeconds)}, ImagePullPolicy: corev1.PullIfNotPresent, TerminationMessagePolicy: corev1.TerminationMessageReadFile, }, TargetContainerName: targetContainer, }) if _, err := pods.UpdateEphemeralContainers(ctx, pod, livePod, metav1.UpdateOptions{}); err != nil { - return "", fmt.Errorf("inject ephemeral container %q into %s/%s: %w", + return nil, fmt.Errorf("inject ephemeral container %q into %s/%s: %w", ecName, namespace, pod, err) } if err := waitEphemeralContainerRunning(ctx, pods, pod, ecName, opts.StartupTimeout); err != nil { - return "", err + return nil, err } - stdout, stderr, err := ExecInPod(ctx, kubeconfig, namespace, pod, ecName, []string{"cat", "/proc/1/root" + path}) - if err != nil { - return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: %w", - path, namespace, pod, targetContainer, ecName, err) - } - if stderr != "" { - return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: stderr=%s", - path, namespace, pod, targetContainer, ecName, stderr) - } - return stdout, nil + return &DistrolessReader{ + kubeconfig: kubeconfig, + namespace: namespace, + podName: pod, + targetContainer: targetContainer, + ephemeralName: ecName, + }, nil } // waitEphemeralContainerRunning polls pod.status.ephemeralContainerStatuses From d41d397697092be2c06e4642c136ea048c37a2c5 Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Tue, 5 May 2026 22:52:20 +0300 Subject: [PATCH 11/14] Bounce rook-mds + rook-operator on ms_crc_data flip; gate on CephFilesystem.Ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RestartCephDaemons used to rolling-restart only mon/mgr/osd, which left two classes of state stuck on the pre-flip ms_crc_data: 1. rook-ceph-mds: a CephFS daemon that talks to mons over the same messenger that ms_crc_data toggles CRC for. With mons on the new value and MDS still on the old one, the MDS↔mon channel silently desynchronises, CephFS goes degraded, and any csi-cephfs PVC hangs in Pending until somebody bounces MDS by hand. 
Reproduced reliably in the msCrcData matrix on cell `protocol=cephfs server=off client=off -> Bound`: PVC stuck for ~2 minutes, unstuck only after kubectl rollout restart of the d8-sds-elastic namespace. 2. The rook-operator pod: itself a Ceph admin client that uses an in-pod ceph.conf rendered at startup. Without a pod restart it keeps using the stale ms_crc_data and can't talk to the freshly- bounced mons, surfacing as cephcluster CR phase=Ready / state=Error / `failed to get status. . timed out` until the next reconcile after operator pod recycle. Fix: * Extend RestartCephDaemons selector to mon/mgr/osd/mds/rgw. rgw is pre-included for forward-compat with future S3 tests; absence is not an error. * Add RestartRookOperator helper that bounces the rook-operator Deployment and waits for Ready. Operator-Deployment name is derived from the namespace by stripping the leading `d8-` prefix (`d8-sds-elastic` → `sds-elastic`), matching how Deckhouse packages the operator binary as a per-module Helm release. Vanilla Rook (`rook-ceph-operator` in `rook-ceph` namespace) is not supported — storage-e2e targets the Deckhouse flavor exclusively. Returns a descriptive error if the namespace doesn't have the expected prefix or the derived Deployment isn't there. * Wire RestartRookOperator into SetMsCrcDataOnServer (after the daemon restart so the operator boots against fresh-config mons). * Gate the whole flip on every CephFilesystem in the namespace reaching Ready before returning. Catches the MDS-stuck-on-old-CRC class of bug at the source instead of letting it surface as a PVC timeout downstream. RBD-only clusters are a no-op (no CephFilesystem CRs to wait for). Net cost: ~30s extra per flip (mds + operator restart). In return: no manual kubectl rollout restart between matrix cells, no spurious HEALTH_ERR on cephcluster CR, and CephFS PVCs stop hanging in Pending when CRC flips back to a matched state. Signed-off-by: Aleksandr Zimin --- pkg/testkit/ceph_crc.go | 150 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 144 insertions(+), 6 deletions(-) diff --git a/pkg/testkit/ceph_crc.go b/pkg/testkit/ceph_crc.go index 5214fde..39fb9a7 100644 --- a/pkg/testkit/ceph_crc.go +++ b/pkg/testkit/ceph_crc.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "strconv" + "strings" "time" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -111,12 +112,152 @@ func SetMsCrcDataOnServer(ctx context.Context, kubeconfig *rest.Config, namespac if err := RestartCephDaemons(ctx, kubeconfig, namespace, 10*time.Minute); err != nil { return fmt.Errorf("restart ceph daemons: %w", err) } + + // The operator pod is itself a Ceph admin client: it talks to mons + // to update CephCluster.status, evaluate CephFilesystem health, + // etc. Its in-pod ceph.conf was rendered at startup, so until it + // restarts it keeps using the old `ms_crc_data` value and can't + // connect to the freshly-bounced mons. Symptom: cephcluster CR + // flips to phase=Ready/state=Error with `failed to get status. . + // timed out` until the next reconcile after operator pod recycle. + // Bounce it now so the operator's view of the cluster lines up + // with reality before we return. + if err := RestartRookOperator(ctx, kubeconfig, namespace, 5*time.Minute); err != nil { + return fmt.Errorf("restart rook-ceph-operator: %w", err) + } + + // Final sanity check: any CephFilesystem in the namespace must be + // Ready before we consider the flip "live". 
This is the gate that + // catches the MDS-stuck-on-old-CRC class of bug — if the MDS + // daemons we just bounced fail to rejoin the mons, the CR will + // linger in a non-Ready phase and we'd rather surface that here + // than have a downstream csi-cephfs PVC hang for minutes. + if err := waitCephFilesystemsReady(ctx, kubeconfig, namespace, 5*time.Minute); err != nil { + return fmt.Errorf("wait CephFilesystem ready after CRC flip: %w", err) + } + logger.Success("Server-side ms_crc_data=%s is now live on all Ceph daemons", msCrcDataString(enabled)) return nil } -// RestartCephDaemons rollout-restarts Rook's mon/mgr/osd Deployments and -// waits for them to reach their desired ready replica count. +// RestartRookOperator rollout-restarts the rook-operator Deployment +// in the given namespace and waits for the new pod to become Ready. +// +// The operator runs as a Ceph admin client (uses the cluster admin +// keyring + a baked-in ceph.conf to query mon/osd state). When tests +// flip a global wire-protocol knob like `ms_crc_data` and bounce the +// daemons, the operator's existing connections become invalid — but +// without a pod restart it'll keep retrying with the stale ceph.conf +// and the cephcluster CR ends up reporting `HEALTH_ERR` / +// `state: Error` until the next operator reconcile cycle. +// +// Deckhouse packages the rook-operator binary inside a Deployment +// named after the Helm release, which conventionally equals the +// namespace minus the leading `d8-` prefix (`d8-sds-elastic` → +// `sds-elastic`, `d8-sds-replicated-volume` → `sds-replicated-volume`, +// etc.). storage-e2e targets that flavor exclusively — vanilla Rook +// (`rook-ceph-operator` Deployment in `rook-ceph` namespace) is not +// supported here. +func RestartRookOperator(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + operatorName, ok := strings.CutPrefix(namespace, "d8-") + if !ok || operatorName == "" { + return fmt.Errorf("namespace %q is not a deckhouse module namespace (expected d8- prefix); cannot derive rook-operator Deployment name", namespace) + } + if _, err := clientset.AppsV1().Deployments(namespace).Get(ctx, operatorName, metav1.GetOptions{}); err != nil { + return fmt.Errorf("get rook-operator Deployment %s/%s: %w", namespace, operatorName, err) + } + + logger.Info("Rolling-restarting %s/%s so its Ceph admin client picks up the new ceph.conf", namespace, operatorName) + stamp := time.Now().UTC().Format(time.RFC3339Nano) + patch := []byte(fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"storage-e2e/restarted-at":%q}}}}}`, stamp)) + if _, err := clientset.AppsV1().Deployments(namespace).Patch( + ctx, operatorName, types.StrategicMergePatchType, patch, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("annotate Deployment %s/%s for rollout: %w", namespace, operatorName, err) + } + + waitCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + d, err := clientset.AppsV1().Deployments(namespace).Get(waitCtx, operatorName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get Deployment %s/%s: %w", namespace, operatorName, err) + } + desired := int32(1) + if d.Spec.Replicas != nil { + desired = *d.Spec.Replicas 
+ } + if d.Status.ObservedGeneration >= d.Generation && d.Status.UpdatedReplicas >= desired && d.Status.AvailableReplicas >= desired { + logger.Success("%s/%s is Ready after rollout", namespace, operatorName) + return nil + } + select { + case <-waitCtx.Done(): + return fmt.Errorf("timed out after %s waiting for Deployment %s/%s to become ready", timeout, namespace, operatorName) + case <-ticker.C: + } + } +} + +// waitCephFilesystemsReady lists every CephFilesystem CR in +// `namespace` and waits for each to reach `status.phase=Ready` (or a +// matching Ready condition). If the namespace has no CephFilesystem +// CRs (RBD-only cluster), the function is a no-op. +func waitCephFilesystemsReady(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + dynamicClient, err := kubernetes.NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + list, err := dynamicClient.Resource(kubernetes.CephFilesystemGVR).Namespace(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("list CephFilesystem in %s: %w", namespace, err) + } + if len(list.Items) == 0 { + return nil + } + + for i := range list.Items { + name := list.Items[i].GetName() + if err := kubernetes.WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout); err != nil { + return fmt.Errorf("CephFilesystem %s/%s did not become Ready after CRC flip: %w", namespace, name, err) + } + } + return nil +} + +// RestartCephDaemons rollout-restarts every Rook-managed Ceph daemon +// Deployment that consumes `/etc/ceph/ceph.conf` (mon, mgr, osd, mds, +// rgw) and waits for each to reach its desired Ready replica count. +// +// Why all five roles, not just mon/mgr/osd: a global ConfigMap knob +// like `ms_crc_data` lives in ceph.conf, which means every daemon +// needs to be restarted for it to take effect. If only mon/mgr/osd +// are bounced and an MDS keeps running with the old value, the +// resulting CRC mismatch silently severs the MDS↔mon messenger +// channel, CephFS goes degraded, and any csi-cephfs PVC hangs in +// Pending until somebody (often the human running the test) bounces +// MDS by hand. Including `rook-ceph-mds` here is what unblocks the +// CephFS half of the msCrcData matrix. +// +// The selector also covers `rook-ceph-rgw` for forward-compat with +// future S3 tests; if no rgw Deployments exist in the cluster, the +// match list is just smaller and the function continues. Operator +// restart is intentionally out of scope here — see RestartRookOperator. func RestartCephDaemons(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { if namespace == "" { namespace = kubernetes.DefaultRookNamespace @@ -127,10 +268,7 @@ func RestartCephDaemons(ctx context.Context, kubeconfig *rest.Config, namespace } // Rook labels each Ceph daemon Deployment with `app=rook-ceph-`. - // We restart the daemons that actually consume `/etc/ceph/ceph.conf`: - // mon, mgr and osd. (The operator itself reads rook-config-override - // directly and does not need a bounce.) 
- labelSel := "app in (rook-ceph-mon,rook-ceph-mgr,rook-ceph-osd)" + labelSel := "app in (rook-ceph-mon,rook-ceph-mgr,rook-ceph-osd,rook-ceph-mds,rook-ceph-rgw)" deployList, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSel}) if err != nil { return fmt.Errorf("list ceph daemon Deployments (%s): %w", labelSel, err) From 28d08b9709e6cd920c1ba61f945974854d2dd62e Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Wed, 6 May 2026 11:47:38 +0300 Subject: [PATCH 12/14] Revert silent ~/.kube/config fallback in GetKubeconfig When SSH retrieval of /etc/kubernetes/{super-admin,admin}.conf from the master fails and KUBE_CONFIG_PATH is not set, GetKubeconfig now fails fast again instead of silently loading the developer's $KUBECONFIG / ~/.kube/config via clientcmd.NewDefaultClientConfigLoadingRules. The fallback (added in e3d4e8d) was convenient on dev laptops but too risky in CI and on machines whose `kubectl` already targets an unrelated cluster: tests would silently deploy modules to / acquire cluster locks on the wrong stand. Reverting preserves the original fail-fast contract that downstream suites already relied on. - internal/cluster/cluster.go: replace the default switch branch with a descriptive error pointing at KUBE_CONFIG_PATH and embedding the SSH error; drop loadDefaultKubeconfig and the now-unused clientcmdapi import. - docs/WORKLOG.md: rewrite the 2026-05-05 GetKubeconfig bullet to reflect the final fail-fast behavior. Signed-off-by: Aleksandr Zimin --- docs/WORKLOG.md | 2 +- internal/cluster/cluster.go | 78 ++++++------------------------------- 2 files changed, 13 insertions(+), 67 deletions(-) diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md index 6d7d508..cba70bc 100644 --- a/docs/WORKLOG.md +++ b/docs/WORKLOG.md @@ -33,7 +33,7 @@ All notable changes to this repository are documented here. New entries are appe - **Update** `README.md`: documented `${VAR}` form in `modulePullOverride` and the fail-fast behavior on unset env vars. - **Refactor** `internal/config/env.go`: extracted `ApplyDefaults()` out of `ValidateEnvironment` so suites that don't call validation still get defaults for `SSH_VM_USER` / `SSH_PRIVATE_KEY` / `SSH_PUBLIC_KEY` / `TEST_CLUSTER_NAMESPACE` / `YAML_CONFIG_FILENAME` / `TEST_CLUSTER_CLEANUP`. - **Update** `pkg/cluster/cluster.go::CreateTestCluster`: call `config.ApplyDefaults()` defensively + fall back to `config.YAMLConfigFilenameDefaultValue` when the filename arg is empty. -- **Update** `internal/cluster/cluster.go::GetKubeconfig`: added a third-tier fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) + `MinifyConfig` when SSH retrieval fails and `KUBE_CONFIG_PATH` is unset, so a developer whose local `kubectl` already targets the base cluster doesn't have to set anything. +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` fails, the function now fails fast unless `KUBE_CONFIG_PATH` is set explicitly. The previously considered fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) was dropped before release to preserve the original fail-fast contract — a silent fallback to the developer's personal kubeconfig is too risky in CI and on machines whose `kubectl` already points at an unrelated cluster. 
- **Bugfix** `pkg/cluster/setup.go::executeDhctlBootstrap`: pass `FORCE_NO_PRIVATE_KEYS=true` and `USE_AGENT_WITH_NO_PRIVATE_KEYS=true` env vars into the `dhctl bootstrap` container so `lib-connection` stops opening `/root/.ssh/id_rsa` and authenticates exclusively via the mounted ssh-agent socket — fixes "Failed to read private keys from flags" on passphrase-protected keys. - **Bugfix** `pkg/cluster/vms.go::generateCloudInitUserData`: pin apt to `mirror.yandex.ru` and force IPv4 (`Acquire::ForceIPv4=true`) in cloud-init, so `package_update` and Docker install stop stalling when `archive.ubuntu.com` IPs are partially unreachable. - **Refactor** `internal/infrastructure/ssh/client.go::StartTunnel` (both `*client` and `*jumpHostClient`): extracted shared `runTunnelLoop` + `tunnelDialer`. On dial failure that looks like a dropped SSH session, the loop now logs a visible WARN, calls the existing `reconnect()` (retry + exponential backoff), and retries the dial once with the freshly rebuilt session. Fixes the "test hangs 20 minutes silently after Wi-Fi flap" failure mode. diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 62d286c..cf713e8 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -45,7 +45,6 @@ import ( "gopkg.in/yaml.v3" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" - clientcmdapi "k8s.io/client-go/tools/clientcmd/api" "github.com/deckhouse/storage-e2e/internal/config" "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" @@ -259,34 +258,19 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien kubeconfigSource = fmt.Sprintf("KUBE_CONFIG_PATH=%s", resolvedPath) default: - // SSH failed and no explicit KUBE_CONFIG_PATH. Fall back to kubectl's - // standard resolution (KUBECONFIG env, otherwise ~/.kube/config) so - // that a developer whose `kubectl` already targets the right base - // cluster doesn't have to set anything else. - // - // This branch is *very loud* on purpose: silent fallback to the - // developer's personal ~/.kube/config has historically caused tests - // to acquire stale locks on unrelated SAN clusters or deploy modules - // against the wrong stand. We make sure both the WARN line and the - // final source-stamp surface what just happened. - fallbackContent, fallbackPath, fallbackErr := loadDefaultKubeconfig() - if fallbackErr != nil { - return nil, "", fmt.Errorf("failed to read kubeconfig from master (this may occur if sudo requires a password) "+ - "and the local kubectl-default kubeconfig fallback also failed (%v). "+ - "Set KUBE_CONFIG_PATH to a working kubeconfig, or ensure $KUBECONFIG / ~/.kube/config points at the base cluster. "+ - "Original SSH error: %w", fallbackErr, sshErr) - } - fbCtx, fbServer := kubeconfigContextSummary(fallbackContent) - logger.Warn( - "SSH kubeconfig retrieval from %s@%s failed (%v); falling back to LOCAL kubeconfig at %s "+ - "(current-context=%q, server=%q). "+ - "This is almost certainly NOT the cluster you intended to test against — check SSH_HOST/SSH_USER, "+ - "or set KUBE_CONFIG_PATH to a specific kubeconfig file. "+ - "To fail fast instead of silently falling back, unset $KUBECONFIG and remove ~/.kube/config", - user, masterIP, sshErr, fallbackPath, fbCtx, fbServer, + // SSH failed and the caller did not opt into a specific kubeconfig via + // KUBE_CONFIG_PATH. 
Fail fast rather than silently picking up the + // developer's ~/.kube/config / $KUBECONFIG, which has historically + // caused tests to acquire stale locks on unrelated SAN clusters or + // deploy modules against the wrong stand. + return nil, "", fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s) "+ + "and KUBE_CONFIG_PATH is not set; "+ + "set KUBE_CONFIG_PATH to a kubeconfig pointing at the target cluster, "+ + "or fix SSH credentials so passwordless sudo works on the master. "+ + "Original SSH error: %w", + user, masterIP, sshErr, ) - kubeconfigContent = fallbackContent - kubeconfigSource = fmt.Sprintf("LOCAL_FALLBACK(%s)", fallbackPath) } // Always stamp the kubeconfig source + the resulting current-context/server @@ -427,41 +411,3 @@ func kubeconfigContextSummary(content []byte) (currentContext, server string) { return } -// loadDefaultKubeconfig replicates kubectl's standard kubeconfig resolution -// (KUBECONFIG env, otherwise ~/.kube/config; multiple files in KUBECONFIG are -// merged) and returns the serialized merged config plus a human-readable -// description of where it was loaded from. Used as a last-resort fallback when -// SSH-based retrieval fails and KUBE_CONFIG_PATH is not set, so a developer -// whose `kubectl` already points at the right base cluster can simply run the -// suite without exporting any extra variables. -func loadDefaultKubeconfig() ([]byte, string, error) { - loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() - rawConfig, err := loadingRules.Load() - if err != nil { - return nil, "", fmt.Errorf("clientcmd default loader: %w", err) - } - if rawConfig == nil || len(rawConfig.Clusters) == 0 { - return nil, "", fmt.Errorf("no clusters in default kubeconfig (KUBECONFIG=%q, ~/.kube/config)", os.Getenv("KUBECONFIG")) - } - - // Minify down to the current-context only. Otherwise UpdateKubeconfigPort - // would rewrite the `server:` URL of every cluster in a multi-cluster - // kubeconfig, breaking unrelated entries on the developer's machine. - minified := *rawConfig - if err := clientcmdapi.MinifyConfig(&minified); err != nil { - return nil, "", fmt.Errorf("clientcmd minify default kubeconfig: %w", err) - } - - content, err := clientcmd.Write(minified) - if err != nil { - return nil, "", fmt.Errorf("clientcmd serialize default kubeconfig: %w", err) - } - - source := os.Getenv("KUBECONFIG") - if source == "" { - source = "~/.kube/config (current-context=" + minified.CurrentContext + ")" - } else { - source = "KUBECONFIG=" + source + " (current-context=" + minified.CurrentContext + ")" - } - return content, source, nil -} From 56153f31e3885112b2a4c88e29f418a0f9e30d3a Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Wed, 6 May 2026 12:38:50 +0300 Subject: [PATCH 13/14] docs: sync glossary, architecture, worklog with feature branch Backfill the documentation that earlier commits in this branch should have updated as they landed. No code changes. FUNCTIONS_GLOSSARY.md: - Pod section: documented OpenDistrolessReader and the three *DistrolessReader methods (PodName, EphemeralName, ReadFile) added alongside the single-shot ReadFileFromDistrolessPod helper. - New sections: "Ceph CRC (Testkit)" (EnableServerCRC / DisableServerCRC / ResetServerCRCToDefault / SetMsCrcDataOnServer / RestartCephDaemons / RestartRookOperator) and "VolumeSnapshotClass" (CreateVolumeSnapshotClass / WaitForVolumeSnapshotClass). - StorageClass section: documented CreateStorageClass (in pkg/kubernetes/storageclass_manage.go). 
- Rook Config Override section: documented RenderCephGlobalConfig. - Table of Contents: added missing entries for "Ceph Cluster (Testkit) - no csi-ceph wiring", "VolumeSnapshotClass", and "Ceph CRC (Testkit)". ARCHITECTURE.md: - Section 1.1 (Package Structure): added internal/config/overrides.go (was only listed in 3.1) and pkg/kubernetes/pod_exec.go. - Section 3.6 (Public API): added pkg/kubernetes/pod_exec.go. - Section 7 (Environment Variables): documented the new fail-fast KUBE_CONFIG_PATH semantics and the generic ${VAR} expansion in modulePullOverride (e.g. MODULE_IMAGE_TAG). WORKLOG.md: - 2026-05-05: backfilled entries for pod_exec.go, DistrolessReader, the WaitFor*Gone family + Create-time deletionTimestamp guards, TeardownCephStorageClass rewrite, RestartCephDaemons selector extension (mds/rgw), RestartRookOperator, SetMsCrcDataOnServer rework. The GetKubeconfig revert (which actually landed today) was hoisted out of 2026-05-05 into a new 2026-05-06 heading. Signed-off-by: Aleksandr Zimin --- docs/ARCHITECTURE.md | 6 +++++- docs/FUNCTIONS_GLOSSARY.md | 32 +++++++++++++++++++++++++++++++- docs/WORKLOG.md | 15 ++++++++++++++- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 5030d65..a831ccd 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -23,6 +23,7 @@ storage-e2e/ │ ├── config/ # Configuration management │ │ ├── config.go # Main configuration struct │ │ ├── env.go # Environment variable parsing +│ │ ├── overrides.go # ${VAR} expansion in modulePullOverride at config load time │ │ ├── types.go # Configuration type definitions │ │ └── images.go # OS image definitions │ │ @@ -89,6 +90,7 @@ storage-e2e/ │ │ ├── nodegroup.go # NodeGroup operations │ │ ├── nodes.go # Node listing, taints, labels │ │ ├── pod.go # Pod operations +│ │ ├── pod_exec.go # Pods/exec helpers + DistrolessReader for distroless containers │ │ ├── poll.go # Generic readiness poller (per-call timeout, WARN on net errors) │ │ ├── pvc.go # PVC operations │ │ ├── rookconfigoverride.go # Rook ceph.conf override ConfigMap @@ -514,6 +516,7 @@ pkg/ │ ├── nodegroup.go # NodeGroup operations │ ├── nodes.go # Node listing, taints, labels │ ├── pod.go # Pod operations +│ ├── pod_exec.go # Exec helpers + DistrolessReader (ephemeral-container session) │ ├── poll.go # pollResourceUntilReady helper for Wait*Ready callers │ ├── pvc.go # PVC operations │ ├── rookconfigoverride.go # Rook global ceph.conf override @@ -756,7 +759,8 @@ logger.Error("Failed to create resource: %v", err) | `TEST_CLUSTER_NAMESPACE` | `e2e-test-cluster` | Test namespace name | | `TEST_CLUSTER_CLEANUP` | `false` | Cleanup cluster after tests | | `LOG_LEVEL` | `debug` | Log level (debug/info/warn/error) | -| `KUBE_CONFIG_PATH` | - | Fallback kubeconfig path | +| `KUBE_CONFIG_PATH` | - | Explicit kubeconfig path. Used when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` from the master fails. If unset and SSH also fails, `GetKubeconfig` returns an error (no silent fallback to `~/.kube/config`). | +| `MODULE_IMAGE_TAG` (and any other custom name) | - | Any `${VAR}` placeholder used inside `modulePullOverride:` in `cluster_config.yml` is expanded at config load time by `internal/config/overrides.ExpandEnvInModulePullOverride`. Missing/empty placeholders fail fast with an explicit error so CI can point modules at `pr` / `mr` images via a single env var without editing the YAML between runs. 
|
 
 ### Commander Variables (only when `TEST_CLUSTER_CREATE_MODE=commander`)
 
diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md
index de233ce..07f08fa 100644
--- a/docs/FUNCTIONS_GLOSSARY.md
+++ b/docs/FUNCTIONS_GLOSSARY.md
@@ -16,6 +16,7 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - [Pod](#pod)
 - [PVC (PersistentVolumeClaim)](#pvc-persistentvolumeclaim)
 - [StorageClass](#storageclass)
+- [VolumeSnapshotClass](#volumesnapshotclass)
 - [BlockDevice](#blockdevice)
 - [LVMVolumeGroup](#lvmvolumegroup)
 - [LocalStorageClass](#localstorageclass)
@@ -33,7 +34,9 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - [CephStorageClass (csi-ceph)](#cephstorageclass-csi-ceph)
 - [Default StorageClass (Testkit)](#default-storageclass-testkit)
 - [Ceph StorageClass (Testkit)](#ceph-storageclass-testkit)
+- [Ceph Cluster (Testkit) — no csi-ceph wiring](#ceph-cluster-testkit--no-csi-ceph-wiring)
 - [Stress Tests (Testkit)](#stress-tests-testkit)
+- [Ceph CRC (Testkit)](#ceph-crc-testkit)
 
 ---
 
@@ -149,7 +152,11 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 
 - `ExecInPod(ctx, kubeconfig, namespace, pod, container, cmd) (stdout, stderr, err)` — Runs a command inside a container via the apiserver's `pods/exec` subresource (SPDY). Returns stdout and stderr separately; the container must ship every binary referenced by `cmd`. Use this when the container has a usable shell/userland.
 - `ReadFileFromPod(ctx, kubeconfig, namespace, pod, container, path)` — `ExecInPod` + `cat <path>`. Convenience wrapper for non-distroless images.
-- `ReadFileFromDistrolessPod(ctx, kubeconfig, namespace, pod, targetContainer, path, opts)` — Reads a file from a distroless / scratch container that ships no `cat`/`sh`/`tar`. Injects a short-lived ephemeral container (image from `opts.DebugImage`, defaults to `DefaultDebugImage = "busybox:1.36"`) with `targetContainerName=targetContainer`, polls until it goes Running (`opts.StartupTimeout`, defaults to 60s), then `cat /proc/1/root<path>` — `/proc/1/root` is the kernel-exposed FS root of PID 1 in the target container, which the ephemeral container can see thanks to the shared PID namespace. Adding the ephemeral container goes through the dedicated `/pods/<name>/ephemeralcontainers` subresource, so existing containers and the pod sandbox are NOT restarted, `metadata.generation` is not bumped, and ReplicaSet/DaemonSet observation is unaffected — downstream rollout / `checksum/...` annotation assertions still see a clean signal. Caveat: ephemeral containers cannot be removed once added, but each call generates a unique name and the `sleep 60` command exits on its own; entries pile up in `pod.status.ephemeralContainerStatuses` until the next pod recycle.
+- `ReadFileFromDistrolessPod(ctx, kubeconfig, namespace, pod, targetContainer, path, opts)` — Reads a file from a distroless / scratch container that ships no `cat`/`sh`/`tar`. Injects a short-lived ephemeral container (image from `opts.DebugImage`, defaults to `DefaultDebugImage = "busybox:1.36"`) with `targetContainerName=targetContainer`, polls until it goes Running (`opts.StartupTimeout`, defaults to 60s), then `cat /proc/1/root<path>` — `/proc/1/root` is the kernel-exposed FS root of PID 1 in the target container, which the ephemeral container can see thanks to the shared PID namespace.
Adding the ephemeral container goes through the dedicated `/pods/<name>/ephemeralcontainers` subresource, so existing containers and the pod sandbox are NOT restarted, `metadata.generation` is not bumped, and ReplicaSet/DaemonSet observation is unaffected — downstream rollout / `checksum/...` annotation assertions still see a clean signal. Caveat: ephemeral containers cannot be removed once added, but each call generates a unique name and the `sleep` command (per `opts.SessionTTL`) exits on its own; entries pile up in `pod.status.ephemeralContainerStatuses` until the next pod recycle. Internally a one-shot wrapper around `OpenDistrolessReader` + `(*DistrolessReader).ReadFile`.
+- `OpenDistrolessReader(ctx, kubeconfig, namespace, pod, targetContainer, opts) (*DistrolessReader, error)` — Long-lived variant of `ReadFileFromDistrolessPod`: injects ONE ephemeral container (sleeps for `opts.SessionTTL`, defaults to `DefaultDistrolessSessionTTL` = 30 min) and returns a session that can serve arbitrarily many cheap reads. Use this for polling loops (e.g. `Eventually(...)` waiting for a file's content to flip) so the ephemeral-container cold start is paid once instead of per iteration.
+- `(*DistrolessReader) ReadFile(ctx, path)` — `cat /proc/1/root<path>` against the pre-injected ephemeral container. Cheap — just a `pods/exec` round-trip; no apiserver mutations.
+- `(*DistrolessReader) PodName()` — Name of the pod this reader is bound to. Used by callers that need to detect rollouts (the pod name changes when the workload-controller recycles the pod) and re-`OpenDistrolessReader` against the new pod.
+- `(*DistrolessReader) EphemeralName()` — Auto-generated name of the injected ephemeral container, mostly for logs.
 
 ## PVC (PersistentVolumeClaim)
 
@@ -169,6 +176,17 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - `GetStorageClass(ctx, kubeconfig, name)` — Returns the `*storagev1.StorageClass` with the given name, or `(nil, nil)` if it does not exist.
 - `SetGlobalDefaultStorageClass(ctx, kubeconfig, storageClassName)` — Updates the "global" ModuleConfig to set `spec.settings.storageClass` to the given name, making it the cluster default.
 
+`pkg/kubernetes/storageclass_manage.go`
+
+- `CreateStorageClass(ctx, kubeconfig, cfg)` — Creates a `storage.k8s.io/v1 StorageClass` directly from `StorageClassCreateConfig` (`Name`, `Provisioner`, `Parameters`, `VolumeBindingMode`, `ReclaimPolicy`, `AllowExpansion`, `MakeDefault`, plus optional extra labels/annotations). When `MakeDefault=true` both the GA and beta `is-default-class` annotations are set. Idempotent: `AlreadyExists` is logged and treated as success.
+
+## VolumeSnapshotClass
+
+`pkg/kubernetes/volumesnapshotclass.go`
+
+- `CreateVolumeSnapshotClass(ctx, kubeconfig, cfg)` — Creates a `snapshot.storage.k8s.io/v1 VolumeSnapshotClass` from `VolumeSnapshotClassConfig` (`Name`, `Driver`, `DeletionPolicy` defaulting to `Delete`, `Parameters`, `MakeDefault`). Idempotent: `AlreadyExists` is logged and treated as success.
+- `WaitForVolumeSnapshotClass(ctx, kubeconfig, name, timeout)` — Polls until the named VolumeSnapshotClass is Get-able.
+
 ## BlockDevice
 
 `pkg/kubernetes/blockdevice.go`
@@ -243,6 +261,7 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 
 - `SetRookConfigOverride(ctx, kubeconfig, namespace, globals)` — Creates or updates the `rook-config-override` ConfigMap in the Rook operator namespace.
The provided map is rendered under `[global]` and Rook picks it up into every Ceph daemon's `ceph.conf` (used for `ms_crc_data`, `bdev_enable_discard`, and similar knobs). Keys are sorted for stable output.
 - `DeleteRookConfigOverride(ctx, kubeconfig, namespace)` — Removes the ConfigMap; safe if it does not exist.
+- `RenderCephGlobalConfig(globals)` — Pure helper that renders a `[global]` section for `ceph.conf` from a `map[string]string`. Keys are sorted so the output is byte-stable across calls with logically-equivalent maps (used by `SetRookConfigOverride` to avoid spurious ConfigMap updates and by callers that need to compare the desired vs. live ConfigMap content before deciding to roll daemons).
 
 ## Ceph Credentials
 
@@ -329,3 +348,14 @@ All exported functions available in the `pkg/` directory, grouped by resource.
 - `(*Config) Validate()` — Validates the stress test configuration (namespace, storage class, PVC size, mode-specific params).
 - `(*StressTestRunner) Run(ctx)` — Executes the stress test based on configured mode: flog, check_fs_only, check_cloning, check_restoring_from_snapshot, snapshot_only, or snapshot_resize_cloning.
 - `CleanupStressNamespaces(ctx, kubeconfig)` — Deletes all namespaces with the `load-test=true` label.
+
+## Ceph CRC (Testkit)
+
+`pkg/testkit/ceph_crc.go`
+
+- `EnableServerCRC(ctx, kubeconfig, namespace)` — Sets `ms_crc_data=true` on the server side: rewrites `rook-config-override` and rolling-restarts every Rook-managed Ceph daemon Deployment (mon/mgr/osd/mds/rgw) plus the rook-operator. Use when a test wants Ceph pinned in the explicit CRC-on state. Thin wrapper over `SetMsCrcDataOnServer(..., ptr(true))`.
+- `DisableServerCRC(ctx, kubeconfig, namespace)` — Same as `EnableServerCRC` but flips Ceph into `ms_crc_data=false`. Paired with a csi-ceph client that defaults to `msCrcData=true` this reproduces the msCrcData matrix mismatch case. Thin wrapper over `SetMsCrcDataOnServer(..., ptr(false))`.
+- `ResetServerCRCToDefault(ctx, kubeconfig, namespace)` — Removes `ms_crc_data` from `rook-config-override` so Ceph falls back to its compile-time default (`true`). Convenient for `AfterAll` / `AfterEach` restoration. Thin wrapper over `SetMsCrcDataOnServer(..., nil)`.
+- `SetMsCrcDataOnServer(ctx, kubeconfig, namespace, enabled *bool)` — Lower-level primitive behind the three readability wrappers. Rewrites `rook-config-override` so that only `ms_crc_data=<value>` ends up under `[global]` (`nil` removes the key entirely). Idempotent: when the ConfigMap already encodes the desired state, nothing is restarted. Otherwise it (1) rolling-restarts Rook-managed Ceph daemons via `RestartCephDaemons`, (2) restarts the rook-operator via `RestartRookOperator`, and (3) waits for every `CephFilesystem` in the namespace to come back to Ready. Prefer the named wrappers at call sites; this primitive exists so a boolean test parameter (e.g. a CRC matrix) doesn't have to branch.
+- `RestartCephDaemons(ctx, kubeconfig, namespace, timeout)` — Rollout-restarts every Rook-managed Ceph daemon Deployment that consumes `/etc/ceph/ceph.conf` — the selector covers `rook-ceph-mon`, `rook-ceph-mgr`, `rook-ceph-osd`, `rook-ceph-mds`, `rook-ceph-rgw` — and waits for each to reach its desired Ready replica count. All five roles are bounced because a global ConfigMap knob like `ms_crc_data` lives in `ceph.conf` and any daemon left running with the old value (typically MDS) silently breaks the messenger handshake and degrades CephFS / blocks csi-cephfs PVCs in Pending.
Operator restart is intentionally out of scope here — see `RestartRookOperator`. +- `RestartRookOperator(ctx, kubeconfig, namespace, timeout)` — Rollout-restarts the rook-operator Deployment in the given namespace and waits for the new pod to become Ready. Required after every wire-protocol bounce: the operator runs as a Ceph admin client (admin keyring + baked-in `ceph.conf`), and without a pod restart it keeps retrying with the stale `ceph.conf`, which surfaces in the cephcluster CR as `HEALTH_ERR` / `state: Error` until the next reconcile. Deckhouse-specific naming: the Deployment name is derived from the namespace by stripping the leading `d8-` prefix (`d8-sds-elastic` → `sds-elastic`). Vanilla Rook (`rook-ceph-operator` in `rook-ceph`) is not supported. diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md index cba70bc..edb5440 100644 --- a/docs/WORKLOG.md +++ b/docs/WORKLOG.md @@ -33,11 +33,24 @@ All notable changes to this repository are documented here. New entries are appe - **Update** `README.md`: documented `${VAR}` form in `modulePullOverride` and the fail-fast behavior on unset env vars. - **Refactor** `internal/config/env.go`: extracted `ApplyDefaults()` out of `ValidateEnvironment` so suites that don't call validation still get defaults for `SSH_VM_USER` / `SSH_PRIVATE_KEY` / `SSH_PUBLIC_KEY` / `TEST_CLUSTER_NAMESPACE` / `YAML_CONFIG_FILENAME` / `TEST_CLUSTER_CLEANUP`. - **Update** `pkg/cluster/cluster.go::CreateTestCluster`: call `config.ApplyDefaults()` defensively + fall back to `config.YAMLConfigFilenameDefaultValue` when the filename arg is empty. -- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` fails, the function now fails fast unless `KUBE_CONFIG_PATH` is set explicitly. The previously considered fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) was dropped before release to preserve the original fail-fast contract — a silent fallback to the developer's personal kubeconfig is too risky in CI and on machines whose `kubectl` already points at an unrelated cluster. - **Bugfix** `pkg/cluster/setup.go::executeDhctlBootstrap`: pass `FORCE_NO_PRIVATE_KEYS=true` and `USE_AGENT_WITH_NO_PRIVATE_KEYS=true` env vars into the `dhctl bootstrap` container so `lib-connection` stops opening `/root/.ssh/id_rsa` and authenticates exclusively via the mounted ssh-agent socket — fixes "Failed to read private keys from flags" on passphrase-protected keys. - **Bugfix** `pkg/cluster/vms.go::generateCloudInitUserData`: pin apt to `mirror.yandex.ru` and force IPv4 (`Acquire::ForceIPv4=true`) in cloud-init, so `package_update` and Docker install stop stalling when `archive.ubuntu.com` IPs are partially unreachable. - **Refactor** `internal/infrastructure/ssh/client.go::StartTunnel` (both `*client` and `*jumpHostClient`): extracted shared `runTunnelLoop` + `tunnelDialer`. On dial failure that looks like a dropped SSH session, the loop now logs a visible WARN, calls the existing `reconnect()` (retry + exponential backoff), and retries the dial once with the freshly rebuilt session. Fixes the "test hangs 20 minutes silently after Wi-Fi flap" failure mode. - **Add** `pkg/kubernetes/poll.go`: `pollResourceUntilReady` centralizes the `WaitFor*Ready` loops with a per-call `PollGetTimeout` (30s) on every Get and WARN logging once consecutive Get failures cross 3, so a dropped tunnel surfaces in seconds instead of after the 20-minute readyTimeout. 
 - **Refactor** `pkg/kubernetes/cephcluster.go`, `pkg/kubernetes/cephblockpool.go`, `pkg/kubernetes/cephfilesystem.go`: `WaitForCephClusterReady` / `WaitForCephBlockPoolReady` / `WaitForCephFilesystemReady` migrated to `pollResourceUntilReady`. Public signatures unchanged.
+- **Add** `pkg/kubernetes/pod_exec.go`: `ExecInPod` (pods/exec via SPDY), `ReadFileFromPod` (`cat <path>` wrapper for non-distroless images), and `ReadFileFromDistrolessPod` (single-shot ephemeral container injection that reads through `/proc/1/root` thanks to the shared PID namespace; uses the dedicated `ephemeralcontainers` subresource so the target pod and its sandbox are NOT restarted and `metadata.generation` is not bumped — keeps downstream rollout assertions clean).
+- **Add** `pkg/kubernetes/pod_exec.go::DistrolessReader` + `OpenDistrolessReader`: long-lived ephemeral-container session for cheap repeated reads. `(*DistrolessReader).ReadFile` is a plain `pods/exec` round-trip against the already-running ephemeral container; `(*DistrolessReader).PodName()` lets callers detect rollouts and re-open against the new pod. Pays the ephemeral-container cold start once instead of per `Eventually` iteration.
+- **Add** `pkg/kubernetes/poll.go::pollResourceUntilGone` + per-CR `WaitForCephClusterGone` / `WaitForCephBlockPoolGone` / `WaitForCephFilesystemGone` / `WaitForCephClusterAuthenticationGone` / `WaitForCephClusterConnectionGone` / `WaitForCephStorageClassGone` helpers. Logs `deletionTimestamp` and finalizers progress periodically so a stuck finalizer is visible immediately. Fail-fast on timeout — no auto-strip of finalizers; the operator must investigate before re-running.
+- **Update** Ceph CR `Create*` helpers (`CreateCephCluster` / `CreateCephBlockPool` / `CreateCephFilesystem` / `CreateCephClusterAuthentication` / `CreateCephClusterConnection` / `CreateCephStorageClass`) and `WaitFor*Ready`: now fail fast when the live object has `metadata.deletionTimestamp != nil`. Prevents the framework from updating a Terminating object (silent no-op) or waiting 20 minutes on Ready for an object that's being garbage-collected.
+- **Refactor** `pkg/testkit/ceph.go::TeardownCephStorageClass`: explicitly `WaitFor*Gone` after every Delete in the right order (`CephStorageClass` → `CephClusterConnection` → `CephClusterAuthentication` → `CephBlockPool` or `CephFilesystem` → `CephCluster` → `rook-config-override`). Without these waits the parent `CephCluster` was deleted before its dependents were gone, Rook recorded `DeletionIsBlocked / ObjectHasDependents`, and the next test run either found a stuck Terminating CR or hung in `WaitForCephClusterReady`. Errors are aggregated; NotFound is treated as success.
+- **Update** `pkg/testkit/ceph_crc.go::RestartCephDaemons`: extended the daemon selector from `mon,mgr,osd` to `mon,mgr,osd,mds,rgw`. A global `ms_crc_data` flip lives in `ceph.conf` and any unrestarted daemon (typically MDS) silently breaks the messenger handshake — degrades CephFS and pins csi-cephfs PVCs in Pending. `rgw` is included for forward-compat with future S3 tests.
+- **Add** `pkg/testkit/ceph_crc.go::RestartRookOperator`: rollout-restarts the rook-operator Deployment after a wire-protocol bounce so it picks up the new `ceph.conf` instead of pinning the cephcluster CR in `HEALTH_ERR`. Deployment name is derived from the namespace by stripping the leading `d8-` prefix (Deckhouse module convention, e.g. `d8-sds-elastic` → `sds-elastic`); vanilla Rook is not supported.
+- **Update** `pkg/testkit/ceph_crc.go::SetMsCrcDataOnServer`: after rewriting `rook-config-override` the helper now (1) calls `RestartCephDaemons` for the extended selector, (2) calls `RestartRookOperator`, then (3) waits for every `CephFilesystem` in the namespace to come back to Ready. This is what unblocks the CephFS half of the msCrcData matrix — previously a flip silently left MDS / operator out of sync. - **Update** `docs/FUNCTIONS_GLOSSARY.md`: noted that the three `WaitForCeph*Ready` helpers now apply a per-call deadline and emit WARN on consecutive Get failures. - **Update** `docs/ARCHITECTURE.md`: added `pkg/kubernetes/poll.go` to Section 1.1 and Section 3.6, added `pkg/kubernetes/cephfilesystem.go` (carry-over from the prior commit), added `internal/config/overrides.go` to Section 3.1. + +## 2026-05-06 + +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` fails, the function now fails fast unless `KUBE_CONFIG_PATH` is set explicitly. The previously considered fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) was dropped before release to preserve the original fail-fast contract — a silent fallback to the developer's personal kubeconfig is too risky in CI and on machines whose `kubectl` already points at an unrelated cluster. +- **Update** `docs/FUNCTIONS_GLOSSARY.md`: documented `OpenDistrolessReader` + `*DistrolessReader` methods, `CreateStorageClass`, `CreateVolumeSnapshotClass` / `WaitForVolumeSnapshotClass`, `RenderCephGlobalConfig`, and the full `pkg/testkit/ceph_crc.go` surface (`EnableServerCRC` / `DisableServerCRC` / `ResetServerCRCToDefault` / `SetMsCrcDataOnServer` / `RestartCephDaemons` / `RestartRookOperator`); added matching TOC entries. +- **Update** `docs/ARCHITECTURE.md`: added `internal/config/overrides.go` to Section 1.1 (was only in Section 3.1), added `pkg/kubernetes/pod_exec.go` to Section 1.1 and Section 3.6, documented `KUBE_CONFIG_PATH` semantics and `${VAR}` expansion (`MODULE_IMAGE_TAG`) in Section 7. From 1d636a51e498ff85ab6f33029c17f0dcc753d302 Mon Sep 17 00:00:00 2001 From: Aleksandr Zimin Date: Wed, 6 May 2026 22:44:32 +0300 Subject: [PATCH 14/14] Diagnose GetKubeconfig SSH failure and emit actionable error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the SSH-side kubeconfig fetch fails and KUBE_CONFIG_PATH is unset, the default switch branch in GetKubeconfig used to return a single generic "command failed: Process exited with status 1" wrapped into a vague suggestion to "fix SSH credentials so passwordless sudo works on the master". That left the operator guessing. The default branch now runs two cheap probe commands against the master to classify the failure: 1) test -f /etc/kubernetes/{super-admin,admin}.conf -> at least one kubeconfig file exists on the host 2) sudo -n -l /bin/cat -> a NOPASSWD rule that matches the cat command actually applies and returns a multi-line, actionable error tailored to the detected cause. The "sudo password required" branch embeds a ready-to-paste /etc/sudoers.d/e2e-kubeconfig snippet (with the actual SSH user baked in), the "kubeconfig missing" branch points at SSH_HOST/SSH_JUMP_HOST misconfig, and the unknown branch lists all three remedies. While here, fix a self-inflicted source of the same failure: the SSH command used to read the kubeconfig was sudo -n sh -c 'if [ -f .../super-admin.conf ]; then cat ...; ...' 
so the privileged binary as far as sudoers was concerned was /bin/sh, NOT /bin/cat. The fine-grained NOPASSWD rule the new error message recommends ("NOPASSWD: /bin/cat /etc/kubernetes/{super-admin,admin}.conf") therefore did not match and sudo asked for a password — exactly the situation the error message tells the user to fix. The command is now sudo -n /bin/cat /etc/kubernetes/super-admin.conf 2>/dev/null \ || sudo -n /bin/cat /etc/kubernetes/admin.conf which works with the recommended minimal rule. The classifier probe was moved off "sudo -n true" for the same reason: on hosts that grant "NOPASSWD: ALL" the probe returned 0 even when the per-file rule was absent, which would mask the real cause. "sudo -n -l /bin/cat <file>" asks sudo whether THAT specific command is allowed without a password. Contract preserved: still fail-fast (no silent ~/.kube/config fallback), still wraps the original ssh exit error via %w so callers' errors.Is / errors.As keep working. Probes are best-effort -- a failing probe is read as evidence for its cause, so a transport-level probe error at worst selects the wrong error template; the original sshErr is never masked. Out of scope: actual SUDO_PASSWORD plumbing via 'sudo -S' (requires extending SSHClient to forward stdin and adding secret-redaction in command logs). Documented as a follow-up. Signed-off-by: Aleksandr Zimin --- docs/WORKLOG.md | 2 + internal/cluster/cluster.go | 125 +++++++++++++++++++++++++++++++++--- 2 files changed, 117 insertions(+), 10 deletions(-) diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md index edb5440..fd56502 100644 --- a/docs/WORKLOG.md +++ b/docs/WORKLOG.md @@ -54,3 +54,5 @@ All notable changes to this repository are documented here. New entries are appe - **Update** `internal/cluster/cluster.go::GetKubeconfig`: when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` fails, the function now fails fast unless `KUBE_CONFIG_PATH` is set explicitly. The previously considered fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) was dropped before release to preserve the original fail-fast contract — a silent fallback to the developer's personal kubeconfig is too risky in CI and on machines whose `kubectl` already points at an unrelated cluster. - **Update** `docs/FUNCTIONS_GLOSSARY.md`: documented `OpenDistrolessReader` + `*DistrolessReader` methods, `CreateStorageClass`, `CreateVolumeSnapshotClass` / `WaitForVolumeSnapshotClass`, `RenderCephGlobalConfig`, and the full `pkg/testkit/ceph_crc.go` surface (`EnableServerCRC` / `DisableServerCRC` / `ResetServerCRCToDefault` / `SetMsCrcDataOnServer` / `RestartCephDaemons` / `RestartRookOperator`); added matching TOC entries. - **Update** `docs/ARCHITECTURE.md`: added `internal/config/overrides.go` to Section 1.1 (was only in Section 3.1), added `pkg/kubernetes/pod_exec.go` to Section 1.1 and Section 3.6, documented `KUBE_CONFIG_PATH` semantics and `${VAR}` expansion (`MODULE_IMAGE_TAG`) in Section 7. +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when the SSH-side kubeconfig fetch fails and `KUBE_CONFIG_PATH` is unset, the function now runs two cheap probes (`test -f /etc/kubernetes/{super-admin,admin}.conf` for existence, then `sudo -n -l /bin/cat <file>` for a matching NOPASSWD rule) to classify the failure and returns a structured, actionable error. The error embeds a ready-to-paste `/etc/sudoers.d/e2e-kubeconfig` snippet for the most common cause (passworded sudo on the master) and a `KUBE_CONFIG_PATH` escape hatch. Original SSH error is still wrapped via `%w` so `errors.Is`/`errors.As` keep working.
+- **Bugfix** `internal/cluster/cluster.go::getKubeconfigRemoteShell`: dropped the `sudo -n sh -c '...'` wrapper and now invokes `sudo -n /bin/cat <file>` directly (with a `||` fallback from `super-admin.conf` to `admin.conf`). The wrapper made the privileged binary `/bin/sh`, so the recommended fine-grained `NOPASSWD: /bin/cat /etc/kubernetes/{super-admin,admin}.conf` sudoers rule did not match and `GetKubeconfig` failed with "sudo on the master requires a password" even after the operator pasted the recommended snippet. Aligned the diagnostic probe in `classifyKubeconfigFetchFailure` to `sudo -n -l /bin/cat <file>` for the same reason — `sudo -n true` returned 0 under common `NOPASSWD: ALL` configurations and would mask the real problem on hosts that only allow `cat`. diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index cf713e8..f5acf09 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -191,7 +191,20 @@ func expandPath(path string) (string, error) { // getKubeconfigRemoteShell prints kubeconfig for use with client-go. It prefers // /etc/kubernetes/super-admin.conf (Kubernetes 1.29+ unified kubeconfig) when the file // exists, and falls back to /etc/kubernetes/admin.conf otherwise. -const getKubeconfigRemoteShell = "sudo -n sh -c 'if [ -f /etc/kubernetes/super-admin.conf ]; then cat /etc/kubernetes/super-admin.conf; else cat /etc/kubernetes/admin.conf; fi'" +// +// The two `sudo -n /bin/cat ...` invocations are intentionally NOT wrapped in +// `sudo -n sh -c '...'`. With a wrapper the privileged binary is /bin/sh, so a +// minimal sudoers rule of the form +// +// user ALL=(root) NOPASSWD: /bin/cat /etc/kubernetes/super-admin.conf, /bin/cat /etc/kubernetes/admin.conf +// +// would NOT match and sudo would still ask for a password. By calling /bin/cat +// directly we make this command work with the same fine-grained NOPASSWD rule +// that the buildKubeconfigFetchError diagnostic recommends. The 2>/dev/null on +// the first try suppresses the "permission denied / no such file" noise so the +// fallback to admin.conf produces clean kubeconfig content on stdout. +const getKubeconfigRemoteShell = "sudo -n /bin/cat /etc/kubernetes/super-admin.conf 2>/dev/null " + + "|| sudo -n /bin/cat /etc/kubernetes/admin.conf" // GetKubeconfig connects to the master node via SSH, retrieves kubeconfig (preferring // super-admin.conf over admin.conf when available), and returns a rest.Config that can @@ -262,15 +275,10 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien // KUBE_CONFIG_PATH. Fail fast rather than silently picking up the // developer's ~/.kube/config / $KUBECONFIG, which has historically - // caused tests to acquire stale locks on unrelated SAN clusters or - // deploy modules against the wrong stand. - return nil, "", fmt.Errorf( - "failed to read kubeconfig from master via SSH (%s@%s) "+ - "and KUBE_CONFIG_PATH is not set; "+ - "set KUBE_CONFIG_PATH to a kubeconfig pointing at the target cluster, "+ - "or fix SSH credentials so passwordless sudo works on the master. "+ - "Original SSH error: %w", - user, masterIP, sshErr, - ) + // deploy modules against the wrong stand. Classify the failure so the + // returned error tells the operator which knob to turn.
+ cause := classifyKubeconfigFetchFailure(ctx, sshClient) + return nil, "", buildKubeconfigFetchError(user, masterIP, sshErr, cause) } // Always stamp the kubeconfig source + the resulting current-context/server @@ -411,3 +419,100 @@ func kubeconfigContextSummary(content []byte) (currentContext, server string) { return } +// kubeconfigFetchCause discriminates the most likely reason +// getKubeconfigRemoteShell exited non-zero. Used solely to choose the +// human-readable error template — the original SSH error is always +// preserved via %w wrapping, so callers' errors.Is/errors.As keep working. +type kubeconfigFetchCause int + +const ( + causeUnknown kubeconfigFetchCause = iota + causeSudoPasswordRequired + causeKubeconfigMissing +) + +// classifyKubeconfigFetchFailure runs two cheap probes against the master +// to figure out the most likely reason getKubeconfigRemoteShell failed. +// Best-effort: a failing probe is read as evidence for its cause, so a +// transport-level probe error at worst selects the wrong template; we are +// already in an error path and the original sshErr, which is what callers +// care about, is always preserved. +// +// Order matters and matches what we actually need to know: +// 1. Do the kubeconfig files even exist on this host? `test -f` runs as +// the SSH user without sudo and returns 0 even when the file is +// root:root 0600, because it only checks the inode. If both files are +// missing this is almost certainly a non-control-plane node and no +// sudoers tweak will help. +// 2. If at least one file exists, are we allowed to `cat` it without a +// password? We probe with `sudo -n -l /bin/cat <file>`: -l makes sudo +// just look up the rule (no execution), and with -n it exits non-zero +// when no matching NOPASSWD rule applies. Crucially this matches the +// SAME granular rule the diagnostic recommends, so a misconfiguration +// where the operator added `NOPASSWD: /bin/sh` (or only NOPASSWD: ALL) +// does NOT mask the real "missing /bin/cat rule" cause. +func classifyKubeconfigFetchFailure(ctx context.Context, sshClient ssh.SSHClient) kubeconfigFetchCause { + if _, err := sshClient.Exec(ctx, + "test -f /etc/kubernetes/super-admin.conf || test -f /etc/kubernetes/admin.conf"); err != nil { + return causeKubeconfigMissing + } + if _, err := sshClient.Exec(ctx, + "sudo -n -l /bin/cat /etc/kubernetes/super-admin.conf >/dev/null 2>&1 || "+ + "sudo -n -l /bin/cat /etc/kubernetes/admin.conf >/dev/null 2>&1"); err != nil { + return causeSudoPasswordRequired + } + return causeUnknown +} + +// buildKubeconfigFetchError renders an actionable, multi-line error for +// the caller to print. Each branch lists the remediations relevant to the +// detected cause (sudoers tweak, KUBE_CONFIG_PATH escape hatch, SSH check), +// most likely fix first. The returned error always wraps the original +// sshErr so errors.As against the underlying SSH exit error still works.
+func buildKubeconfigFetchError(user, masterIP string, sshErr error, cause kubeconfigFetchCause) error { + sudoersLine := fmt.Sprintf( + "%s ALL=(root) NOPASSWD: /bin/cat /etc/kubernetes/super-admin.conf, /bin/cat /etc/kubernetes/admin.conf", + user, + ) + sudoersFix := "echo '" + sudoersLine + "' | sudo tee /etc/sudoers.d/e2e-kubeconfig && sudo chmod 0440 /etc/sudoers.d/e2e-kubeconfig" + + switch cause { + case causeSudoPasswordRequired: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s): "+ + "sudo on the master requires a password (sudo -n exited non-zero).\n"+ + "Pick ONE remedy:\n"+ + " 1) Allow passwordless cat of the two kubeconfig files (run on the master):\n"+ + " %s\n"+ + " 2) Point the test at a local kubeconfig instead (no SSH/sudo at all):\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + "Original SSH error: %w", + user, masterIP, sudoersFix, sshErr) + + case causeKubeconfigMissing: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s): "+ + "neither /etc/kubernetes/super-admin.conf nor /etc/kubernetes/admin.conf exists on the host — "+ + "this looks like a non-control-plane node.\n"+ + "Pick ONE remedy:\n"+ + " 1) Make sure SSH_HOST points at a Kubernetes control-plane (master) node "+ + "(check SSH_HOST/SSH_USER, and SSH_JUMP_HOST if you use one).\n"+ + " 2) Set KUBE_CONFIG_PATH to a kubeconfig file on your local machine:\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + "Original SSH error: %w", + user, masterIP, sshErr) + + default: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s) "+ + "and KUBE_CONFIG_PATH is not set.\n"+ + "Pick ONE remedy:\n"+ + " 1) If sudo on the master requires a password, allow passwordless cat of the kubeconfig files:\n"+ + " %s\n"+ + " 2) Set KUBE_CONFIG_PATH to a kubeconfig file on your local machine:\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + " 3) Fix SSH credentials so the master is reachable as %s with key-based auth.\n"+ + "Original SSH error: %w", + user, masterIP, sudoersFix, user, sshErr) + } +} +
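To illustrate the preserved `%w` contract from the caller's side, a hedged sketch: the call shape of `cluster.GetKubeconfig`, the stand-in function, and the use of `*ssh.ExitError` from `golang.org/x/crypto/ssh` as the underlying exit-error type are assumptions for the example, not code from this patch.

```go
package main

import (
	"context"
	"errors"
	"log"

	cryptossh "golang.org/x/crypto/ssh"
	"k8s.io/client-go/rest"
)

// getKubeconfig is a stand-in for cluster.GetKubeconfig (internal package);
// the (*rest.Config, string, error) return shape is assumed from the diff.
var getKubeconfig func(ctx context.Context, masterIP, user, keyPath string) (*rest.Config, string, error)

// fetchKubeconfig shows that errors.As still reaches the wrapped SSH exit
// error through the actionable message built by buildKubeconfigFetchError.
func fetchKubeconfig(ctx context.Context, masterIP, user, keyPath string) (*rest.Config, error) {
	cfg, _, err := getKubeconfig(ctx, masterIP, user, keyPath)
	if err != nil {
		var exitErr *cryptossh.ExitError
		if errors.As(err, &exitErr) {
			// The %w chain survives the multi-line diagnostic, so the
			// remote command's exit status is still recoverable here.
			log.Printf("remote command exited with status %d", exitErr.ExitStatus())
		}
		return nil, err
	}
	return cfg, nil
}
```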