diff --git a/README.md b/README.md index 177c1d1..75b4fe0 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,59 @@ Template folder for creating new E2E tests. Contains a complete framework with: Use `./tests/create-test.sh ` to create a new test from this template. +### csi-ceph + +Reference testkit that provisions a full Rook-managed Ceph cluster and a +csi-ceph-backed `StorageClass` end-to-end, then verifies a plain `PVC` +bound against that class. + +Built around `testkit.EnsureCephStorageClass` (see +[docs/FUNCTIONS_GLOSSARY.md](docs/FUNCTIONS_GLOSSARY.md#ceph-storageclass-testkit)), +which handles: enabling `sds-node-configurator` + `sds-elastic` + `csi-ceph` +modules, optionally provisioning an `sds-local-volume` Thick `StorageClass` +for OSD backing, seeding `rook-config-override` (for things like +`ms_crc_data=false`), creating Rook `CephCluster` + `CephBlockPool`, and +wiring `CephClusterConnection` / `CephClusterAuthentication` / +`CephStorageClass` csi-ceph CRs. + +The testkit itself only runs a smoke check; downstream repos (e.g. +`csi-ceph`) can import `github.com/deckhouse/storage-e2e/pkg/testkit` and +reuse `EnsureCephStorageClass` inside their own Ginkgo specs. + +Testkit-specific env variables: + +- `CSI_CEPH_OSD_STORAGE_CLASS` — pre-existing block-mode StorageClass used to + back Rook OSD PVCs. When empty, an `sds-local-volume` Thick SC is + auto-provisioned via `EnsureDefaultStorageClass`. +- `CSI_CEPH_MODULE_PULL_OVERRIDE` — image tag for `csi-ceph`'s + ModulePullOverride (dev registries only, e.g. when testing a PR build). + +#### `modulePullOverride` env templating + +Any module entry in `cluster_config.yml` may reference an env var with the +`${VAR}` form in `modulePullOverride`. `storage-e2e` resolves those at config +load time, so CI can point a module at a per-PR/MR image without editing the +YAML between runs: + +```yaml +dkpParameters: + modules: + - name: csi-ceph + modulePullOverride: "${MODULE_IMAGE_TAG}" # CI must set MODULE_IMAGE_TAG, e.g. "pr131" on GitHub or "mr131" on GitLab +``` + +If a referenced env var is unset, `LoadClusterConfig` fails fast with +`module "" references env var ${VAR} in modulePullOverride but it is not set` +instead of silently falling back to `main` — so a missing variable in CI is +caught before bootstrap, not after a 30-minute wrong-image install. + +Run: + +```bash +source tests/csi-ceph/test_exports +go test -timeout=240m -v ./tests/csi-ceph -count=1 +``` + ### csi-all-stress-tests Stress tests for all CSI storage drivers.
This test suite: diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 4d06918..a109a0a 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -23,6 +23,7 @@ storage-e2e/ │ ├── config/ # Configuration management │ │ ├── config.go # Main configuration struct │ │ ├── env.go # Environment variable parsing +│ │ ├── overrides.go # ${VAR} expansion in modulePullOverride at config load time │ │ ├── types.go # Configuration type definitions │ │ └── images.go # OS image definitions │ │ @@ -75,6 +76,12 @@ storage-e2e/ │ ├── kubernetes/ # Public Kubernetes utilities │ │ ├── apply.go # YAML manifest application │ │ ├── blockdevice.go # BlockDevice operations +│ │ ├── cephblockpool.go # Rook CephBlockPool operations +│ │ ├── cephcluster.go # Rook CephCluster operations +│ │ ├── cephfilesystem.go # Rook CephFilesystem operations +│ │ ├── cephclusterconnection.go # csi-ceph connection/auth CRs +│ │ ├── cephcredentials.go # Rook Ceph credential discovery +│ │ ├── cephstorageclass.go # csi-ceph CephStorageClass CR │ │ ├── client.go # Clientset/dynamic client with retry │ │ ├── localstorageclass.go # LocalStorageClass CR operations │ │ ├── lvmvolumegroup.go # LVMVolumeGroup operations @@ -83,17 +90,25 @@ storage-e2e/ │ │ ├── nodegroup.go # NodeGroup operations │ │ ├── nodes.go # Node listing, taints, labels │ │ ├── pod.go # Pod operations +│ │ ├── pod_exec.go # Pods/exec helpers + DistrolessReader for distroless containers +│ │ ├── poll.go # Generic readiness poller (per-call timeout, WARN on net errors) │ │ ├── pvc.go # PVC operations +│ │ ├── rookconfigoverride.go # Rook ceph.conf override ConfigMap │ │ ├── secrets.go # Secret operations │ │ ├── storageclass.go # StorageClass get/wait/default +│ │ ├── storageclass_manage.go # Global default StorageClass management │ │ ├── virtualdisk.go # VirtualDisk attach/detach -│ │ └── vmpod.go # VM pod lookup +│ │ ├── vmpod.go # VM pod lookup +│ │ └── volumesnapshotclass.go # VolumeSnapshotClass helpers │ │ │ ├── retry/ # Generic retry with exponential backoff │ │ └── retry.go │ │ │ └── testkit/ # Test framework utilities -│ ├── storageclass.go # Default StorageClass provisioning +│ ├── ceph.go # EnsureCephStorageClass (Rook + csi-ceph) +│ ├── ceph_cluster.go # EnsureCephCluster (Rook only, no csi-ceph) +│ ├── ceph_crc.go # Ceph CRC tuning helpers +│ ├── storageclass.go # EnsureDefaultStorageClass (sds-local-volume) │ └── stress-tests.go # Stress test runner │ ├── tests/ # Test suites @@ -326,6 +341,7 @@ Tests use Ginkgo's lifecycle hooks: config/ ├── config.go # Main configuration operations ├── env.go # Environment variable definitions and validation +├── overrides.go # ${VAR} expansion in modulePullOverride at config load time ├── types.go # Configuration type definitions └── images.go # OS image URL definitions ``` @@ -486,6 +502,12 @@ pkg/ ├── kubernetes/ │ ├── apply.go # YAML manifest application │ ├── blockdevice.go # BlockDevice operations +│ ├── cephblockpool.go # Rook CephBlockPool CRUD + wait +│ ├── cephcluster.go # Rook CephCluster CRUD + wait +│ ├── cephfilesystem.go # Rook CephFilesystem CRUD + wait +│ ├── cephclusterconnection.go # csi-ceph CephClusterConnection/Auth CRs +│ ├── cephcredentials.go # Read fsid/mons/admin-key from Rook secrets +│ ├── cephstorageclass.go # csi-ceph CephStorageClass CR │ ├── client.go # Clientset/dynamic client with retry │ ├── localstorageclass.go # LocalStorageClass CR operations │ ├── lvmvolumegroup.go # LVMVolumeGroup operations @@ -494,15 +516,23 @@ pkg/ │ ├── nodegroup.go # NodeGroup operations │ ├── 
nodes.go # Node listing, taints, labels │ ├── pod.go # Pod operations +│ ├── pod_exec.go # Exec helpers + DistrolessReader (ephemeral-container session) +│ ├── poll.go # pollResourceUntilReady helper for Wait*Ready callers │ ├── pvc.go # PVC operations +│ ├── rookconfigoverride.go # Rook global ceph.conf override │ ├── secrets.go # Secret operations │ ├── storageclass.go # StorageClass get/wait/default +│ ├── storageclass_manage.go # Global default-SC management │ ├── virtualdisk.go # VirtualDisk attach/detach -│ └── vmpod.go # VM pod lookup +│ ├── vmpod.go # VM pod lookup +│ └── volumesnapshotclass.go # VolumeSnapshotClass helpers ├── retry/ │ └── retry.go # Generic retry with exponential backoff └── testkit/ - ├── storageclass.go # Default StorageClass provisioning + ├── ceph.go # EnsureCephStorageClass / EnsureDefaultCephStorageClass + ├── ceph_cluster.go # EnsureCephCluster (Rook-only, no csi-ceph) + ├── ceph_crc.go # Ceph CRC tuning helpers + ├── storageclass.go # EnsureDefaultStorageClass (sds-local-volume) └── stress-tests.go # Stress test runner ``` @@ -730,7 +760,8 @@ logger.Error("Failed to create resource: %v", err) | `TEST_CLUSTER_VIRTUAL_MACHINE_CLASS_NAME` | `generic` | VM class for VMs on the base cluster in `alwaysCreateNew`. If set to another name (DNS-1123 subdomain) and the class does not exist, it is created from `generic` with `spec.cpu.type: Host`, **`spec.nodeSelector` / `spec.tolerations` cleared**, sizing policies retained from template, labeled `storage-e2e.deckhouse.io/auto-created=true`, and left after cleanup | | `TEST_CLUSTER_CLEANUP` | `false` | Cleanup cluster after tests | | `LOG_LEVEL` | `debug` | Log level (debug/info/warn/error) | -| `KUBE_CONFIG_PATH` | - | Fallback kubeconfig path | +| `KUBE_CONFIG_PATH` | - | Explicit kubeconfig path. Used when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` from the master fails. If unset and SSH also fails, `GetKubeconfig` returns an error (no silent fallback to `~/.kube/config`). | +| `MODULE_IMAGE_TAG` (and any other custom name) | - | Any `${VAR}` placeholder used inside `modulePullOverride:` in `cluster_config.yml` is expanded at config load time by `internal/config/overrides.ExpandEnvInModulePullOverride`. Missing/empty placeholders fail fast with an explicit error so CI can point modules at `pr` / `mr` images via a single env var without editing the YAML between runs. | ### Commander Variables (only when `TEST_CLUSTER_CREATE_MODE=commander`) diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md index 64d4b4a..592a386 100644 --- a/docs/FUNCTIONS_GLOSSARY.md +++ b/docs/FUNCTIONS_GLOSSARY.md @@ -16,6 +16,7 @@ All exported functions available in the `pkg/` directory, grouped by resource. - [Pod](#pod) - [PVC (PersistentVolumeClaim)](#pvc-persistentvolumeclaim) - [StorageClass](#storageclass) +- [VolumeSnapshotClass](#volumesnapshotclass) - [BlockDevice](#blockdevice) - [LVMVolumeGroup](#lvmvolumegroup) - [LocalStorageClass](#localstorageclass) @@ -24,8 +25,18 @@ All exported functions available in the `pkg/` directory, grouped by resource. 
- [Secrets](#secrets) - [Modules](#modules) - [Retry](#retry) +- [Rook Config Override](#rook-config-override) +- [Ceph Credentials](#ceph-credentials) +- [CephCluster (Rook)](#cephcluster-rook) +- [CephBlockPool (Rook)](#cephblockpool-rook) +- [CephFilesystem (Rook)](#cephfilesystem-rook) +- [CephClusterConnection / CephClusterAuthentication (csi-ceph)](#cephclusterconnection--cephclusterauthentication-csi-ceph) +- [CephStorageClass (csi-ceph)](#cephstorageclass-csi-ceph) - [Default StorageClass (Testkit)](#default-storageclass-testkit) +- [Ceph StorageClass (Testkit)](#ceph-storageclass-testkit) +- [Ceph Cluster (Testkit) — no csi-ceph wiring](#ceph-cluster-testkit--no-csi-ceph-wiring) - [Stress Tests (Testkit)](#stress-tests-testkit) +- [Ceph CRC (Testkit)](#ceph-crc-testkit) --- @@ -137,6 +148,16 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `WaitForAllPodsReadyInNamespace(ctx, kubeconfig, namespace, timeout)` — Waits for all pods in a namespace to be in Ready condition. - `WaitForPodsStatus(ctx, clientset, namespace, labelSelector, status, expectedCount, maxAttempts, interval)` — Waits for pods matching a label selector to reach a specific status (Running, Completed, etc.). +`pkg/kubernetes/pod_exec.go` + +- `ExecInPod(ctx, kubeconfig, namespace, pod, container, cmd) (stdout, stderr, err)` — Runs a command inside a container via the apiserver's `pods/exec` subresource (SPDY). Returns stdout and stderr separately; the container must ship every binary referenced by `cmd`. Use this when the container has a usable shell/userland. +- `ReadFileFromPod(ctx, kubeconfig, namespace, pod, container, path)` — `ExecInPod` + `cat `. Convenience wrapper for non-distroless images. +- `ReadFileFromDistrolessPod(ctx, kubeconfig, namespace, pod, targetContainer, path, opts)` — Reads a file from a distroless / scratch container that ships no `cat`/`sh`/`tar`. Injects a short-lived ephemeral container (image from `opts.DebugImage`, defaults to `DefaultDebugImage = "busybox:1.36"`) with `targetContainerName=targetContainer`, polls until it goes Running (`opts.StartupTimeout`, defaults to 60s), then `cat /proc/1/root` — `/proc/1/root` is the kernel-exposed FS root of PID 1 in the target container, which the ephemeral container can see thanks to the shared PID namespace. Adding the ephemeral container goes through the dedicated `/pods//ephemeralcontainers` subresource, so existing containers and the pod sandbox are NOT restarted, `metadata.generation` is not bumped, and ReplicaSet/DaemonSet observation is unaffected — downstream rollout / `checksum/...` annotation assertions still see a clean signal. Caveat: ephemeral containers cannot be removed once added, but each call generates a unique name and the `sleep 60` command exits on its own; entries pile up in `pod.status.ephemeralContainerStatuses` until the next pod recycle. Internally a one-shot wrapper around `OpenDistrolessReader` + `(*DistrolessReader).ReadFile`. +- `OpenDistrolessReader(ctx, kubeconfig, namespace, pod, targetContainer, opts) (*DistrolessReader, error)` — Long-lived variant of `ReadFileFromDistrolessPod`: injects ONE ephemeral container (sleeps for `opts.SessionTTL`, defaults to `DefaultDistrolessSessionTTL` = 30 min) and returns a session that can serve arbitrarily many cheap reads. Use this for polling loops (e.g. `Eventually(...)` waiting for a file's content to flip) so the ephemeral-container cold start is paid once instead of per iteration. 
+- `(*DistrolessReader) ReadFile(ctx, path)` — `cat /proc/1/root` against the pre-injected ephemeral container. Cheap — just a `pods/exec` round-trip; no apiserver mutations. +- `(*DistrolessReader) PodName()` — Name of the pod this reader is bound to. Used by callers that need to detect rollouts (the pod name changes when the workload-controller recycles the pod) and re-`OpenDistrolessReader` against the new pod. +- `(*DistrolessReader) EphemeralName()` — Auto-generated name of the injected ephemeral container, mostly for logs. + ## PVC (PersistentVolumeClaim) `pkg/kubernetes/pvc.go` @@ -155,6 +176,17 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `GetStorageClass(ctx, kubeconfig, name)` — Returns the `*storagev1.StorageClass` with the given name, or `(nil, nil)` if it does not exist. - `SetGlobalDefaultStorageClass(ctx, kubeconfig, storageClassName)` — Updates the "global" ModuleConfig to set `spec.settings.storageClass` to the given name, making it the cluster default. +`pkg/kubernetes/storageclass_manage.go` + +- `CreateStorageClass(ctx, kubeconfig, cfg)` — Creates a `storage.k8s.io/v1 StorageClass` directly from `StorageClassCreateConfig` (`Name`, `Provisioner`, `Parameters`, `VolumeBindingMode`, `ReclaimPolicy`, `AllowExpansion`, `MakeDefault`, plus optional extra labels/annotations). When `MakeDefault=true` both the GA and beta `is-default-class` annotations are set. Idempotent: `AlreadyExists` is logged and treated as success. + +## VolumeSnapshotClass + +`pkg/kubernetes/volumesnapshotclass.go` + +- `CreateVolumeSnapshotClass(ctx, kubeconfig, cfg)` — Creates a `snapshot.storage.k8s.io/v1 VolumeSnapshotClass` from `VolumeSnapshotClassConfig` (`Name`, `Driver`, `DeletionPolicy` defaulting to `Delete`, `Parameters`, `MakeDefault`). Idempotent: `AlreadyExists` is logged and treated as success. +- `WaitForVolumeSnapshotClass(ctx, kubeconfig, name, timeout)` — Polls until the named VolumeSnapshotClass is Get-able. + ## BlockDevice `pkg/kubernetes/blockdevice.go` @@ -223,6 +255,69 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `IsSSHConnectionError(err)` — Checks if an error specifically indicates SSH connection failure requiring reconnection. - `WithRetryAfter(cfg, err)` — Returns a modified retry config that respects `RetryAfterSeconds` hints from Kubernetes API errors. +## Rook Config Override + +`pkg/kubernetes/rookconfigoverride.go` + +- `SetRookConfigOverride(ctx, kubeconfig, namespace, globals)` — Creates or updates the `rook-config-override` ConfigMap in the Rook operator namespace. The provided map is rendered under `[global]` and Rook picks it up into every Ceph daemon's `ceph.conf` (used for `ms_crc_data`, `bdev_enable_discard`, and similar knobs). Keys are sorted for stable output. +- `DeleteRookConfigOverride(ctx, kubeconfig, namespace)` — Removes the ConfigMap; safe if it does not exist. +- `RenderCephGlobalConfig(globals)` — Pure helper that renders a `[global]` section for `ceph.conf` from a `map[string]string`. Keys are sorted so the output is byte-stable across calls with logically-equivalent maps (used by `SetRookConfigOverride` to avoid spurious ConfigMap updates and by callers that need to compare the desired vs. live ConfigMap content before deciding to roll daemons). 
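A minimal usage sketch (not code from the repo) of how the Rook config-override helpers compose in a test. Assumptions beyond the glossary text: the package is importable as `github.com/deckhouse/storage-e2e/pkg/kubernetes`, `kubeconfig` is the kubeconfig path used elsewhere in the suite, `SetRookConfigOverride` returns only an `error`, `RenderCephGlobalConfig` returns the rendered string, and the Rook operator namespace is the Deckhouse `d8-sds-elastic`. The function name `seedCephOverrides` is hypothetical; verify the real signatures before copying.

```go
package example

import (
	"context"
	"fmt"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// seedCephOverrides pins ms_crc_data=false before the CephCluster is created.
// Namespace value and return shapes are assumptions made for this sketch.
func seedCephOverrides(ctx context.Context, kubeconfig string) error {
	globals := map[string]string{
		"ms_crc_data": "false",
	}

	// RenderCephGlobalConfig is a pure helper: useful for logging or diffing
	// the desired [global] section before touching the live ConfigMap.
	fmt.Printf("desired ceph.conf overrides:\n%s\n", kubernetes.RenderCephGlobalConfig(globals))

	// Creates or updates the rook-config-override ConfigMap; Rook propagates
	// the values into every Ceph daemon's ceph.conf.
	return kubernetes.SetRookConfigOverride(ctx, kubeconfig, "d8-sds-elastic", globals)
}
```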
+ +## Ceph Credentials + +`pkg/kubernetes/cephcredentials.go` + +- `WaitForCephCredentials(ctx, kubeconfig, namespace, timeout)` — Polls Rook's `rook-ceph-mon` Secret and `rook-ceph-mon-endpoints` ConfigMap until all pieces required to connect a CSI client to the cluster (`fsid`, admin user, admin key, monitor endpoints) are present. Returns a `*CephCredentials`. + +## CephCluster (Rook) + +`pkg/kubernetes/cephcluster.go` + +- `CreateCephCluster(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephCluster` CR using `CephClusterConfig` (image, mon/mgr counts, network provider, OSD storage class / count / size, data-dir host path, etc.). Idempotent. **Fail-fast:** if an existing CR has `metadata.deletionTimestamp != nil`, returns an error instead of trying to update a Terminating object (which would silently no-op and trap the next `WaitForCephClusterReady` for 15-20 minutes). +- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. **Fail-fast** when the CR comes back with `deletionTimestamp != nil` — there's no point waiting for Ready on a Terminating object. +- `DeleteCephCluster(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Does NOT garbage-collect OSD data on host disks. Pair with `WaitForCephClusterGone` if the next step depends on the CR being fully GC'd (e.g. before re-creating the cluster, or to detect a stuck `cephcluster.ceph.rook.io` finalizer early). +- `WaitForCephClusterGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR returns NotFound (default `CephClusterGoneTimeout` = 10m when timeout is 0). Logs deletionTimestamp / finalizers progress periodically, so a stuck finalizer (typical after a teardown that left dependents alive — see `DeletionIsBlocked`) is visible immediately instead of after a silent timeout. Fail-fast on timeout: does NOT auto-strip finalizers — investigate the cluster manually before re-running. + +## CephBlockPool (Rook) + +`pkg/kubernetes/cephblockpool.go` + +- `CreateCephBlockPool(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephBlockPool` from `CephBlockPoolConfig` (replicated with optional `requireSafeReplicaSize` override, or erasure-coded with `dataChunks`/`codingChunks`; `failureDomain`). **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. Fail-fast on `deletionTimestamp != nil`. +- `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Pair with `WaitForCephBlockPoolGone` to make sure the parent CephCluster's deletion isn't blocked by `ObjectHasDependents`. +- `WaitForCephBlockPoolGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR is GC'd (default `CephBlockPoolGoneTimeout` = 5m). Logs progress periodically. 
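The Gone-waiters above exist to enforce a delete order, so a hedged sketch of that order may help. Assumptions, not verified signatures: the same `github.com/deckhouse/storage-e2e/pkg/kubernetes` import path, `kubeconfig` as a path string, `time.Duration` timeouts, error-only returns, and `0` falling back to the documented Gone timeouts; `teardownPoolThenCluster` is a hypothetical name for illustration.

```go
package example

import (
	"context"
	"time"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// teardownPoolThenCluster: the CephBlockPool must be fully garbage-collected
// before the parent CephCluster is deleted, otherwise Rook records
// DeletionIsBlocked / ObjectHasDependents and the CR sticks in Terminating.
func teardownPoolThenCluster(ctx context.Context, kubeconfig, namespace, cluster, pool string) error {
	if err := kubernetes.DeleteCephBlockPool(ctx, kubeconfig, namespace, pool); err != nil {
		return err
	}
	// 0 is assumed to fall back to the documented CephBlockPoolGoneTimeout (5m).
	if err := kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, namespace, pool, 0); err != nil {
		return err
	}
	if err := kubernetes.DeleteCephCluster(ctx, kubeconfig, namespace, cluster); err != nil {
		return err
	}
	// Only now can the CephCluster finalizer clear without dependents in the way.
	return kubernetes.WaitForCephClusterGone(ctx, kubeconfig, namespace, cluster, 10*time.Minute)
}
```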
+ +## CephFilesystem (Rook) + +`pkg/kubernetes/cephfilesystem.go` + +- `CreateCephFilesystem(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephFilesystem` from `CephFilesystemConfig` (one replicated metadata pool + one replicated data pool, configurable `failureDomain`, `MetadataServerActiveCount`, optional `RequireSafeReplicaSize`). Idempotent. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. Fail-fast on `deletionTimestamp != nil`. +- `DeleteCephFilesystem(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Pair with `WaitForCephFilesystemGone` to make sure the parent CephCluster's deletion isn't blocked by `ObjectHasDependents`. +- `WaitForCephFilesystemGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR is GC'd (default `CephFilesystemGoneTimeout` = 5m). Logs progress periodically. +- `CephFSDataPoolFullName(fsName, dataPoolName)` — Returns the full Ceph pool name (`-`) that should be passed to `CephStorageClass.spec.cephFS.pool`. + +## CephClusterConnection / CephClusterAuthentication (csi-ceph) + +`pkg/kubernetes/cephclusterconnection.go` + +- `CreateCephClusterAuthentication(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterAuthentication` CR (`userID` + `userKey`) used by csi-ceph to log in to Ceph. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `DeleteCephClusterAuthentication(ctx, kubeconfig, name)` — Fire-and-forget delete; NotFound is treated as success. +- `WaitForCephClusterAuthenticationGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephClusterAuthenticationGoneTimeout` = 1m). +- `CreateCephClusterConnection(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterConnection` CR (`clusterID == fsid`, `monitors`, `userID`, `userKey`). `clusterID` is immutable: existing-resource updates leave it unchanged and only sync monitors/user. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `DeleteCephClusterConnection(ctx, kubeconfig, name)` — Fire-and-forget delete; NotFound is treated as success. +- `WaitForCephClusterConnectionGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephClusterConnectionGoneTimeout` = 1m). +- `WaitForCephClusterConnectionCreated(ctx, kubeconfig, name, timeout)` — Polls until csi-ceph reports `status.phase == "Created"` (credentials + monitors validated against the live Ceph cluster). + +## CephStorageClass (csi-ceph) + +`pkg/kubernetes/cephstorageclass.go` + +- `CreateCephStorageClass(ctx, kubeconfig, cfg)` — Creates or updates a csi-ceph `CephStorageClass` CR (RBD by default; CephFS when `Type == "CephFS"` and `CephFSName` / `CephFSPool` are set). The csi-ceph controller provisions a corresponding core `storage.k8s.io/v1 StorageClass` as a side effect. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `DeleteCephStorageClass(ctx, kubeconfig, name)` — Fire-and-forget delete; the controller removes the backing StorageClass. 
+- `WaitForCephStorageClassGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephStorageClassGoneTimeout` = 1m). +- `WaitForCephStorageClassCreated(ctx, kubeconfig, name, timeout)` — Polls until `status.phase == "Created"`. + ## Default StorageClass (Testkit) `pkg/testkit/storageclass.go` @@ -230,6 +325,20 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `CreateDefaultStorageClass(ctx, kubeconfig, cfg)` — High-level helper: discovers nodes, enables sds-node-configurator/sds-local-volume modules, labels nodes, optionally attaches VirtualDisks, creates LVMVolumeGroups (Thick or Thin with thin pool), creates LocalStorageClass, waits for StorageClass. Configured via `DefaultStorageClassConfig`. - `EnsureDefaultStorageClass(ctx, kubeconfig, cfg)` — Idempotent wrapper around `CreateDefaultStorageClass`. Checks if StorageClass already exists, skips creation if so, then sets it as the cluster default via "global" ModuleConfig. +## Ceph StorageClass (Testkit) + +`pkg/testkit/ceph.go` + +- `EnsureCephStorageClass(ctx, kubeconfig, cfg)` — High-level end-to-end helper that turns an empty test cluster into one with a working csi-ceph `StorageClass`. Steps: (1) enable `sds-node-configurator`, `sds-elastic`, `csi-ceph` modules and wait Ready; (2) optionally call `EnsureDefaultStorageClass` to auto-provision a sds-local-volume SC for OSDs when `OSDStorageClass` is empty; (3) seed `rook-config-override` with `GlobalCephConfigOverrides` (e.g. `ms_crc_data=false`); (4) create Rook `CephCluster` and wait Created; (5) create the backing pool primitive — `CephBlockPool` (when `Type == "RBD"`, default) or `CephFilesystem` (when `Type == "CephFS"`) — and wait Ready; (6) read fsid/monitors/admin-key from Rook-managed secrets; (7) wire csi-ceph by creating `CephClusterAuthentication` + `CephClusterConnection`; (8) create the matching `CephStorageClass` (RBD pool or `-` for CephFS) and wait for the backing core StorageClass. Idempotent; returns the resulting StorageClass name. +- `EnsureDefaultCephStorageClass(ctx, kubeconfig, cfg)` — `EnsureCephStorageClass` + `SetGlobalDefaultStorageClass` so new PVCs without an explicit `storageClassName` use the provisioned Ceph (RBD or CephFS) class. +- `TeardownCephStorageClass(ctx, kubeconfig, cfg)` — Reverse of `EnsureCephStorageClass`. After every Delete it now waits for the CR to be fully GC'd via the matching `WaitForXxxGone` helper. Order is: `CephStorageClass` → `CephClusterConnection` → `CephClusterAuthentication` → (`CephBlockPool` or `CephFilesystem` per `cfg.Type`) → `CephCluster` (unless `SkipClusterTeardown`) → `rook-config-override` ConfigMap. Without those waits the parent `CephCluster` would be deleted before its dependents are gone, Rook would record `DeletionIsBlocked / ObjectHasDependents`, and the next test run would either find a stuck Terminating CR or hang in `WaitForCephClusterReady`. Fail-fast on a Wait*Gone timeout: errors are aggregated and returned, no auto-strip of finalizers — investigate the cluster manually before re-running. NotFound is still treated as success; subsequent deletions are still attempted on partial failures. + +## Ceph Cluster (Testkit) — no csi-ceph wiring + +`pkg/testkit/ceph_cluster.go` + +- `EnsureCephCluster(ctx, kubeconfig, cfg)` — "Stop-before-csi-ceph" variant of `EnsureCephStorageClass`: brings up a Rook-managed Ceph cluster + CephBlockPool via sds-elastic alone. 
Steps: (1) enable `sds-node-configurator` + `sds-elastic` (does **not** enable `csi-ceph`); (2) resolve/provision OSD backing StorageClass (reuses `EnsureDefaultStorageClass`); (3) seed `rook-config-override` with `GlobalCephConfigOverrides`; (4) create Rook `CephCluster` and wait Created; (5) create `CephBlockPool` and wait Ready. Does not create `CephClusterConnection`/`CephClusterAuthentication`/`CephStorageClass`. Useful when tests need a live Ceph backend to talk to directly (e.g. from within csi-ceph's own e2e) without the testkit preselecting a csi-ceph-backed StorageClass. Idempotent; returns the pool name. + ## Stress Tests (Testkit) `pkg/testkit/stress-tests.go` @@ -239,3 +348,14 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `(*Config) Validate()` — Validates the stress test configuration (namespace, storage class, PVC size, mode-specific params). - `(*StressTestRunner) Run(ctx)` — Executes the stress test based on configured mode: flog, check_fs_only, check_cloning, check_restoring_from_snapshot, snapshot_only, or snapshot_resize_cloning. - `CleanupStressNamespaces(ctx, kubeconfig)` — Deletes all namespaces with the `load-test=true` label. + +## Ceph CRC (Testkit) + +`pkg/testkit/ceph_crc.go` + +- `EnableServerCRC(ctx, kubeconfig, namespace)` — Sets `ms_crc_data=true` on the server side: rewrites `rook-config-override` and rolling-restarts every Rook-managed Ceph daemon Deployment (mon/mgr/osd/mds/rgw) plus the rook-operator. Use when a test wants Ceph pinned in the explicit CRC-on state. Thin wrapper over `SetMsCrcDataOnServer(..., ptr(true))`. +- `DisableServerCRC(ctx, kubeconfig, namespace)` — Same as `EnableServerCRC` but flips Ceph into `ms_crc_data=false`. Paired with a csi-ceph client that defaults to `msCrcData=true` this reproduces the msCrcData matrix mismatch case. Thin wrapper over `SetMsCrcDataOnServer(..., ptr(false))`. +- `ResetServerCRCToDefault(ctx, kubeconfig, namespace)` — Removes `ms_crc_data` from `rook-config-override` so Ceph falls back to its compile-time default (`true`). Convenient for `AfterAll` / `AfterEach` restoration. Thin wrapper over `SetMsCrcDataOnServer(..., nil)`. +- `SetMsCrcDataOnServer(ctx, kubeconfig, namespace, enabled *bool)` — Lower-level primitive behind the three readability wrappers. Rewrites `rook-config-override` so that only `ms_crc_data=` ends up under `[global]` (`nil` removes the key entirely). Idempotent: when the ConfigMap already encodes the desired state, nothing is restarted. Otherwise it (1) rolling-restarts Rook-managed Ceph daemons via `RestartCephDaemons`, (2) restarts the rook-operator via `RestartRookOperator`, and (3) waits for every `CephFilesystem` in the namespace to come back to Ready. Prefer the named wrappers at call sites; this primitive exists so a boolean test parameter (e.g. a CRC matrix) doesn't have to branch. +- `RestartCephDaemons(ctx, kubeconfig, namespace, timeout)` — Rollout-restarts every Rook-managed Ceph daemon Deployment that consumes `/etc/ceph/ceph.conf` — the selector covers `rook-ceph-mon`, `rook-ceph-mgr`, `rook-ceph-osd`, `rook-ceph-mds`, `rook-ceph-rgw` — and waits for each to reach its desired Ready replica count. All five roles are bounced because a global ConfigMap knob like `ms_crc_data` lives in `ceph.conf` and any daemon left running with the old value (typically MDS) silently breaks the messenger handshake and degrades CephFS / blocks csi-cephfs PVCs in Pending. Operator restart is intentionally out of scope here — see `RestartRookOperator`. 
+- `RestartRookOperator(ctx, kubeconfig, namespace, timeout)` — Rollout-restarts the rook-operator Deployment in the given namespace and waits for the new pod to become Ready. Required after every wire-protocol bounce: the operator runs as a Ceph admin client (admin keyring + baked-in `ceph.conf`), and without a pod restart it keeps retrying with the stale `ceph.conf`, which surfaces in the cephcluster CR as `HEALTH_ERR` / `state: Error` until the next reconcile. Deckhouse-specific naming: the Deployment name is derived from the namespace by stripping the leading `d8-` prefix (`d8-sds-elastic` → `sds-elastic`). Vanilla Rook (`rook-ceph-operator` in `rook-ceph`) is not supported. diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md index e7d995f..3725397 100644 --- a/docs/WORKLOG.md +++ b/docs/WORKLOG.md @@ -55,3 +55,34 @@ All notable changes to this repository are documented here. New entries are appe - **Add** `.cursor/rules/todo-command.mdc`: `/todo` command for managing `docs/TODO.md` - **Add** `.cursor/rules/backward-compatibility.mdc`: rule to guard backward compatibility of exported `pkg/` API — ask before breaking changes, mark worklog with `[Possible compatibility break]` - **Add** `.cursor/rules/versatile-functions.mdc`: rule to ensure new functions are general-purpose and reusable — return data not decisions, no hardcoded names, compose from existing functions, no empty wrappers + +## 2026-05-05 + +- **Add** `internal/config/overrides.go` + `_test.go`: `ExpandEnvInModulePullOverride` resolves `${VAR}` placeholders in `modulePullOverride` at config load time; missing env fails fast with an explicit error so CI can point modules at `pr` / `mr` images via a single env var (`MODULE_IMAGE_TAG`) without editing `cluster_config.yml`. +- **Update** `internal/cluster/cluster.go::LoadClusterConfig` and `pkg/cluster/cluster.go::loadClusterConfigFromPath`: hook `ExpandEnvInModulePullOverride` right after `yaml.Unmarshal`. +- **Update** `README.md`: documented `${VAR}` form in `modulePullOverride` and the fail-fast behavior on unset env vars. +- **Refactor** `internal/config/env.go`: extracted `ApplyDefaults()` out of `ValidateEnvironment` so suites that don't call validation still get defaults for `SSH_VM_USER` / `SSH_PRIVATE_KEY` / `SSH_PUBLIC_KEY` / `TEST_CLUSTER_NAMESPACE` / `YAML_CONFIG_FILENAME` / `TEST_CLUSTER_CLEANUP`. +- **Update** `pkg/cluster/cluster.go::CreateTestCluster`: call `config.ApplyDefaults()` defensively + fall back to `config.YAMLConfigFilenameDefaultValue` when the filename arg is empty. +- **Bugfix** `pkg/cluster/setup.go::executeDhctlBootstrap`: pass `FORCE_NO_PRIVATE_KEYS=true` and `USE_AGENT_WITH_NO_PRIVATE_KEYS=true` env vars into the `dhctl bootstrap` container so `lib-connection` stops opening `/root/.ssh/id_rsa` and authenticates exclusively via the mounted ssh-agent socket — fixes "Failed to read private keys from flags" on passphrase-protected keys. +- **Bugfix** `pkg/cluster/vms.go::generateCloudInitUserData`: pin apt to `mirror.yandex.ru` and force IPv4 (`Acquire::ForceIPv4=true`) in cloud-init, so `package_update` and Docker install stop stalling when `archive.ubuntu.com` IPs are partially unreachable. +- **Refactor** `internal/infrastructure/ssh/client.go::StartTunnel` (both `*client` and `*jumpHostClient`): extracted shared `runTunnelLoop` + `tunnelDialer`. 
On dial failure that looks like a dropped SSH session, the loop now logs a visible WARN, calls the existing `reconnect()` (retry + exponential backoff), and retries the dial once with the freshly rebuilt session. Fixes the "test hangs 20 minutes silently after Wi-Fi flap" failure mode. +- **Add** `pkg/kubernetes/poll.go`: `pollResourceUntilReady` centralizes the `WaitFor*Ready` loops with a per-call `PollGetTimeout` (30s) on every Get and WARN logging once consecutive Get failures cross 3, so a dropped tunnel surfaces in seconds instead of after the 20-minute readyTimeout. +- **Refactor** `pkg/kubernetes/cephcluster.go`, `pkg/kubernetes/cephblockpool.go`, `pkg/kubernetes/cephfilesystem.go`: `WaitForCephClusterReady` / `WaitForCephBlockPoolReady` / `WaitForCephFilesystemReady` migrated to `pollResourceUntilReady`. Public signatures unchanged. +- **Add** `pkg/kubernetes/pod_exec.go`: `ExecInPod` (pods/exec via SPDY), `ReadFileFromPod` (`cat ` wrapper for non-distroless images), and `ReadFileFromDistrolessPod` (single-shot ephemeral container injection that reads through `/proc/1/root` thanks to the shared PID namespace; uses the dedicated `ephemeralcontainers` subresource so the target pod and its sandbox are NOT restarted and `metadata.generation` is not bumped — keeps downstream rollout assertions clean). +- **Add** `pkg/kubernetes/pod_exec.go::DistrolessReader` + `OpenDistrolessReader`: long-lived ephemeral-container session for cheap repeated reads. `(*DistrolessReader).ReadFile` is a plain `pods/exec` round-trip against the already-running ephemeral container; `(*DistrolessReader).PodName()` lets callers detect rollouts and re-open against the new pod. Pays the ephemeral-container cold start once instead of per `Eventually` iteration. +- **Add** `pkg/kubernetes/poll.go::pollResourceUntilGone` + per-CR `WaitForCephClusterGone` / `WaitForCephBlockPoolGone` / `WaitForCephFilesystemGone` / `WaitForCephClusterAuthenticationGone` / `WaitForCephClusterConnectionGone` / `WaitForCephStorageClassGone` helpers. Logs `deletionTimestamp` and finalizers progress periodically so a stuck finalizer is visible immediately. Fail-fast on timeout — no auto-strip of finalizers; the operator must investigate before re-running. +- **Update** Ceph CR `Create*` helpers (`CreateCephCluster` / `CreateCephBlockPool` / `CreateCephFilesystem` / `CreateCephClusterAuthentication` / `CreateCephClusterConnection` / `CreateCephStorageClass`) and `WaitFor*Ready`: now fail fast when the live object has `metadata.deletionTimestamp != nil`. Prevents the framework from updating a Terminating object (silent no-op) or waiting 20 minutes on Ready for an object that's being garbage-collected. +- **Refactor** `pkg/testkit/ceph.go::TeardownCephStorageClass`: explicitly `WaitFor*Gone` after every Delete in the right order (`CephStorageClass` → `CephClusterConnection` → `CephClusterAuthentication` → `CephBlockPool` or `CephFilesystem` → `CephCluster` → `rook-config-override`). Without these waits the parent `CephCluster` was deleted before its dependents were gone, Rook recorded `DeletionIsBlocked / ObjectHasDependents`, and the next test run either found a stuck Terminating CR or hung in `WaitForCephClusterReady`. Errors are aggregated; NotFound is treated as success. +- **Update** `pkg/testkit/ceph_crc.go::RestartCephDaemons`: extended the daemon selector from `mon,mgr,osd` to `mon,mgr,osd,mds,rgw`. 
A global `ms_crc_data` flip lives in `ceph.conf` and any unrestarted daemon (typically MDS) silently breaks the messenger handshake — degrades CephFS and pins csi-cephfs PVCs in Pending. `rgw` is included for forward-compat with future S3 tests. +- **Add** `pkg/testkit/ceph_crc.go::RestartRookOperator`: rollout-restarts the rook-operator Deployment after a wire-protocol bounce so it picks up the new `ceph.conf` instead of pinning the cephcluster CR in `HEALTH_ERR`. Deployment name is derived from the namespace by stripping the leading `d8-` prefix (Deckhouse module convention, e.g. `d8-sds-elastic` → `sds-elastic`); vanilla Rook is not supported. +- **Update** `pkg/testkit/ceph_crc.go::SetMsCrcDataOnServer`: after rewriting `rook-config-override` the helper now (1) calls `RestartCephDaemons` for the extended selector, (2) calls `RestartRookOperator`, then (3) waits for every `CephFilesystem` in the namespace to come back to Ready. This is what unblocks the CephFS half of the msCrcData matrix — previously a flip silently left MDS / operator out of sync. +- **Update** `docs/FUNCTIONS_GLOSSARY.md`: noted that the three `WaitForCeph*Ready` helpers now apply a per-call deadline and emit WARN on consecutive Get failures. +- **Update** `docs/ARCHITECTURE.md`: added `pkg/kubernetes/poll.go` to Section 1.1 and Section 3.6, added `pkg/kubernetes/cephfilesystem.go` (carry-over from the prior commit), added `internal/config/overrides.go` to Section 3.1. + +## 2026-05-06 + +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` fails, the function now fails fast unless `KUBE_CONFIG_PATH` is set explicitly. The previously considered fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) was dropped before release to preserve the original fail-fast contract — a silent fallback to the developer's personal kubeconfig is too risky in CI and on machines whose `kubectl` already points at an unrelated cluster. +- **Update** `docs/FUNCTIONS_GLOSSARY.md`: documented `OpenDistrolessReader` + `*DistrolessReader` methods, `CreateStorageClass`, `CreateVolumeSnapshotClass` / `WaitForVolumeSnapshotClass`, `RenderCephGlobalConfig`, and the full `pkg/testkit/ceph_crc.go` surface (`EnableServerCRC` / `DisableServerCRC` / `ResetServerCRCToDefault` / `SetMsCrcDataOnServer` / `RestartCephDaemons` / `RestartRookOperator`); added matching TOC entries. +- **Update** `docs/ARCHITECTURE.md`: added `internal/config/overrides.go` to Section 1.1 (was only in Section 3.1), added `pkg/kubernetes/pod_exec.go` to Section 1.1 and Section 3.6, documented `KUBE_CONFIG_PATH` semantics and `${VAR}` expansion (`MODULE_IMAGE_TAG`) in Section 7. +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when the SSH-side kubeconfig fetch fails and `KUBE_CONFIG_PATH` is unset, the function now runs two cheap probes (`test -f /etc/kubernetes/{super-admin,admin}.conf` for existence, then `sudo -n -l /bin/cat ` for a matching NOPASSWD rule) to classify the failure and returns a structured, actionable error. The error embeds a ready-to-paste `/etc/sudoers.d/e2e-kubeconfig` snippet for the most common cause (passworded sudo on the master) and a `KUBE_CONFIG_PATH` escape hatch. Original SSH error is still wrapped via `%w` so `errors.Is`/`errors.As` keep working. 
+- **Bugfix** `internal/cluster/cluster.go::getKubeconfigRemoteShell`: dropped the `sudo -n sh -c '...'` wrapper and now invokes `sudo -n /bin/cat ` directly (with a `||` fallback from `super-admin.conf` to `admin.conf`). The wrapper made the privileged binary `/bin/sh`, so the recommended fine-grained `NOPASSWD: /bin/cat /etc/kubernetes/{super-admin,admin}.conf` sudoers rule did not match and `GetKubeconfig` failed with "sudo on the master requires a password" even after the operator pasted the recommended snippet. Aligned the diagnostic probe in `classifyKubeconfigFetchFailure` to `sudo -n -l /bin/cat ` for the same reason — `sudo -n true` returned 0 under common `NOPASSWD: ALL` configurations and would mask the real problem on hosts that only allow `cat`. diff --git a/go.mod b/go.mod index 764c44d..698c7ce 100644 --- a/go.mod +++ b/go.mod @@ -35,13 +35,16 @@ require ( github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kr/fs v0.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/openshift/custom-resource-status v1.1.2 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/spf13/pflag v1.0.7 // indirect diff --git a/go.sum b/go.sum index e68a8bc..5089189 100644 --- a/go.sum +++ b/go.sum @@ -5,6 +5,8 @@ github.com/Masterminds/semver/v3 v3.3.1/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lpr github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -107,6 +109,8 @@ github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2c github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= 
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -133,6 +137,8 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= +github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -143,6 +149,7 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWu github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 1cd7469..f5acf09 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -48,6 +48,7 @@ import ( "github.com/deckhouse/storage-e2e/internal/config" "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" + "github.com/deckhouse/storage-e2e/internal/logger" ) // LoadClusterConfig loads and validates a cluster configuration from a YAML file @@ -73,6 +74,14 @@ func LoadClusterConfig(configFilename string) (*config.ClusterDefinition, error) return nil, fmt.Errorf("failed to parse YAML config: %w", err) } + // Expand ${VAR} placeholders in modulePullOverride fields. CI uses this to + // pass a per-PR/MR image tag via a single env var (e.g. MODULE_IMAGE_TAG) + // without editing the YAML between runs. Missing envs fail fast here so we + // don't silently regress to "main" on accidentally unset variables. + if err := config.ExpandEnvInModulePullOverride(&clusterDef); err != nil { + return nil, fmt.Errorf("expand env in modulePullOverride: %w", err) + } + // Validate the configuration if err := validateClusterConfig(&clusterDef); err != nil { return nil, fmt.Errorf("config validation failed: %w", err) @@ -182,7 +191,20 @@ func expandPath(path string) (string, error) { // getKubeconfigRemoteShell prints kubeconfig for use with client-go. It prefers // /etc/kubernetes/super-admin.conf (Kubernetes 1.29+ unified kubeconfig) when the file // exists, and falls back to /etc/kubernetes/admin.conf otherwise. 
-const getKubeconfigRemoteShell = "sudo -n sh -c 'if [ -f /etc/kubernetes/super-admin.conf ]; then cat /etc/kubernetes/super-admin.conf; else cat /etc/kubernetes/admin.conf; fi'" +// +// The two `sudo -n /bin/cat ...` invocations are intentionally NOT wrapped in +// `sudo -n sh -c '...'`. With a wrapper the privileged binary is /bin/sh, so a +// minimal sudoers rule of the form +// +// user ALL=(root) NOPASSWD: /bin/cat /etc/kubernetes/super-admin.conf, /bin/cat /etc/kubernetes/admin.conf +// +// would NOT match and sudo would still ask for a password. By calling /bin/cat +// directly we make this command work with the same fine-grained NOPASSWD rule +// that the buildKubeconfigFetchError diagnostic recommends. The 2>/dev/null on +// the first try suppresses the "permission denied / no such file" noise so the +// fallback to admin.conf produces clean kubeconfig content on stdout. +const getKubeconfigRemoteShell = "sudo -n /bin/cat /etc/kubernetes/super-admin.conf 2>/dev/null " + + "|| sudo -n /bin/cat /etc/kubernetes/admin.conf" // GetKubeconfig connects to the master node via SSH, retrieves kubeconfig (preferring // super-admin.conf over admin.conf when available), and returns a rest.Config that can @@ -216,35 +238,56 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien kubeconfigPath := filepath.Join(outputDir, fmt.Sprintf("kubeconfig-%s.yml", masterIP)) - var kubeconfigContent []byte + var ( + kubeconfigContent []byte + // kubeconfigSource is a short, human-readable tag identifying where the + // kubeconfig came from. It's printed at the end of GetKubeconfig so it + // is always obvious in test logs which cluster we're actually about to + // hit — important after diagnosing wrong-cluster bugs that look like + // "stale lock" or "unexpected modules". + kubeconfigSource string + ) // Read kubeconfig via SSH: prefer super-admin.conf when present (see getKubeconfigRemoteShell). - kubeconfigContentStr, err := sshClient.Exec(ctx, getKubeconfigRemoteShell) - if err != nil { - // SSH retrieval failed (likely due to sudo password requirement) - // Try to use KUBE_CONFIG_PATH if set, otherwise notify user - if config.KubeConfigPath != "" { - // Expand path to handle ~ and resolve symlinks if present - resolvedPath, err := expandPath(config.KubeConfigPath) - if err != nil { - return nil, "", fmt.Errorf("failed to expand KUBE_CONFIG_PATH (%s): %w", config.KubeConfigPath, err) - } - // Read kubeconfig content from the provided file - kubeconfigContent, err = os.ReadFile(resolvedPath) - if err != nil { - return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, err) - } - } else { - // KUBE_CONFIG_PATH not set, notify user and fail - return nil, "", fmt.Errorf("failed to read kubeconfig from master (this may occur if sudo requires a password). "+ - "Please download the kubeconfig file manually and provide its full path via KUBE_CONFIG_PATH environment variable. "+ - "Original error: %w", err) - } - } else { + kubeconfigContentStr, sshErr := sshClient.Exec(ctx, getKubeconfigRemoteShell) + switch { + case sshErr == nil: // SSH succeeded - use the content from SSH kubeconfigContent = []byte(kubeconfigContentStr) + kubeconfigSource = fmt.Sprintf("SSH(%s@%s:/etc/kubernetes/{super-admin,admin}.conf)", user, masterIP) + + case config.KubeConfigPath != "": + // SSH retrieval failed (likely due to sudo password requirement) and the + // caller pointed us at a specific kubeconfig file via KUBE_CONFIG_PATH. 
+ resolvedPath, expandErr := expandPath(config.KubeConfigPath) + if expandErr != nil { + return nil, "", fmt.Errorf("failed to expand KUBE_CONFIG_PATH (%s): %w", config.KubeConfigPath, expandErr) + } + readContent, readErr := os.ReadFile(resolvedPath) + if readErr != nil { + return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, readErr) + } + kubeconfigContent = readContent + kubeconfigSource = fmt.Sprintf("KUBE_CONFIG_PATH=%s", resolvedPath) + + default: + // SSH failed and the caller did not opt into a specific kubeconfig via + // KUBE_CONFIG_PATH. Fail fast rather than silently picking up the + // developer's ~/.kube/config / $KUBECONFIG, which has historically + // caused tests to acquire stale locks on unrelated SAN clusters or + // deploy modules against the wrong stand. Classify the failure so the + // returned error tells the operator which knob to turn. + cause := classifyKubeconfigFetchFailure(ctx, sshClient) + return nil, "", buildKubeconfigFetchError(user, masterIP, sshErr, cause) } + // Always stamp the kubeconfig source + the resulting current-context/server + // in the log. With this single line a developer reading the output knows + // for sure which cluster the test is about to talk to, regardless of which + // of the three resolution paths fired above. + finalCtx, finalServer := kubeconfigContextSummary(kubeconfigContent) + logger.Info("Loaded kubeconfig (source=%s, current-context=%q, server=%q)", kubeconfigSource, finalCtx, finalServer) + // Write kubeconfig content to file (always write a working copy, regardless of source) kubeconfigFile, err := os.Create(kubeconfigPath) if err != nil { @@ -348,3 +391,128 @@ func UpdateKubeconfigPort(kubeconfigPath string, localPort int) error { return nil } + +// kubeconfigContextSummary parses a serialized kubeconfig and returns its +// current-context name and the matching cluster's `server:` URL. Used purely +// for human-readable log lines that identify which cluster the test is about +// to talk to. On any parse failure the helper returns "" / "" +// rather than an error: failing here would defeat its only purpose, which is +// to make the surrounding log message safer to print under partial failures. +func kubeconfigContextSummary(content []byte) (currentContext, server string) { + currentContext = "" + server = "" + if len(content) == 0 { + return + } + cfg, err := clientcmd.Load(content) + if err != nil || cfg == nil { + return + } + if cfg.CurrentContext != "" { + currentContext = cfg.CurrentContext + } + if ctx, ok := cfg.Contexts[cfg.CurrentContext]; ok && ctx != nil { + if cl, ok := cfg.Clusters[ctx.Cluster]; ok && cl != nil && cl.Server != "" { + server = cl.Server + } + } + return +} + +// kubeconfigFetchCause discriminates the most likely reason +// getKubeconfigRemoteShell exited non-zero. Used solely to choose the +// human-readable error template — the original SSH error is always +// preserved via %w wrapping, so callers' errors.Is/errors.As keep working. +type kubeconfigFetchCause int + +const ( + causeUnknown kubeconfigFetchCause = iota + causeSudoPasswordRequired + causeKubeconfigMissing +) + +// classifyKubeconfigFetchFailure runs two cheap probes against the master +// to figure out the most likely reason getKubeconfigRemoteShell failed. +// Best-effort: any probe-time error is treated as "unknown" rather than +// surfaced — we are already in an error path and the original sshErr is +// what callers care about. 
+// +// Order matters and matches what we actually need to know: +// 1. Do the kubeconfig files even exist on this host? `test -f` runs as +// the SSH user without sudo and returns 0 even when the file is +// root:root 0600, because it only checks the inode. If both files are +// missing this is almost certainly a non-control-plane node and no +// sudoers tweak will help. +// 2. If at least one file exists, are we allowed to `cat` it without a +// password? We probe with `sudo -n -l /bin/cat `: -l makes sudo +// just look up the rule (no execution), and with -n it exits non-zero +// when no matching NOPASSWD rule applies. Crucially this matches the +// SAME granular rule the diagnostic recommends, so a misconfiguration +// where the operator added `NOPASSWD: /bin/sh` (or only NOPASSWD: ALL) +// does NOT mask the real "missing /bin/cat rule" cause. +func classifyKubeconfigFetchFailure(ctx context.Context, sshClient ssh.SSHClient) kubeconfigFetchCause { + if _, err := sshClient.Exec(ctx, + "test -f /etc/kubernetes/super-admin.conf || test -f /etc/kubernetes/admin.conf"); err != nil { + return causeKubeconfigMissing + } + if _, err := sshClient.Exec(ctx, + "sudo -n -l /bin/cat /etc/kubernetes/super-admin.conf >/dev/null 2>&1 || "+ + "sudo -n -l /bin/cat /etc/kubernetes/admin.conf >/dev/null 2>&1"); err != nil { + return causeSudoPasswordRequired + } + return causeUnknown +} + +// buildKubeconfigFetchError renders an actionable, multi-line error for +// the caller to print. Each branch lists the same kind of remediation +// (sudoers tweak, KUBE_CONFIG_PATH escape, SSH check) but in the order +// most relevant for the detected cause. The returned error always wraps +// the original sshErr so errors.Is(err, &ssh.ExitError{...}) still works. +func buildKubeconfigFetchError(user, masterIP string, sshErr error, cause kubeconfigFetchCause) error { + sudoersLine := fmt.Sprintf( + "%s ALL=(root) NOPASSWD: /bin/cat /etc/kubernetes/super-admin.conf, /bin/cat /etc/kubernetes/admin.conf", + user, + ) + sudoersFix := "echo '" + sudoersLine + "' | sudo tee /etc/sudoers.d/e2e-kubeconfig && sudo chmod 0440 /etc/sudoers.d/e2e-kubeconfig" + + switch cause { + case causeSudoPasswordRequired: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s): "+ + "sudo on the master requires a password (sudo -n exited non-zero).\n"+ + "Pick ONE remedy:\n"+ + " 1) Allow passwordless cat of the two kubeconfig files (run on the master):\n"+ + " %s\n"+ + " 2) Point the test at a local kubeconfig instead (no SSH/sudo at all):\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + "Original SSH error: %w", + user, masterIP, sudoersFix, sshErr) + + case causeKubeconfigMissing: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s): "+ + "neither /etc/kubernetes/super-admin.conf nor /etc/kubernetes/admin.conf exists on the host — "+ + "this looks like a non-control-plane node.\n"+ + "Pick ONE remedy:\n"+ + " 1) Make sure SSH_HOST points at a Kubernetes control-plane (master) node "+ + "(check SSH_HOST/SSH_USER, and SSH_JUMP_HOST if you use one).\n"+ + " 2) Set KUBE_CONFIG_PATH to a kubeconfig file on your local machine:\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + "Original SSH error: %w", + user, masterIP, sshErr) + + default: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s) "+ + "and KUBE_CONFIG_PATH is not set.\n"+ + "Pick ONE remedy:\n"+ + " 1) If sudo on the master requires a password, allow passwordless cat of the kubeconfig files:\n"+ + 
" %s\n"+ + " 2) Set KUBE_CONFIG_PATH to a kubeconfig file on your local machine:\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + " 3) Fix SSH credentials so the master is reachable as %s with key-based auth.\n"+ + "Original SSH error: %w", + user, masterIP, sudoersFix, user, sshErr) + } +} + diff --git a/internal/config/env.go b/internal/config/env.go index 23b3a0a..37dcae6 100644 --- a/internal/config/env.go +++ b/internal/config/env.go @@ -242,8 +242,16 @@ func EffectiveVirtualMachineClassName() string { return n } -func ValidateEnvironment() error { - // Default values for environment variables +// ApplyDefaults populates package-level config variables that have a documented +// default value but were not provided through the environment. It is idempotent +// and safe to call multiple times. +// +// Suites that don't call ValidateEnvironment() (because they don't need its +// required-variable checks) should still call ApplyDefaults() — otherwise +// optional variables like SSH_VM_USER stay empty and propagate as user="" all +// the way to the SSH server, where it shows up as "Invalid user" / publickey +// rejection that is hard to attribute to a missing default. +func ApplyDefaults() { if YAMLConfigFilename == "" { YAMLConfigFilename = YAMLConfigFilenameDefaultValue } @@ -264,6 +272,10 @@ func ValidateEnvironment() error { if TestClusterNamespace == "" { TestClusterNamespace = TestClusterNamespaceDefaultValue } +} + +func ValidateEnvironment() error { + ApplyDefaults() TestClusterVirtualMachineClassName = strings.TrimSpace(TestClusterVirtualMachineClassName) if TestClusterVirtualMachineClassName == "" { diff --git a/internal/config/overrides.go b/internal/config/overrides.go new file mode 100644 index 0000000..5eed2fc --- /dev/null +++ b/internal/config/overrides.go @@ -0,0 +1,68 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "fmt" + "os" + "regexp" +) + +// envVarRefPattern matches ${NAME} placeholders. We accept only the braced +// form (no bare $NAME) to keep substitution intent explicit and avoid +// accidentally rewriting tags that legitimately contain a dollar sign. +var envVarRefPattern = regexp.MustCompile(`\$\{([A-Za-z_][A-Za-z0-9_]*)\}`) + +// ExpandEnvInModulePullOverride expands ${VAR} references in each module's +// ModulePullOverride field. If a referenced env var is not set, returns an +// error pointing at the offending module so CI fails loudly instead of +// silently falling back to the "main" default in configureModulePullOverride. +// +// This lets test suites declare in YAML which modules should track a CI-built +// image without hard-coding any tag: +// +// modules: +// - name: csi-ceph +// modulePullOverride: "${MODULE_IMAGE_TAG}" +// +// CI then sets MODULE_IMAGE_TAG=pr (GitHub) or mr (GitLab), and the +// resulting ModulePullOverride CR points at the right image without anyone +// editing the YAML per run. +// +// Use this hook right after yaml.Unmarshal of cluster_config.yml. 
Modules +// without any placeholder are left untouched. +func ExpandEnvInModulePullOverride(def *ClusterDefinition) error { + for _, m := range def.DKPParameters.Modules { + if m == nil || m.ModulePullOverride == "" { + continue + } + matches := envVarRefPattern.FindAllStringSubmatch(m.ModulePullOverride, -1) + if len(matches) == 0 { + continue + } + for _, ms := range matches { + if _, ok := os.LookupEnv(ms[1]); !ok { + return fmt.Errorf( + "module %q references env var ${%s} in modulePullOverride but it is not set", + m.Name, ms[1], + ) + } + } + m.ModulePullOverride = os.Expand(m.ModulePullOverride, os.Getenv) + } + return nil +} diff --git a/internal/config/overrides_test.go b/internal/config/overrides_test.go new file mode 100644 index 0000000..dfe79e6 --- /dev/null +++ b/internal/config/overrides_test.go @@ -0,0 +1,149 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "os" + "strings" + "testing" +) + +func TestExpandEnvInModulePullOverride_NoPlaceholder(t *testing.T) { + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "snapshot-controller", ModulePullOverride: "main"}, + {Name: "csi-ceph", ModulePullOverride: ""}, + {Name: "sds-elastic"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "main" { + t.Errorf("snapshot-controller: got %q, want %q", got, "main") + } + if got := def.DKPParameters.Modules[1].ModulePullOverride; got != "" { + t.Errorf("csi-ceph: got %q, want empty", got) + } + if got := def.DKPParameters.Modules[2].ModulePullOverride; got != "" { + t.Errorf("sds-elastic: got %q, want empty", got) + } +} + +func TestExpandEnvInModulePullOverride_Expands(t *testing.T) { + t.Setenv("MODULE_IMAGE_TAG", "pr131") + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${MODULE_IMAGE_TAG}"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "pr131" { + t.Errorf("got %q, want %q", got, "pr131") + } +} + +func TestExpandEnvInModulePullOverride_MissingEnvFails(t *testing.T) { + // Use t.Setenv to register cleanup that restores the original value (if + // any) after the test, then os.Unsetenv to actually drop it for this run. 
+ const name = "MISSING_TAG_FOR_TEST" + t.Setenv(name, "anything") + if err := os.Unsetenv(name); err != nil { + t.Fatalf("os.Unsetenv: %v", err) + } + + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "snapshot-controller", ModulePullOverride: "main"}, + {Name: "csi-ceph", ModulePullOverride: "${" + name + "}"}, + }, + }, + } + err := ExpandEnvInModulePullOverride(def) + if err == nil { + t.Fatalf("expected error for missing env, got nil") + } + if !strings.Contains(err.Error(), "csi-ceph") { + t.Errorf("error should mention offending module name, got: %v", err) + } + if !strings.Contains(err.Error(), name) { + t.Errorf("error should mention env var name %q, got: %v", name, err) + } +} + +func TestExpandEnvInModulePullOverride_PerModuleEnvs(t *testing.T) { + t.Setenv("CSI_CEPH_TAG", "pr131") + t.Setenv("SDS_ELASTIC_TAG", "mr41") + + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${CSI_CEPH_TAG}"}, + {Name: "sds-elastic", ModulePullOverride: "${SDS_ELASTIC_TAG}"}, + {Name: "snapshot-controller", ModulePullOverride: "main"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "pr131" { + t.Errorf("csi-ceph: got %q, want %q", got, "pr131") + } + if got := def.DKPParameters.Modules[1].ModulePullOverride; got != "mr41" { + t.Errorf("sds-elastic: got %q, want %q", got, "mr41") + } + if got := def.DKPParameters.Modules[2].ModulePullOverride; got != "main" { + t.Errorf("snapshot-controller: got %q, want %q", got, "main") + } +} + +func TestExpandEnvInModulePullOverride_MultiplePlaceholdersInOneString(t *testing.T) { + t.Setenv("PREFIX", "branch") + t.Setenv("NAME", "ms-crc") + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${PREFIX}-${NAME}"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "branch-ms-crc" { + t.Errorf("got %q, want %q", got, "branch-ms-crc") + } +} + +func TestExpandEnvInModulePullOverride_NilModuleSliceEntry(t *testing.T) { + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{nil, {Name: "csi-ceph", ModulePullOverride: "main"}}, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/internal/infrastructure/ssh/client.go b/internal/infrastructure/ssh/client.go index e2a24d1..a9654b5 100644 --- a/internal/infrastructure/ssh/client.go +++ b/internal/infrastructure/ssh/client.go @@ -404,94 +404,20 @@ func (c *client) reconnect(ctx context.Context) error { // StartTunnel starts an SSH tunnel with port forwarding from local to remote // It returns a function to stop the tunnel and an error if the tunnel fails to start func (c *client) StartTunnel(ctx context.Context, localPort, remotePort string) (func() error, error) { - // Check context before starting - if err := ctx.Err(); err != nil { - return nil, fmt.Errorf("context error before starting tunnel: %w", err) - } - - listener, err := net.Listen("tcp", "127.0.0.1:"+localPort) - if err != nil { - return nil, fmt.Errorf("failed to listen on local port %s: %w", localPort, err) - } - - stopChan := make(chan struct{}) - - go func() { - defer 
listener.Close() - for { - // Check context and stop channel - select { - case <-ctx.Done(): - return - case <-stopChan: - return - default: - } - - // Set deadline for Accept based on context deadline if available - if deadline, ok := ctx.Deadline(); ok { - if err := listener.(*net.TCPListener).SetDeadline(deadline); err != nil { - // If setting deadline fails, continue without it - } - } - - localConn, err := listener.Accept() - if err != nil { - // Listener closed or error occurred - select { - case <-ctx.Done(): - return - case <-stopChan: - return - default: - // Continue if not stopped - continue - } + dialer := tunnelDialer{ + describe: fmt.Sprintf("%s@%s local:%s -> remote:%s", c.user, c.host, localPort, remotePort), + dial: func() (net.Conn, error) { + c.mu.Lock() + sc := c.sshClient + c.mu.Unlock() + if sc == nil { + return nil, fmt.Errorf("ssh client is not initialized") } - - go func() { - defer localConn.Close() - remoteConn, err := c.sshClient.Dial("tcp", "127.0.0.1:"+remotePort) - if err != nil { - // Connection failed, just return - the error will be visible to the client - return - } - defer remoteConn.Close() - - // Copy data bidirectionally with context support - done := make(chan struct{}, 2) - go func() { - _, _ = copyWithContext(ctx, localConn, remoteConn) - done <- struct{}{} - }() - go func() { - _, _ = copyWithContext(ctx, remoteConn, localConn) - done <- struct{}{} - }() - - // Wait for either direction to finish or context cancellation - select { - case <-ctx.Done(): - return - case <-done: - // One direction finished, wait for the other - select { - case <-ctx.Done(): - return - case <-done: - // Both directions finished - } - } - }() - } - }() - - stop := func() error { - close(stopChan) - return listener.Close() + return sc.Dial("tcp", "127.0.0.1:"+remotePort) + }, + reconnect: c.reconnect, } - - return stop, nil + return runTunnelLoop(ctx, localPort, dialer) } // Exec executes a command on the remote host with automatic retry and reconnection @@ -667,7 +593,7 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo } // Create SSH config for target host - targetConfig, _, err := createSSHConfig(targetUser, targetKeyPath) + targetConfig, targetKeyInfo, err := createSSHConfig(targetUser, targetKeyPath) if err != nil { jumpClient.Close() return nil, fmt.Errorf("failed to create SSH config for target host: %w", err) @@ -696,14 +622,22 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo targetConn, err := jumpClient.Dial("tcp", targetAddr) if err != nil { - lastErr = fmt.Errorf("failed to dial target host %s@%s through jump host: %w", targetUser, targetAddr, err) + lastErr = fmt.Errorf("failed to dial target host %q@%s through jump host %q@%s: %w", + targetUser, targetAddr, jumpUser, jumpAddr, err) continue } targetClientConn, targetChans, targetReqs, err := ssh.NewClientConn(targetConn, targetAddr, targetConfig) if err != nil { targetConn.Close() - lastErr = fmt.Errorf("failed to establish SSH connection to target host: %w", err) + lastErr = fmt.Errorf( + "failed to establish SSH connection to target host %q@%s (via jump %q@%s): %w\n"+ + " Key used: %s (algorithm: %s, fingerprint: %s)\n"+ + " Hint: verify SSH_VM_USER (current=%q) is correct for this VM image and that the key's public part is in %s@%s:~/.ssh/authorized_keys", + targetUser, targetAddr, jumpUser, jumpAddr, err, + targetKeyInfo.Path, targetKeyInfo.Algorithm, targetKeyInfo.Fingerprint, + targetUser, targetUser, targetAddr, + ) continue } @@ 
-713,7 +647,8 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo if targetClient == nil { jumpClient.Close() - return nil, fmt.Errorf("failed to connect to target host after %d attempts: %w", maxRetries, lastErr) + return nil, fmt.Errorf("failed to connect to target host %q@%s after %d attempts: %w", + targetUser, targetAddr, maxRetries, lastErr) } // Start keepalive for both connections @@ -906,17 +841,65 @@ func (c *jumpHostClient) reconnect(ctx context.Context) error { return fmt.Errorf("failed to reconnect after %d attempts: %w", config.SSHRetryCount, lastErr) } -// StartTunnel starts an SSH tunnel with port forwarding from local to remote +// StartTunnel starts an SSH tunnel with port forwarding from local to remote. +// Like the non-jump-host variant, dial errors that look like a dropped SSH +// session trigger a reconnect attempt against jump+target before the next +// retry — Wi-Fi flaps on the developer's laptop are by far the most common +// way for the tunnel to die mid-test. func (c *jumpHostClient) StartTunnel(ctx context.Context, localPort, remotePort string) (func() error, error) { - // Use the target client's StartTunnel method - // We need to access the underlying client's StartTunnel - // Since we can't directly call it, we'll implement it here - return startTunnelOnClient(ctx, c.targetClient, localPort, remotePort) + dialer := tunnelDialer{ + describe: fmt.Sprintf("%s@%s via jump %s@%s local:%s -> remote:%s", + c.targetUser, c.targetHost, c.jumpUser, c.jumpHost, localPort, remotePort), + dial: func() (net.Conn, error) { + c.mu.Lock() + tc := c.targetClient + c.mu.Unlock() + if tc == nil { + return nil, fmt.Errorf("jump-host target client is not initialized") + } + return tc.Dial("tcp", "127.0.0.1:"+remotePort) + }, + reconnect: c.reconnect, + } + return runTunnelLoop(ctx, localPort, dialer) +} + +// tunnelDialer abstracts the per-tunnel concerns that runTunnelLoop needs to +// know about: how to open a fresh remote connection through the active SSH +// session, how to re-establish that session when it dies, and a human-readable +// description for log messages. +type tunnelDialer struct { + // describe identifies the tunnel in WARN/INFO logs. It should encode user, + // host(s) and ports — enough to distinguish concurrent tunnels. + describe string + // dial opens a fresh TCP connection to the remote endpoint via the *current* + // SSH client. Implementations must read the underlying *ssh.Client under + // whatever mutex guards it (so reconnect updates are visible). + dial func() (net.Conn, error) + // reconnect tries to rebuild the broken SSH session(s). Called once per + // accepted local connection when dial fails with a connection-style error. + // May itself perform retries with backoff. + reconnect func(ctx context.Context) error } -// startTunnelOnClient starts a tunnel on a raw ssh.Client -func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, remotePort string) (func() error, error) { - // Check context before starting +// runTunnelLoop runs the accept loop for an SSH tunnel. +// +// Compared to the previous inline implementation it adds two things: +// +// 1. **Auto-reconnect on dial failure.** When sshClient.Dial returns a +// connection-style error (EOF, connection lost, broken pipe…) we kick +// dialer.reconnect and retry the dial once with the freshly rebuilt +// SSH session. 
Without this, a Wi-Fi flap on the developer's laptop +// killed the SSH session permanently, the tunnel listener stayed up +// happily accepting local connects, but every Dial through the dead +// session returned EOF — and the test process spent the entire 20-min +// readiness timeout silently retrying client-go GETs through a port +// that nobody answered. See poll.go for the related per-call deadline. +// 2. **Visible WARN log when reconnect kicks in.** Previously the failure +// was swallowed (`return`); now we emit a WARN every time the tunnel +// has to be rebuilt so users can correlate "tests slowed down" with +// "wifi flapped". +func runTunnelLoop(ctx context.Context, localPort string, dialer tunnelDialer) (func() error, error) { if err := ctx.Err(); err != nil { return nil, fmt.Errorf("context error before starting tunnel: %w", err) } @@ -931,7 +914,6 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, go func() { defer listener.Close() for { - // Check context and stop channel select { case <-ctx.Done(): return @@ -940,63 +922,26 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, default: } - // Set deadline for Accept based on context deadline if available - if deadline, ok := ctx.Deadline(); ok { - if tcpListener, ok := listener.(*net.TCPListener); ok { - if err := tcpListener.SetDeadline(deadline); err != nil { - // If setting deadline fails, continue without it - } - } + // Short Accept deadline so the loop can re-check ctx/stopChan + // promptly even when no clients are connecting; a deadline tied + // to ctx.Deadline() fired only at the very end of the test. + if tcpListener, ok := listener.(*net.TCPListener); ok { + _ = tcpListener.SetDeadline(time.Now().Add(500 * time.Millisecond)) } localConn, err := listener.Accept() if err != nil { - // Listener closed or error occurred select { case <-ctx.Done(): return case <-stopChan: return default: - // Continue if not stopped continue } } - go func() { - defer localConn.Close() - remoteConn, err := sshClient.Dial("tcp", "127.0.0.1:"+remotePort) - if err != nil { - // Connection failed, just return - the error will be visible to the client - return - } - defer remoteConn.Close() - - // Copy data bidirectionally with context support - done := make(chan struct{}, 2) - go func() { - _, _ = copyWithContext(ctx, localConn, remoteConn) - done <- struct{}{} - }() - go func() { - _, _ = copyWithContext(ctx, remoteConn, localConn) - done <- struct{}{} - }() - - // Wait for either direction to finish or context cancellation - select { - case <-ctx.Done(): - return - case <-done: - // One direction finished, wait for the other - select { - case <-ctx.Done(): - return - case <-done: - // Both directions finished - } - } - }() + go handleTunnelConnection(ctx, localConn, dialer) } }() @@ -1004,10 +949,64 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, close(stopChan) return listener.Close() } - return stop, nil } +// handleTunnelConnection serves a single accepted local connection. On the +// first dial failure that looks like a dead SSH session we call +// dialer.reconnect and retry once. After that, further failures are surfaced +// to the local client by closing localConn (which causes client-go on the +// other side to see EOF and retry through the freshly opened tunnel on the +// next request). 
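To make the `tunnelDialer` / `runTunnelLoop` split concrete, here is a minimal in-package sketch (a hypothetical test, not part of this diff) that drives `runTunnelLoop` against a local TCP echo listener with a no-op `reconnect`. The fixed local port, the echo server, and the assumption that such a test can live in the same `ssh` package are all assumptions of the sketch; the per-connection handler it exercises follows just below.

```go
package ssh

import (
	"bufio"
	"context"
	"io"
	"net"
	"testing"
)

// TestRunTunnelLoopEcho (illustrative sketch): the "remote" side is a local
// echo listener, dial simply connects to it, and reconnect is a no-op.
func TestRunTunnelLoopEcho(t *testing.T) {
	remote, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatal(err)
	}
	defer remote.Close()
	go func() {
		for {
			c, err := remote.Accept()
			if err != nil {
				return
			}
			go func(c net.Conn) { defer c.Close(); _, _ = io.Copy(c, c) }(c)
		}
	}()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const localPort = "18080" // assumption: free on the test host
	stop, err := runTunnelLoop(ctx, localPort, tunnelDialer{
		describe:  "echo test local:" + localPort,
		dial:      func() (net.Conn, error) { return net.Dial("tcp", remote.Addr().String()) },
		reconnect: func(context.Context) error { return nil },
	})
	if err != nil {
		t.Fatal(err)
	}
	defer func() { _ = stop() }()

	conn, err := net.Dial("tcp", "127.0.0.1:"+localPort)
	if err != nil {
		t.Fatal(err)
	}
	defer conn.Close()
	if _, err := conn.Write([]byte("ping\n")); err != nil {
		t.Fatal(err)
	}
	line, err := bufio.NewReader(conn).ReadString('\n')
	if err != nil || line != "ping\n" {
		t.Fatalf("echo through tunnel failed: got %q, err %v", line, err)
	}
}
```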
+func handleTunnelConnection(ctx context.Context, localConn net.Conn, dialer tunnelDialer) { + defer localConn.Close() + + remoteConn, err := dialer.dial() + if err != nil { + if !isConnectionError(err) { + // Non-connection errors (e.g. invalid address) won't be fixed by a + // reconnect — drop the local conn so the client sees the failure. + logger.Debug("SSH tunnel %s dial failed (non-retryable): %v", dialer.describe, err) + return + } + + logger.Warn("SSH tunnel %s dial failed (%v); attempting to reconnect SSH session", dialer.describe, err) + if rcErr := dialer.reconnect(ctx); rcErr != nil { + logger.Warn("SSH tunnel %s reconnect failed: %v", dialer.describe, rcErr) + return + } + logger.Info("SSH tunnel %s SSH session reconnected; retrying dial", dialer.describe) + + remoteConn, err = dialer.dial() + if err != nil { + logger.Warn("SSH tunnel %s dial still failing after reconnect: %v", dialer.describe, err) + return + } + } + defer remoteConn.Close() + + done := make(chan struct{}, 2) + go func() { + _, _ = copyWithContext(ctx, localConn, remoteConn) + done <- struct{}{} + }() + go func() { + _, _ = copyWithContext(ctx, remoteConn, localConn) + done <- struct{}{} + }() + + select { + case <-ctx.Done(): + return + case <-done: + select { + case <-ctx.Done(): + return + case <-done: + } + } +} + // Exec executes a command on the remote host with automatic retry and reconnection func (c *jumpHostClient) Exec(ctx context.Context, cmd string) (string, error) { var output string diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index b57f334..6d6d41f 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -149,6 +149,14 @@ func loadClusterConfigFromPath(configPath string) (*config.ClusterDefinition, er return nil, fmt.Errorf("failed to parse YAML config: %w", err) } + // Expand ${VAR} placeholders in modulePullOverride fields. CI uses this to + // pass a per-PR/MR image tag via a single env var (e.g. MODULE_IMAGE_TAG) + // without editing the YAML between runs. Missing envs fail fast here so we + // don't silently regress to "main" on accidentally unset variables. + if err := config.ExpandEnvInModulePullOverride(&clusterDef); err != nil { + return nil, fmt.Errorf("expand env in modulePullOverride: %w", err) + } + // Validate the configuration (using the same validation logic as internal/cluster) if len(clusterDef.Masters) == 0 { return nil, fmt.Errorf("at least one master node is required") @@ -194,6 +202,22 @@ func CreateTestCluster( ctx context.Context, yamlConfigFilename string, ) (*TestClusterResources, error) { + // Apply env-var defaults defensively so suites that don't call + // config.ValidateEnvironment() (e.g. csi-ceph e2e) still get sensible + // values for SSH_VM_USER / SSH_PRIVATE_KEY / SSH_PUBLIC_KEY / + // TEST_CLUSTER_NAMESPACE / YAML_CONFIG_FILENAME / TEST_CLUSTER_CLEANUP + // instead of empty strings that surface as obscure failures (e.g. + // user="" -> sshd "Invalid user", or "" filename -> directory read). + config.ApplyDefaults() + + // Belt-and-suspenders: function arg also has a documented default. Without + // this, an empty filename gets joined with the test-package directory and + // yields a path to the directory itself, failing later with a confusing + // "is a directory" read error. + if yamlConfigFilename == "" { + yamlConfigFilename = config.YAMLConfigFilenameDefaultValue + } + logger.Step(1, "Loading cluster configuration from %s", yamlConfigFilename) // Find the test package directory by walking the call stack. 
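As an illustration of the call shape this enables, a downstream suite can invoke `CreateTestCluster` directly without first calling `config.ValidateEnvironment`, relying on the defensively applied defaults and the empty-filename fallback. The package name and the overall timeout below are hypothetical; only the `CreateTestCluster` signature comes from this change.

```go
package csiceph_test // hypothetical downstream suite package

import (
	"context"
	"testing"
	"time"

	"github.com/deckhouse/storage-e2e/pkg/cluster"
)

func TestBootstrapWithDefaults(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour)
	defer cancel()

	// "" falls back to the documented YAML_CONFIG_FILENAME default, and
	// CreateTestCluster calls config.ApplyDefaults() itself, so optional
	// variables such as SSH_VM_USER get sensible values even though this
	// suite never calls config.ValidateEnvironment().
	resources, err := cluster.CreateTestCluster(ctx, "")
	if err != nil {
		t.Fatalf("cluster bootstrap failed: %v", err)
	}
	_ = resources // used by the rest of the suite
}
```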
diff --git a/pkg/cluster/vms.go b/pkg/cluster/vms.go index 61c4e3f..2507b87 100644 --- a/pkg/cluster/vms.go +++ b/pkg/cluster/vms.go @@ -591,10 +591,39 @@ func getCVMINameFromImageURL(imageURL string) string { return name } +// cloudInitAptMirror configures cloud-init to use mirror.yandex.ru as the +// Ubuntu apt mirror for both the primary archive and security pools, and +// pins apt to IPv4. Default Ubuntu mirrors (archive.ubuntu.com / +// security.ubuntu.com) round-robin across many IPs and are partially +// unreachable from some Flant infra (e.g. some egress paths block all the +// IPv6 endpoints, and most IPv4 ones time out for archive.ubuntu.com), +// which makes Step 9 (Wait for Docker) and per-node package_update very flaky +// or outright stall. mirror.yandex.ru carries main/universe/multiverse/restricted +// for the same suites and is reachable in those environments. +// +// The leading newline keeps the indentation flush with the rest of the +// cloud-config when interpolated mid-document. +const cloudInitAptMirror = `apt: + primary: + - arches: [default] + uri: http://mirror.yandex.ru/ubuntu + security: + - arches: [default] + uri: http://mirror.yandex.ru/ubuntu +` + +// cloudInitForceIPv4 disables IPv6 for apt to avoid 30-second connection +// timeouts on every package fetch when the host lacks working IPv6 egress. +// Written via write_files so it is in effect before package_update runs. +const cloudInitForceIPv4Apt = ` - path: /etc/apt/apt.conf.d/99force-ipv4 + content: | + Acquire::ForceIPv4 "true"; +` + // generateCloudInitUserData generates cloud-init user data for VM provisioning (cluster nodes) func generateCloudInitUserData(hostname, sshPubKey string) string { return fmt.Sprintf(`#cloud-config -package_update: true +%spackage_update: true packages: - tmux - htop @@ -619,7 +648,7 @@ users: ssh_authorized_keys: - %s write_files: - - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf +%s - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf content: | # Разрешить TCP forwarding AllowTcpForwarding yes @@ -635,14 +664,14 @@ runcmd: - systemctl daemon-reload - systemctl enable --now qemu-guest-agent.service - echo 'source /root/.kubectl_aliases' >> /root/.bashrc -`, sshPubKey, hostname) +`, cloudInitAptMirror, sshPubKey, cloudInitForceIPv4Apt, hostname) } // generateSetupNodeCloudInit generates cloud-init user data for the setup/bootstrap node. // This includes Docker which is required for running the Deckhouse installer. func generateSetupNodeCloudInit(hostname, sshPubKey string) string { return fmt.Sprintf(`#cloud-config -package_update: true +%spackage_update: true packages: - tmux - htop @@ -664,7 +693,7 @@ users: ssh_authorized_keys: - %s write_files: - - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf +%s - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf content: | # Разрешить TCP forwarding AllowTcpForwarding yes @@ -675,7 +704,7 @@ runcmd: - systemctl daemon-reload - systemctl enable --now qemu-guest-agent.service - systemctl enable --now docker.service -`, sshPubKey, hostname) +`, cloudInitAptMirror, sshPubKey, cloudInitForceIPv4Apt, hostname) } // RemoveAllVMs forcefully stops and deletes virtual machines, virtual disks, and virtual images. 
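A small regression test along these lines would pin the apt-mirror and IPv4 interpolation down. This is a sketch only, assuming the generator stays unexported and the test lives in `package cluster`; the assertions mirror the constants and template added above.

```go
package cluster

import (
	"strings"
	"testing"
)

// TestCloudInitUsesYandexMirrorAndIPv4 (illustrative sketch) checks that the
// rendered user data carries the mirror override and the ForceIPv4 drop-in.
func TestCloudInitUsesYandexMirrorAndIPv4(t *testing.T) {
	data := generateCloudInitUserData("node-0", "ssh-ed25519 AAAATESTKEY e2e@example")

	for _, want := range []string{
		"http://mirror.yandex.ru/ubuntu",   // primary + security mirror
		"/etc/apt/apt.conf.d/99force-ipv4", // written via write_files
		`Acquire::ForceIPv4 "true";`,
		"package_update: true",
	} {
		if !strings.Contains(data, want) {
			t.Errorf("cloud-init user data is missing %q", want)
		}
	}
}
```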
diff --git a/pkg/kubernetes/cephblockpool.go b/pkg/kubernetes/cephblockpool.go new file mode 100644 index 0000000..8ad2dfc --- /dev/null +++ b/pkg/kubernetes/cephblockpool.go @@ -0,0 +1,225 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephBlockPoolGVR is the GroupVersionResource of Rook's CephBlockPool. +var CephBlockPoolGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephblockpools", +} + +// CephBlockPoolConfig describes a minimal replicated or erasure-coded Ceph +// RBD pool managed by Rook. Exactly one of ReplicaSize or ErasureCoded must +// be set; leaving both zero defaults to a single-replica pool suitable for +// single-node test clusters. +type CephBlockPoolConfig struct { + // Name of the CephBlockPool CR (also becomes the Ceph pool name). + Name string + + // Namespace the Rook operator watches (typically "d8-sds-elastic"). + Namespace string + + // FailureDomain is the CRUSH failure domain: "host" or "osd" (default: "host"). + FailureDomain string + + // --- Replicated pool knobs (used when ErasureCoded is nil) --- + + // ReplicaSize is the number of object copies. Default: 1. + ReplicaSize int + + // RequireSafeReplicaSize toggles Ceph's safeguard against single-replica + // pools. When nil, it is set to `false` for ReplicaSize==1 (unsafe single + // replica, accepted for e2e test clusters) and left unset otherwise. + RequireSafeReplicaSize *bool + + // --- Erasure-coded pool knobs --- + + // ErasureCoded, when non-nil, produces an EC pool instead of a replicated + // one. Its fields map to `spec.erasureCoded.{dataChunks,codingChunks}`. + ErasureCoded *CephBlockPoolErasureCoded +} + +// CephBlockPoolErasureCoded configures a Ceph erasure-coded RBD pool. +type CephBlockPoolErasureCoded struct { + DataChunks int + CodingChunks int +} + +// CreateCephBlockPool creates (or updates, if already present) a CephBlockPool +// in the given namespace from the provided configuration. It is idempotent and +// safe to call on every test run. 
+func CreateCephBlockPool(ctx context.Context, kubeconfig *rest.Config, cfg CephBlockPoolConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephBlockPool name is required") + } + if cfg.Namespace == "" { + return fmt.Errorf("CephBlockPool namespace is required") + } + if cfg.ErasureCoded == nil && cfg.ReplicaSize <= 0 { + cfg.ReplicaSize = 1 + } + if cfg.FailureDomain == "" { + cfg.FailureDomain = "host" + } + + spec := map[string]interface{}{ + "failureDomain": cfg.FailureDomain, + } + + if cfg.ErasureCoded != nil { + if cfg.ErasureCoded.DataChunks <= 0 || cfg.ErasureCoded.CodingChunks <= 0 { + return fmt.Errorf("ErasureCoded pool requires positive dataChunks and codingChunks") + } + spec["erasureCoded"] = map[string]interface{}{ + "dataChunks": int64(cfg.ErasureCoded.DataChunks), + "codingChunks": int64(cfg.ErasureCoded.CodingChunks), + } + } else { + replicated := map[string]interface{}{ + "size": int64(cfg.ReplicaSize), + } + requireSafe := cfg.RequireSafeReplicaSize + if requireSafe == nil && cfg.ReplicaSize == 1 { + f := false + requireSafe = &f + } + if requireSafe != nil { + replicated["requireSafeReplicaSize"] = *requireSafe + } + spec["replicated"] = replicated + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephBlockPool", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephBlockPool %s/%s", cfg.Namespace, cfg.Name) + _, err = dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephBlockPool %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephBlockPool %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + if err := errIfTerminating(existing, "CephBlockPool", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// WaitForCephBlockPoolReady blocks until the CephBlockPool reports +// `status.phase == "Ready"`. Rook transitions the pool from Progressing to +// Ready once the Ceph OSDs have accepted the new pool and its CRUSH rule. +// +// Per-call deadlines and loud (WARN) logging on consecutive network failures +// are inherited from pollResourceUntilReady, so a dropped SSH tunnel surfaces +// in seconds instead of after the parent timeout. 
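For orientation, a hypothetical helper wiring the create and the readiness wait together might look like this. The pool name, package name, and timeout are illustrative; the readiness wait used here is the function defined just below.

```go
package example // hypothetical helper package

import (
	"context"
	"time"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// ensureTestPool creates a single-replica RBD pool (CreateCephBlockPool
// defaults ReplicaSize=0 to 1 with requireSafeReplicaSize=false) and waits
// until Rook reports it Ready.
func ensureTestPool(ctx context.Context, kubeconfig *rest.Config) error {
	cfg := kubernetes.CephBlockPoolConfig{
		Name:      "e2e-rbd-pool",   // hypothetical pool name
		Namespace: "d8-sds-elastic", // the namespace Rook watches
	}
	if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, cfg); err != nil {
		return err
	}
	return kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.Name, 5*time.Minute)
}
```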
+func WaitForCephBlockPoolReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + return pollResourceUntilReady( + ctx, kubeconfig, CephBlockPoolGVR, namespace, name, + timeout, PollTickInterval, "CephBlockPool", + func(obj *unstructured.Unstructured) (bool, string) { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + if phase == "Ready" { + return true, "phase=Ready" + } + logger.Debug("CephBlockPool %s/%s phase: %q, waiting...", obj.GetNamespace(), obj.GetName(), phase) + return false, "" + }, + ) +} + +// DeleteCephBlockPool deletes a CephBlockPool. Safe to call if the pool does +// not exist. NOTE: this is fire-and-forget — the API call returns as soon as +// the apiserver accepts the request, but Rook may still be running its +// finalizer (`cephblockpool.ceph.rook.io`) for a few minutes afterwards. If +// you want to be certain the CR is fully gone before continuing, follow up +// with WaitForCephBlockPoolGone. +func DeleteCephBlockPool(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephBlockPool %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephBlockPool %s/%s", namespace, name) + return nil +} + +// CephBlockPoolGoneTimeout is the default budget for WaitForCephBlockPoolGone. +// Rook removes the underlying RBD pool from Ceph before lifting the +// finalizer; with one OSD the pool delete normally completes in seconds but +// can take a few minutes if the cluster is unhealthy. +const CephBlockPoolGoneTimeout = 5 * time.Minute + +// WaitForCephBlockPoolGone polls until the CephBlockPool is fully GC'd by +// Kubernetes (GET returns NotFound). Use this after DeleteCephBlockPool to +// be sure the parent CephCluster won't be blocked by `ObjectHasDependents` +// when it gets deleted next. +func WaitForCephBlockPoolGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephBlockPoolGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephBlockPoolGVR, namespace, name, + timeout, PollTickInterval, "CephBlockPool", + ) +} diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go new file mode 100644 index 0000000..501d8d8 --- /dev/null +++ b/pkg/kubernetes/cephcluster.go @@ -0,0 +1,411 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephClusterGVR is the GroupVersionResource of Rook's CephCluster. +var CephClusterGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephclusters", +} + +// Defaults shared between CephClusterConfig and the testkit-level helper. +const ( + DefaultRookNamespace = "d8-sds-elastic" + DefaultCephClusterName = "ceph-cluster" + DefaultCephImage = "quay.io/ceph/ceph:v18.2.7" + DefaultDataDirHostPath = "/var/lib/rook" + DefaultOSDStorageClassSize = "10Gi" +) + +// CephClusterConfig describes a Rook-managed Ceph cluster suitable for e2e +// testing. It is intentionally narrower than Rook's native CephCluster CRD: +// knobs that don't matter for our scenarios are hidden behind hard-coded +// defaults (mirroring the values from the internal Flant wiki instruction +// on deploying sds-elastic + Rook + Ceph on LVM). +type CephClusterConfig struct { + // Name of the CephCluster (default: "ceph-cluster"). + Name string + + // Namespace where Rook watches (default: "d8-sds-elastic"). + Namespace string + + // CephImage is the Ceph container image tag. + // Default: "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // AllowUnsupportedCephVersion flips spec.cephVersion.allowUnsupported. + // Default: true (e2e clusters are allowed to run any version Ceph ships). + AllowUnsupportedCephVersion *bool + + // MonCount / MgrCount are the Rook mon/mgr replica counts. Defaults: + // 1 / 1, which is appropriate for single-node / tiny test clusters. + MonCount int + MgrCount int + + // AllowMultipleMonPerNode allows multiple mons on the same node + // (required for single-node clusters). Default: true. + AllowMultipleMonPerNode *bool + + // DataDirHostPath is where Rook persists mon/OSD data on each node. + // Default: "/var/lib/rook". + DataDirHostPath string + + // NetworkProvider selects the Rook networking mode. Supported values: + // "" — default CNI pod network (suitable for in-cluster e2e); + // "host" — host networking (matches the Flant wiki production layout). + NetworkProvider string + + // PublicNetworkCIDRs / ClusterNetworkCIDRs are the public/cluster CIDRs + // plumbed into `spec.network.addressRanges` when NetworkProvider is + // non-empty. They are ignored for the default (CNI) mode. + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // --- OSD backing --- + + // OSDStorageClass is the name of a k8s StorageClass able to hand out + // block-mode PVCs. Those PVCs are used by Rook's + // `storage.storageClassDeviceSets` to back OSDs. + OSDStorageClass string + + // OSDCount is the number of OSDs to provision (default: 1). + OSDCount int + + // OSDSize is the size of each OSD PVC (default: "10Gi"). + OSDSize string + + // OSDDeviceSetName is the `storageClassDeviceSets[].name` (default: + // "set1"). Changing it is useful mostly for debugging. 
+ OSDDeviceSetName string +} + +func (c *CephClusterConfig) applyDefaults() { + if c.Name == "" { + c.Name = DefaultCephClusterName + } + if c.Namespace == "" { + c.Namespace = DefaultRookNamespace + } + if c.CephImage == "" { + c.CephImage = DefaultCephImage + } + if c.AllowUnsupportedCephVersion == nil { + t := true + c.AllowUnsupportedCephVersion = &t + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.AllowMultipleMonPerNode == nil { + t := true + c.AllowMultipleMonPerNode = &t + } + if c.DataDirHostPath == "" { + c.DataDirHostPath = DefaultDataDirHostPath + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = DefaultOSDStorageClassSize + } + if c.OSDDeviceSetName == "" { + c.OSDDeviceSetName = "set1" + } +} + +// CreateCephCluster creates (or updates) a CephCluster in the given namespace. +// It is idempotent: if the resource already exists, its spec is overwritten +// with the freshly-rendered one so callers can tweak `CephClusterConfig` and +// re-apply without manual cleanup. +func CreateCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterConfig) error { + cfg.applyDefaults() + + if cfg.OSDStorageClass == "" { + return fmt.Errorf("CephCluster requires OSDStorageClass (backing StorageClass for OSD PVCs)") + } + + spec := buildCephClusterSpec(cfg) + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephCluster", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephCluster %s/%s (image=%s, mon=%d, mgr=%d, osd=%d x %s on SC %s)", + cfg.Namespace, cfg.Name, cfg.CephImage, cfg.MonCount, cfg.MgrCount, cfg.OSDCount, cfg.OSDSize, cfg.OSDStorageClass) + + _, err = dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephCluster %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephCluster %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + if err := errIfTerminating(existing, "CephCluster", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// buildCephClusterSpec renders the spec portion of a CephCluster object. 
The +// choice of fields follows the Flant internal wiki instruction for +// sds-elastic + Rook + Ceph, stripped down to the parts that matter in e2e: +// - mon/mgr counts come from the config (1/1 by default for single-node); +// - network.provider=host is opt-in via NetworkProvider; +// - OSDs are backed by one `storageClassDeviceSets[0]` entry that points +// to a user-supplied StorageClass capable of issuing block-mode PVCs. +func buildCephClusterSpec(cfg CephClusterConfig) map[string]interface{} { + spec := map[string]interface{}{ + "cephVersion": map[string]interface{}{ + "image": cfg.CephImage, + "allowUnsupported": *cfg.AllowUnsupportedCephVersion, + }, + "dataDirHostPath": cfg.DataDirHostPath, + "skipUpgradeChecks": false, + "continueUpgradeAfterChecksEvenIfNotHealthy": false, + "mon": map[string]interface{}{ + "count": int64(cfg.MonCount), + "allowMultiplePerNode": *cfg.AllowMultipleMonPerNode, + }, + "mgr": map[string]interface{}{ + "count": int64(cfg.MgrCount), + "allowMultiplePerNode": *cfg.AllowMultipleMonPerNode, + "modules": []interface{}{ + map[string]interface{}{ + "name": "pg_autoscaler", + "enabled": true, + }, + }, + }, + "dashboard": map[string]interface{}{ + "enabled": false, + "ssl": false, + }, + "crashCollector": map[string]interface{}{ + "disable": false, + }, + "logCollector": map[string]interface{}{ + "enabled": true, + "periodicity": "daily", + "maxLogSize": "100M", + }, + "priorityClassNames": map[string]interface{}{ + "mon": "system-node-critical", + "osd": "system-node-critical", + "mgr": "system-cluster-critical", + }, + "disruptionManagement": map[string]interface{}{ + "managePodBudgets": true, + "osdMaintenanceTimeout": int64(30), + "pgHealthCheckTimeout": int64(0), + }, + "storage": map[string]interface{}{ + "useAllNodes": true, + "useAllDevices": false, + "storageClassDeviceSets": []interface{}{ + map[string]interface{}{ + "name": cfg.OSDDeviceSetName, + "count": int64(cfg.OSDCount), + "portable": false, + "tuneDeviceClass": true, + "volumeClaimTemplates": []interface{}{ + map[string]interface{}{ + "metadata": map[string]interface{}{ + "name": "data", + }, + "spec": map[string]interface{}{ + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "storage": cfg.OSDSize, + }, + }, + "storageClassName": cfg.OSDStorageClass, + "volumeMode": "Block", + "accessModes": []interface{}{"ReadWriteOnce"}, + }, + }, + }, + }, + }, + }, + } + + if cfg.NetworkProvider != "" { + network := map[string]interface{}{ + "provider": cfg.NetworkProvider, + "connections": map[string]interface{}{ + "encryption": map[string]interface{}{"enabled": false}, + "compression": map[string]interface{}{"enabled": false}, + "requireMsgr2": false, + }, + } + + addrs := map[string]interface{}{} + if len(cfg.PublicNetworkCIDRs) > 0 { + addrs["public"] = toInterfaceSlice(cfg.PublicNetworkCIDRs) + } + if len(cfg.ClusterNetworkCIDRs) > 0 { + addrs["cluster"] = toInterfaceSlice(cfg.ClusterNetworkCIDRs) + } + if len(addrs) > 0 { + network["addressRanges"] = addrs + } + spec["network"] = network + } + + return spec +} + +// toInterfaceSlice converts a []string to a []interface{} so it can be +// embedded into an `unstructured.Unstructured`'s object tree. +func toInterfaceSlice(in []string) []interface{} { + out := make([]interface{}, len(in)) + for i, v := range in { + out[i] = v + } + return out +} + +// WaitForCephClusterReady blocks until the CephCluster status reports that +// Ceph is up and healthy. 
Rook exposes the cluster state through two status +// fields: +// - `status.state` — overall lifecycle phase ("Creating", "Created", +// "Updating", "Error"); +// - `status.ceph.health` — the Ceph health summary ("HEALTH_OK", +// "HEALTH_WARN", "HEALTH_ERR"). On a single-OSD test cluster Ceph often +// sits in HEALTH_WARN (PGs undersized, no replicas), which we still treat +// as "good enough" as long as `status.state == "Created"`. +// +// We return success once `state == "Created"`. HEALTH_ERR is reported in the +// log and does not short-circuit (Rook may recover). +// +// Network errors are logged loud (WARN) after a few consecutive failures so a +// dropped SSH tunnel surfaces in seconds instead of getting buried in Debug +// output. See pollResourceUntilReady for the per-call deadline rationale. +func WaitForCephClusterReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + return pollResourceUntilReady( + ctx, kubeconfig, CephClusterGVR, namespace, name, + timeout, 10*time.Second, "CephCluster", + func(obj *unstructured.Unstructured) (bool, string) { + state, _, _ := unstructured.NestedString(obj.Object, "status", "state") + health, _, _ := unstructured.NestedString(obj.Object, "status", "ceph", "health") + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + + if state == "Created" || phase == "Ready" { + return true, fmt.Sprintf("state=%s phase=%s ceph health: %s", state, phase, health) + } + logger.Debug("CephCluster %s/%s state=%q phase=%q health=%q", + obj.GetNamespace(), obj.GetName(), state, phase, health) + return false, "" + }, + ) +} + +// DeleteCephCluster removes a CephCluster. Tearing down the cluster this way +// is a *destructive* operation — Rook will leave OSD data on host disks under +// `dataDirHostPath` and operator-managed PVCs will not be garbage-collected +// automatically. The operation is still idempotent: a NotFound error is +// swallowed. +// +// NOTE: this is fire-and-forget. The apiserver returns success as soon as it +// records the delete intent; Rook then runs its `cephcluster.ceph.rook.io` +// finalizer for several minutes, removing pools, mon/mgr/osd pods, and so +// on. If any dependent CR (CephBlockPool, CephFilesystem, ...) is still +// alive, Rook records `DeletionIsBlocked / ObjectHasDependents` and the CR +// stays in `phase=Deleting` indefinitely. Always tear down dependents first +// (and call WaitForCephBlockPoolGone / WaitForCephFilesystemGone on them) +// before invoking DeleteCephCluster, then follow up with +// WaitForCephClusterGone. +func DeleteCephCluster(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephClusterGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephCluster %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephCluster %s/%s", namespace, name) + return nil +} + +// CephClusterGoneTimeout is the default budget for WaitForCephClusterGone. +// Rook needs to drain mon/mgr/osd pods, remove the CRUSH map, and unset +// finalizers — easily 5+ minutes on a single-OSD cluster, longer on +// degraded ones. 
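Putting the ordering rules from the comments above into one place, a hypothetical lifecycle helper could look like the sketch below. Names, timeouts, and the backing StorageClass are illustrative; passing 0 to the *Gone waiters selects the package-level default budgets defined alongside them.

```go
package example // hypothetical lifecycle helper

import (
	"context"
	"time"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

const (
	rookNS      = "d8-sds-elastic" // namespace the Rook operator watches
	clusterName = "ceph-cluster"
	poolName    = "e2e-rbd-pool" // hypothetical dependent CephBlockPool
)

// bringUpCeph creates the CephCluster (backed by block-mode PVCs from
// osdStorageClass) and blocks until Rook reports the cluster as Created.
func bringUpCeph(ctx context.Context, kubeconfig *rest.Config, osdStorageClass string) error {
	err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{
		Name:            clusterName,
		Namespace:       rookNS,
		OSDStorageClass: osdStorageClass, // required; everything else has defaults
	})
	if err != nil {
		return err
	}
	return kubernetes.WaitForCephClusterReady(ctx, kubeconfig, rookNS, clusterName, 30*time.Minute)
}

// tearDownCeph removes the dependent pool first and waits for it to be fully
// gone, so the CephCluster delete is not blocked by ObjectHasDependents.
func tearDownCeph(ctx context.Context, kubeconfig *rest.Config) error {
	if err := kubernetes.DeleteCephBlockPool(ctx, kubeconfig, rookNS, poolName); err != nil {
		return err
	}
	if err := kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, rookNS, poolName, 0); err != nil {
		return err
	}
	if err := kubernetes.DeleteCephCluster(ctx, kubeconfig, rookNS, clusterName); err != nil {
		return err
	}
	return kubernetes.WaitForCephClusterGone(ctx, kubeconfig, rookNS, clusterName, 0)
}
```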
+const CephClusterGoneTimeout = 10 * time.Minute + +// WaitForCephClusterGone polls until the CephCluster is fully GC'd by +// Kubernetes (GET returns NotFound). The poller logs the +// deletionTimestamp/finalizers progress periodically so a stuck finalizer +// (typical e2e failure: orphan dependent CR, broken Ceph health) is +// immediately visible in the test log instead of being hidden behind a +// silent timeout. +func WaitForCephClusterGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterGVR, namespace, name, + timeout, PollTickInterval, "CephCluster", + ) +} diff --git a/pkg/kubernetes/cephclusterconnection.go b/pkg/kubernetes/cephclusterconnection.go new file mode 100644 index 0000000..f8117db --- /dev/null +++ b/pkg/kubernetes/cephclusterconnection.go @@ -0,0 +1,313 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// GVRs of the csi-ceph cluster-scoped CRs. We use unstructured to avoid +// pulling github.com/deckhouse/csi-ceph/api into go.mod just for these +// tiny types. +var ( + CephClusterConnectionGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephclusterconnections", + } + CephClusterAuthenticationGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephclusterauthentications", + } +) + +// CephClusterAuthenticationConfig describes CephX credentials that csi-ceph +// reuses for every StorageClass that references the authentication. +type CephClusterAuthenticationConfig struct { + // Name of the CephClusterAuthentication CR. + Name string + // UserID is the Ceph user (typically "admin"). + UserID string + // UserKey is the CephX key of UserID. + UserKey string +} + +// CreateCephClusterAuthentication creates (or updates) a +// CephClusterAuthentication CR with the given CephX credentials. 
+func CreateCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterAuthenticationConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephClusterAuthentication name is required") + } + if cfg.UserID == "" { + return fmt.Errorf("CephClusterAuthentication UserID is required") + } + if cfg.UserKey == "" { + return fmt.Errorf("CephClusterAuthentication UserKey is required") + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephClusterAuthentication", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": map[string]interface{}{ + "userID": cfg.UserID, + "userKey": cfg.UserKey, + }, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephClusterAuthentication %s (userID=%s)", cfg.Name, cfg.UserID) + _, err = dynamicClient.Resource(CephClusterAuthenticationGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephClusterAuthentication %s: %w", cfg.Name, err) + } + + logger.Info("CephClusterAuthentication %s already exists, updating spec", cfg.Name) + existing, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephClusterAuthentication %s: %w", cfg.Name, err) + } + if err := errIfTerminating(existing, "CephClusterAuthentication", cfg.Name); err != nil { + return err + } + existing.Object["spec"] = obj.Object["spec"] + if _, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephClusterAuthentication %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephClusterAuthentication removes a CephClusterAuthentication. +// NotFound is treated as success. Pair with WaitForCephClusterAuthenticationGone +// when teardown order matters. +func DeleteCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephClusterAuthenticationGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephClusterAuthentication %s: %w", name, err) + } + logger.Info("Deleted CephClusterAuthentication %s", name) + return nil +} + +// CephClusterAuthenticationGoneTimeout is the default budget for +// WaitForCephClusterAuthenticationGone. The CR has no heavy finalizer. +const CephClusterAuthenticationGoneTimeout = 1 * time.Minute + +// WaitForCephClusterAuthenticationGone polls until the CephClusterAuthentication +// is fully GC'd by Kubernetes (GET returns NotFound). +func WaitForCephClusterAuthenticationGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterAuthenticationGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterAuthenticationGVR, "", name, + timeout, PollTickInterval, "CephClusterAuthentication", + ) +} + +// CephClusterConnectionConfig describes a csi-ceph CephClusterConnection CR. 
+// Its spec.clusterID (== Ceph fsid) is immutable once created. +type CephClusterConnectionConfig struct { + // Name of the CephClusterConnection CR. + Name string + // ClusterID is the Ceph fsid. Immutable after creation. + ClusterID string + // Monitors is the list of `ip:port` monitor endpoints. + Monitors []string + // UserID is the Ceph user (typically "admin"). + UserID string + // UserKey is the CephX key of UserID. + UserKey string +} + +// CreateCephClusterConnection creates (or updates) a CephClusterConnection CR. +// If the resource already exists we do *not* attempt to update spec.clusterID +// (which the CRD marks immutable) — only Monitors/UserID/UserKey are synced. +func CreateCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterConnectionConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephClusterConnection name is required") + } + if cfg.ClusterID == "" { + return fmt.Errorf("CephClusterConnection ClusterID (fsid) is required") + } + if len(cfg.Monitors) == 0 { + return fmt.Errorf("CephClusterConnection Monitors is required") + } + + monitors := make([]interface{}, len(cfg.Monitors)) + for i, m := range cfg.Monitors { + monitors[i] = m + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephClusterConnection", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": map[string]interface{}{ + "clusterID": cfg.ClusterID, + "monitors": monitors, + "userID": cfg.UserID, + "userKey": cfg.UserKey, + }, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephClusterConnection %s (clusterID=%s, mons=%d)", cfg.Name, cfg.ClusterID, len(cfg.Monitors)) + _, err = dynamicClient.Resource(CephClusterConnectionGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephClusterConnection %s: %w", cfg.Name, err) + } + + logger.Info("CephClusterConnection %s already exists, syncing monitors/userID/userKey", cfg.Name) + existing, err := dynamicClient.Resource(CephClusterConnectionGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephClusterConnection %s: %w", cfg.Name, err) + } + if err := errIfTerminating(existing, "CephClusterConnection", cfg.Name); err != nil { + return err + } + if err := unstructured.SetNestedSlice(existing.Object, monitors, "spec", "monitors"); err != nil { + return fmt.Errorf("set monitors: %w", err) + } + if err := unstructured.SetNestedField(existing.Object, cfg.UserID, "spec", "userID"); err != nil { + return fmt.Errorf("set userID: %w", err) + } + if err := unstructured.SetNestedField(existing.Object, cfg.UserKey, "spec", "userKey"); err != nil { + return fmt.Errorf("set userKey: %w", err) + } + if _, err := dynamicClient.Resource(CephClusterConnectionGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephClusterConnection %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephClusterConnection removes a CephClusterConnection. +// NotFound is treated as success. Pair with WaitForCephClusterConnectionGone +// when teardown order matters. 
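A hypothetical wiring sketch tying these CRs to the credentials Rook publishes might look like the following. The CR names and timeouts are illustrative; the credential discovery (`WaitForCephCredentials`) and the Created-phase wait are defined further down in this same change.

```go
package example // hypothetical wiring helper

import (
	"context"
	"time"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// wireCSICeph reads the fsid/monitors/admin key that Rook published and turns
// them into the two csi-ceph CRs a CephStorageClass will later reference.
func wireCSICeph(ctx context.Context, kubeconfig *rest.Config) error {
	creds, err := kubernetes.WaitForCephCredentials(ctx, kubeconfig, "d8-sds-elastic", 10*time.Minute)
	if err != nil {
		return err
	}

	if err := kubernetes.CreateCephClusterAuthentication(ctx, kubeconfig, kubernetes.CephClusterAuthenticationConfig{
		Name:    "e2e-ceph-auth", // hypothetical CR name
		UserID:  creds.AdminUser,
		UserKey: creds.AdminKey,
	}); err != nil {
		return err
	}

	if err := kubernetes.CreateCephClusterConnection(ctx, kubeconfig, kubernetes.CephClusterConnectionConfig{
		Name:      "e2e-ceph-conn", // hypothetical CR name
		ClusterID: creds.FSID,      // immutable after creation
		Monitors:  creds.Monitors,
		UserID:    creds.AdminUser,
		UserKey:   creds.AdminKey,
	}); err != nil {
		return err
	}

	// csi-ceph flips the connection to phase=Created once it has verified
	// the supplied fsid/monitors/credentials against the live Ceph cluster.
	return kubernetes.WaitForCephClusterConnectionCreated(ctx, kubeconfig, "e2e-ceph-conn", 5*time.Minute)
}
```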
+func DeleteCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephClusterConnectionGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephClusterConnection %s: %w", name, err) + } + logger.Info("Deleted CephClusterConnection %s", name) + return nil +} + +// CephClusterConnectionGoneTimeout is the default budget for +// WaitForCephClusterConnectionGone. The CR has no heavy finalizer. +const CephClusterConnectionGoneTimeout = 1 * time.Minute + +// WaitForCephClusterConnectionGone polls until the CephClusterConnection is +// fully GC'd by Kubernetes (GET returns NotFound). +func WaitForCephClusterConnectionGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterConnectionGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterConnectionGVR, "", name, + timeout, PollTickInterval, "CephClusterConnection", + ) +} + +// WaitForCephClusterConnectionCreated polls until the CephClusterConnection +// status reports phase=Created. csi-ceph's controller flips the status from +// Pending to Created once it has verified the supplied fsid / monitors / +// CephX credentials against the real Ceph cluster. +func WaitForCephClusterConnectionCreated(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if name == "" { + return fmt.Errorf("name is required") + } + + logger.Debug("Waiting for CephClusterConnection %s phase=Created (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephClusterConnectionGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + reason, _, _ := unstructured.NestedString(obj.Object, "status", "reason") + if phase == "Created" { + logger.Success("CephClusterConnection %s is Created", name) + return nil + } + logger.Debug("CephClusterConnection %s phase=%q reason=%q", name, phase, reason) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephClusterConnection %s: %v", name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephClusterConnection %s: %w", name, ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/pkg/kubernetes/cephcredentials.go b/pkg/kubernetes/cephcredentials.go new file mode 100644 index 0000000..11f68ec --- /dev/null +++ b/pkg/kubernetes/cephcredentials.go @@ -0,0 +1,183 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// Well-known Rook resources that hold Ceph connection data. +const ( + // RookMonSecretName is the Secret that the Rook operator populates with + // admin credentials and cluster fsid once the CephCluster is bootstrapped. + RookMonSecretName = "rook-ceph-mon" + + // RookMonEndpointsConfigMapName is the ConfigMap the operator keeps in + // sync with the current set of Ceph monitors. + RookMonEndpointsConfigMapName = "rook-ceph-mon-endpoints" +) + +// CephCredentials holds the information a Ceph CSI client needs to connect +// to a cluster bootstrapped by Rook. +type CephCredentials struct { + // FSID is the Ceph cluster unique identifier. + FSID string + + // AdminUser is the Ceph user name (typically "admin"). + AdminUser string + + // AdminKey is the CephX key for AdminUser. + AdminKey string + + // Monitors is the list of monitor endpoints in "IP:PORT" form, sorted + // alphabetically to make the output stable across runs. + Monitors []string +} + +// WaitForCephCredentials blocks until all pieces of information required to +// connect to the Rook-managed Ceph cluster are populated: +// - Secret `rook-ceph-mon` exists and has `fsid`, `ceph-username`, `ceph-secret`. +// - ConfigMap `rook-ceph-mon-endpoints` exists and has at least one reachable monitor. +// +// The returned CephCredentials is suitable for wiring csi-ceph CRs +// (CephClusterConnection, CephClusterAuthentication). +func WaitForCephCredentials(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) (*CephCredentials, error) { + if namespace == "" { + return nil, fmt.Errorf("namespace is required") + } + + logger.Debug("Waiting for Ceph credentials in %s (timeout: %v)", namespace, timeout) + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, RookMonSecretName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + logger.Debug("Failed to get Secret %s/%s: %v", namespace, RookMonSecretName, err) + } + + cm, cmErr := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, RookMonEndpointsConfigMapName, metav1.GetOptions{}) + if cmErr != nil && !apierrors.IsNotFound(cmErr) { + logger.Debug("Failed to get ConfigMap %s/%s: %v", namespace, RookMonEndpointsConfigMapName, cmErr) + } + + if err == nil && cmErr == nil { + creds, extractErr := extractCephCredentials(secret.Data, cm.Data) + if extractErr == nil { + logger.Success("Ceph credentials ready in %s (fsid=%s, %d monitor(s))", namespace, creds.FSID, len(creds.Monitors)) + return creds, nil + } + logger.Debug("Rook credentials not complete yet: %v", extractErr) + } + + select { + case <-ctx.Done(): + return nil, fmt.Errorf("timeout waiting for Ceph credentials in %s: %w", namespace, ctx.Err()) + case <-ticker.C: + } + } +} + +// extractCephCredentials parses the Rook-managed Secret/ConfigMap payloads +// into a CephCredentials struct. 
It returns an error if any required field +// is missing so the caller can keep polling until the operator has populated +// everything. +func extractCephCredentials(secretData map[string][]byte, cmData map[string]string) (*CephCredentials, error) { + fsid := strings.TrimSpace(string(secretData["fsid"])) + if fsid == "" { + return nil, fmt.Errorf("Secret %s is missing `fsid`", RookMonSecretName) + } + + adminUser := strings.TrimSpace(string(secretData["ceph-username"])) + if adminUser == "" { + adminUser = "client.admin" + } + adminUser = strings.TrimPrefix(adminUser, "client.") + + adminKey := strings.TrimSpace(string(secretData["ceph-secret"])) + if adminKey == "" { + return nil, fmt.Errorf("Secret %s is missing `ceph-secret`", RookMonSecretName) + } + + raw, ok := cmData["data"] + if !ok { + return nil, fmt.Errorf("ConfigMap %s is missing `data`", RookMonEndpointsConfigMapName) + } + monitors, err := parseMonEndpoints(raw) + if err != nil { + return nil, err + } + if len(monitors) == 0 { + return nil, fmt.Errorf("ConfigMap %s has no populated monitor endpoints", RookMonEndpointsConfigMapName) + } + + return &CephCredentials{ + FSID: fsid, + AdminUser: adminUser, + AdminKey: adminKey, + Monitors: monitors, + }, nil +} + +// parseMonEndpoints parses the Rook-maintained monitor endpoints string. +// +// Rook stores the current mon list in the `data` key of the +// `rook-ceph-mon-endpoints` ConfigMap as a comma-separated list of +// `=:` pairs, for example: +// +// a=10.0.0.1:6789,b=10.0.0.2:6789,c=10.0.0.3:6789 +// +// This helper returns just the `:` portion of every entry, sorted +// alphabetically for stable output. +func parseMonEndpoints(raw string) ([]string, error) { + out := []string{} + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + // Strip the "=" prefix if present. + if idx := strings.Index(part, "="); idx >= 0 { + part = part[idx+1:] + } + if part == "" { + continue + } + out = append(out, part) + } + sort.Strings(out) + return out, nil +} diff --git a/pkg/kubernetes/cephfilesystem.go b/pkg/kubernetes/cephfilesystem.go new file mode 100644 index 0000000..91fab14 --- /dev/null +++ b/pkg/kubernetes/cephfilesystem.go @@ -0,0 +1,274 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephFilesystemGVR is the GroupVersionResource of Rook's CephFilesystem. +var CephFilesystemGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephfilesystems", +} + +// CephFilesystemConfig describes a minimal Rook CephFilesystem with one +// metadata pool and exactly one data pool. 
Defaults are tuned for tiny +// single-node test clusters and mirror CephBlockPoolConfig conventions. +type CephFilesystemConfig struct { + // Name of the CephFilesystem CR. + Name string + + // Namespace the Rook operator watches (typically "d8-sds-elastic"). + Namespace string + + // FailureDomain is the CRUSH failure domain: "host" or "osd" + // (default: "osd" when MetadataPoolReplicas == DataPoolReplicas == 1, + // "host" otherwise). + FailureDomain string + + // MetadataPoolReplicas is the metadata pool replication factor. Default: 1. + MetadataPoolReplicas int + + // DataPoolName is the (Rook-side) data pool name. The full Ceph pool + // name is "-" — see CephFSDataPoolFullName. + // Default: "data0". + DataPoolName string + + // DataPoolReplicas is the data pool replication factor. Default: 1. + DataPoolReplicas int + + // MetadataServerActiveCount is the number of active MDS daemons. + // Default: 1. + MetadataServerActiveCount int + + // RequireSafeReplicaSize toggles Ceph's safeguard against single-replica + // pools. When nil, it is set to false for replicas==1 (unsafe single + // replica, accepted for e2e test clusters) and left unset otherwise. + RequireSafeReplicaSize *bool +} + +// CephFSDataPoolFullName returns the full Ceph pool name that ends up +// referenced from CephStorageClass.spec.cephFS.pool. Rook composes the +// per-filesystem pool name as "-". +func CephFSDataPoolFullName(fsName, dataPoolName string) string { + return fmt.Sprintf("%s-%s", fsName, dataPoolName) +} + +// CreateCephFilesystem creates (or updates, if already present) a +// CephFilesystem in the given namespace from the provided configuration. It +// is idempotent and safe to call on every test run. +func CreateCephFilesystem(ctx context.Context, kubeconfig *rest.Config, cfg CephFilesystemConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephFilesystem name is required") + } + if cfg.Namespace == "" { + return fmt.Errorf("CephFilesystem namespace is required") + } + if cfg.MetadataPoolReplicas <= 0 { + cfg.MetadataPoolReplicas = 1 + } + if cfg.DataPoolReplicas <= 0 { + cfg.DataPoolReplicas = 1 + } + if cfg.DataPoolName == "" { + cfg.DataPoolName = "data0" + } + if cfg.MetadataServerActiveCount <= 0 { + cfg.MetadataServerActiveCount = 1 + } + if cfg.FailureDomain == "" { + if cfg.MetadataPoolReplicas == 1 && cfg.DataPoolReplicas == 1 { + cfg.FailureDomain = "osd" + } else { + cfg.FailureDomain = "host" + } + } + + requireSafe := cfg.RequireSafeReplicaSize + if requireSafe == nil && (cfg.MetadataPoolReplicas == 1 || cfg.DataPoolReplicas == 1) { + f := false + requireSafe = &f + } + + metadataReplicated := map[string]interface{}{ + "size": int64(cfg.MetadataPoolReplicas), + } + dataReplicated := map[string]interface{}{ + "size": int64(cfg.DataPoolReplicas), + } + if requireSafe != nil { + metadataReplicated["requireSafeReplicaSize"] = *requireSafe + dataReplicated["requireSafeReplicaSize"] = *requireSafe + } + + spec := map[string]interface{}{ + "metadataPool": map[string]interface{}{ + "failureDomain": cfg.FailureDomain, + "replicated": metadataReplicated, + }, + "dataPools": []interface{}{ + map[string]interface{}{ + "name": cfg.DataPoolName, + "failureDomain": cfg.FailureDomain, + "replicated": dataReplicated, + }, + }, + "preserveFilesystemOnDelete": false, + "metadataServer": map[string]interface{}{ + "activeCount": int64(cfg.MetadataServerActiveCount), + "activeStandby": false, + }, + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": 
"ceph.rook.io/v1", + "kind": "CephFilesystem", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephFilesystem %s/%s", cfg.Namespace, cfg.Name) + _, err = dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephFilesystem %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephFilesystem %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + if err := errIfTerminating(existing, "CephFilesystem", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// WaitForCephFilesystemReady blocks until the CephFilesystem reports +// `status.phase == "Ready"`. As a fallback (some Rook revisions populate +// `status.conditions` first) the function also accepts a Ready=True +// condition. +// +// Per-call deadlines and loud (WARN) logging on consecutive network failures +// are inherited from pollResourceUntilReady. +func WaitForCephFilesystemReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + return pollResourceUntilReady( + ctx, kubeconfig, CephFilesystemGVR, namespace, name, + timeout, PollTickInterval, "CephFilesystem", + func(obj *unstructured.Unstructured) (bool, string) { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + if phase == "Ready" { + return true, "status.phase" + } + if cephFilesystemReadyByCondition(obj.Object) { + return true, "status.conditions[Ready]=True" + } + logger.Debug("CephFilesystem %s/%s phase: %q, waiting...", obj.GetNamespace(), obj.GetName(), phase) + return false, "" + }, + ) +} + +func cephFilesystemReadyByCondition(obj map[string]interface{}) bool { + conditions, found, err := unstructured.NestedSlice(obj, "status", "conditions") + if err != nil || !found { + return false + } + for _, raw := range conditions { + cond, ok := raw.(map[string]interface{}) + if !ok { + continue + } + ctype, _, _ := unstructured.NestedString(cond, "type") + cstatus, _, _ := unstructured.NestedString(cond, "status") + if ctype == "Ready" && cstatus == "True" { + return true + } + } + return false +} + +// DeleteCephFilesystem deletes a CephFilesystem. Safe to call if the +// filesystem does not exist. NOTE: fire-and-forget — Rook's +// `cephfilesystem.ceph.rook.io` finalizer takes time to detach the MDS +// daemons and remove the metadata/data pools. Pair with +// WaitForCephFilesystemGone if you need to know the CR has actually been +// GC'd before doing something else (e.g. deleting the parent CephCluster). 
+func DeleteCephFilesystem(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephFilesystemGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephFilesystem %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephFilesystem %s/%s", namespace, name) + return nil +} + +// CephFilesystemGoneTimeout is the default budget for WaitForCephFilesystemGone. +// MDS shutdown + pool removal usually settles in 1-2 minutes; we allow more +// to absorb operator restarts and slow Ceph mons. +const CephFilesystemGoneTimeout = 5 * time.Minute + +// WaitForCephFilesystemGone polls until the CephFilesystem is fully GC'd by +// Kubernetes (GET returns NotFound). Use this after DeleteCephFilesystem to +// be sure the parent CephCluster's deletion won't be blocked by +// `ObjectHasDependents`. +func WaitForCephFilesystemGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephFilesystemGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephFilesystemGVR, namespace, name, + timeout, PollTickInterval, "CephFilesystem", + ) +} diff --git a/pkg/kubernetes/cephstorageclass.go b/pkg/kubernetes/cephstorageclass.go new file mode 100644 index 0000000..942dd49 --- /dev/null +++ b/pkg/kubernetes/cephstorageclass.go @@ -0,0 +1,252 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephStorageClassGVR points at csi-ceph's CephStorageClass CR (not to be +// confused with Rook's CephCluster / CephBlockPool). +var CephStorageClassGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephstorageclasses", +} + +// Supported CephStorageClass types, mirroring csi-ceph's CRD enum. +const ( + CephStorageClassTypeRBD = "RBD" + CephStorageClassTypeCephFS = "CephFS" +) + +// CephStorageClassConfig is an intentionally narrow shape tailored for the +// e2e scenarios we care about today — an RBD StorageClass backed by a single +// block pool. CephFS variant is supported but requires FSName+FSPool to be +// set by the caller. +type CephStorageClassConfig struct { + // Name of the CephStorageClass CR (becomes the k8s StorageClass name). + Name string + + // ClusterConnectionName points at a CephClusterConnection CR. 
+ ClusterConnectionName string + + // ClusterAuthenticationName points at a CephClusterAuthentication CR. + ClusterAuthenticationName string + + // ReclaimPolicy mirrors StorageClass.ReclaimPolicy ("Delete" / "Retain"). + // Default: "Delete". + ReclaimPolicy string + + // Type is "RBD" (default) or "CephFS". + Type string + + // --- RBD options (Type == "RBD") --- + + // RBDPool is the Ceph pool name (e.g. "ceph-rbd-r1"). + RBDPool string + + // RBDDefaultFSType picks the filesystem mkfs on volume attach. + // Default: "ext4". + RBDDefaultFSType string + + // --- CephFS options (Type == "CephFS") --- + CephFSName string // Name of the CephFilesystem. + CephFSPool string // Pool to use inside that filesystem. +} + +// CreateCephStorageClass creates (or updates) a CephStorageClass CR. On +// success the csi-ceph controller provisions a corresponding core +// storage.k8s.io/v1 StorageClass in the cluster. +func CreateCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephStorageClass name is required") + } + if cfg.ClusterConnectionName == "" { + return fmt.Errorf("CephStorageClass ClusterConnectionName is required") + } + if cfg.ClusterAuthenticationName == "" { + return fmt.Errorf("CephStorageClass ClusterAuthenticationName is required") + } + if cfg.Type == "" { + cfg.Type = CephStorageClassTypeRBD + } + if cfg.ReclaimPolicy == "" { + cfg.ReclaimPolicy = "Delete" + } + + spec := map[string]interface{}{ + "clusterConnectionName": cfg.ClusterConnectionName, + "clusterAuthenticationName": cfg.ClusterAuthenticationName, + "reclaimPolicy": cfg.ReclaimPolicy, + "type": cfg.Type, + } + + switch cfg.Type { + case CephStorageClassTypeRBD: + if cfg.RBDPool == "" { + return fmt.Errorf("CephStorageClass of type RBD requires RBDPool") + } + if cfg.RBDDefaultFSType == "" { + cfg.RBDDefaultFSType = "ext4" + } + spec["rbd"] = map[string]interface{}{ + "defaultFSType": cfg.RBDDefaultFSType, + "pool": cfg.RBDPool, + } + case CephStorageClassTypeCephFS: + if cfg.CephFSName == "" || cfg.CephFSPool == "" { + return fmt.Errorf("CephStorageClass of type CephFS requires CephFSName and CephFSPool") + } + spec["cephFS"] = map[string]interface{}{ + "fsName": cfg.CephFSName, + "pool": cfg.CephFSPool, + } + default: + return fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephStorageClass", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephStorageClass %s (type=%s, conn=%s, auth=%s)", + cfg.Name, cfg.Type, cfg.ClusterConnectionName, cfg.ClusterAuthenticationName) + _, err = dynamicClient.Resource(CephStorageClassGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephStorageClass %s: %w", cfg.Name, err) + } + + logger.Info("CephStorageClass %s already exists, updating spec", cfg.Name) + existing, err := dynamicClient.Resource(CephStorageClassGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephStorageClass %s: %w", cfg.Name, err) + } + if err := errIfTerminating(existing, "CephStorageClass", cfg.Name); err 
!= nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephStorageClassGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephStorageClass %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephStorageClass removes a CephStorageClass. NotFound is treated as +// success. The underlying k8s StorageClass is removed by the csi-ceph +// controller as a side effect. Use WaitForCephStorageClassGone to confirm +// the CR is fully GC'd. +func DeleteCephStorageClass(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephStorageClassGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephStorageClass %s: %w", name, err) + } + logger.Info("Deleted CephStorageClass %s", name) + return nil +} + +// CephStorageClassGoneTimeout is the default budget for +// WaitForCephStorageClassGone. CephStorageClass has no heavyweight finalizer +// (csi-ceph just deletes the backing k8s StorageClass), so this typically +// completes in seconds. +const CephStorageClassGoneTimeout = 1 * time.Minute + +// WaitForCephStorageClassGone polls until the CephStorageClass is fully GC'd +// by Kubernetes (GET returns NotFound). +func WaitForCephStorageClassGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephStorageClassGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephStorageClassGVR, "", name, + timeout, PollTickInterval, "CephStorageClass", + ) +} + +// WaitForCephStorageClassCreated polls until the CephStorageClass status +// reports phase=Created (the csi-ceph controller flips this once the backing +// k8s StorageClass has been provisioned). 
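+//
+// Create-then-wait sketch (names and timeout are illustrative):
+//
+//	err := CreateCephStorageClass(ctx, kubeconfig, CephStorageClassConfig{
+//		Name:                      "ceph-rbd-sc",
+//		ClusterConnectionName:     "e2e-ceph-conn",
+//		ClusterAuthenticationName: "e2e-ceph-auth",
+//		Type:                      CephStorageClassTypeRBD,
+//		RBDPool:                   "ceph-rbd-r1",
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	if err := WaitForCephStorageClassCreated(ctx, kubeconfig, "ceph-rbd-sc", 5*time.Minute); err != nil {
+//		return err
+//	}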
+func WaitForCephStorageClassCreated(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if name == "" { + return fmt.Errorf("name is required") + } + + logger.Debug("Waiting for CephStorageClass %s phase=Created (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(3 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephStorageClassGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + reason, _, _ := unstructured.NestedString(obj.Object, "status", "reason") + if phase == "Created" { + logger.Success("CephStorageClass %s is Created", name) + return nil + } + logger.Debug("CephStorageClass %s phase=%q reason=%q", name, phase, reason) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephStorageClass %s: %v", name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephStorageClass %s: %w", name, ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/pkg/kubernetes/modules.go b/pkg/kubernetes/modules.go index 3b4cedf..94490a7 100644 --- a/pkg/kubernetes/modules.go +++ b/pkg/kubernetes/modules.go @@ -252,9 +252,18 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC settings = moduleConfig.Settings } - // Retry logic for webhook connection errors and network timeouts - maxRetries := 10 + // Retry logic for webhook connection errors and network timeouts. + // On freshly-bootstrapped Deckhouse clusters the validating-webhook-handler + // pod (or the d8-system Service endpoint backing it) can be unready for + // several minutes while the control plane converges. Our previous cap of + // 10 retries with exponential backoff topped out at ~3.7 minutes total + // which was not enough for the SAN stand — we'd fail Step 18 with + // "connection refused" during the first ModuleConfig write. Bumping to 60 + // attempts with delays capped at 30s gives us up to ~30 minutes of + // soft-retries, which easily outlives any realistic webhook cold start. + maxRetries := 60 retryDelay := 2 * time.Second + const maxRetryDelay = 30 * time.Second var lastErr error for attempt := 0; attempt < maxRetries; attempt++ { @@ -282,8 +291,12 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC case <-ctx.Done(): return ctx.Err() case <-time.After(retryDelay): - // Exponential backoff + // Exponential backoff, capped so we don't sleep forever + // between retries on a slow-to-converge cluster. retryDelay = time.Duration(float64(retryDelay) * 1.5) + if retryDelay > maxRetryDelay { + retryDelay = maxRetryDelay + } continue } } @@ -307,8 +320,12 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC case <-ctx.Done(): return ctx.Err() case <-time.After(retryDelay): - // Exponential backoff + // Exponential backoff, capped (see create branch above + // for the rationale — same webhook cold-start). 
retryDelay = time.Duration(float64(retryDelay) * 1.5) + if retryDelay > maxRetryDelay { + retryDelay = maxRetryDelay + } continue } } diff --git a/pkg/kubernetes/pod_exec.go b/pkg/kubernetes/pod_exec.go new file mode 100644 index 0000000..92297a0 --- /dev/null +++ b/pkg/kubernetes/pod_exec.go @@ -0,0 +1,388 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" +) + +// DefaultDebugImage is the image ReadFileFromDistrolessPod injects as the +// short-lived ephemeral container. busybox ships cat, sleep and a +// minimal sh — exactly the toolset we need to read /proc/1/root/ +// in the target container's filesystem. Tests against an air-gapped +// registry can override this via ReadFileOptions.DebugImage. +const DefaultDebugImage = "busybox:1.36" + +// DefaultEphemeralStartupTimeout caps the wait for the injected +// ephemeral container to transition into Running. Image pull from a +// warm registry usually takes a couple of seconds; 60 s is a generous +// upper bound that still surfaces ImagePullBackOff/ErrImagePull early. +const DefaultEphemeralStartupTimeout = 60 * time.Second + +// DefaultDistrolessSessionTTL is the lifetime of the `sleep` process +// inside the injected ephemeral container when used as a long-lived +// reader session (OpenDistrolessReader / DistrolessReader.ReadFile). +// 30 minutes comfortably outlasts any single test cell while still +// guaranteeing eventual self-cleanup if the caller crashes. +const DefaultDistrolessSessionTTL = 30 * time.Minute + +// ephemeralPollInterval is how often we re-Get the pod when waiting for +// the ephemeral container to start. 500 ms is a deliberate compromise: +// fast enough that the typical 1-3 s pull is observed promptly, slow +// enough that we don't hammer the apiserver. +const ephemeralPollInterval = 500 * time.Millisecond + +// ReadFileOptions tunes ReadFileFromDistrolessPod and OpenDistrolessReader. +type ReadFileOptions struct { + // DebugImage overrides the ephemeral container image. Defaults to + // DefaultDebugImage. Use this on air-gapped clusters to point at an + // internal mirror. + DebugImage string + // StartupTimeout caps the wait for the ephemeral container to reach + // state.Running. Defaults to DefaultEphemeralStartupTimeout. + StartupTimeout time.Duration + // SessionTTL controls how long the injected ephemeral container's + // `sleep` process stays alive. Defaults to DefaultDistrolessSessionTTL. + // Used by OpenDistrolessReader; ReadFileFromDistrolessPod does not + // rely on this value (the entry's status flip after the cat exits + // has no effect on the pod). 
+ SessionTTL time.Duration +} + +// ExecInPod runs cmd inside container of pod namespace/pod via the +// apiserver's pods/exec subresource and returns stdout and stderr +// separately, plus any transport- or exec-level error. +// +// The container must ship every binary referenced by cmd; ExecInPod does +// NOT inject any helper. For distroless containers without cat / sh, +// see ReadFileFromDistrolessPod. +func ExecInPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, container string, + cmd []string, +) (stdout, stderr string, err error) { + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return "", "", fmt.Errorf("create clientset: %w", err) + } + + req := clientset.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(pod). + Namespace(namespace). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: container, + Command: cmd, + Stdout: true, + Stderr: true, + }, scheme.ParameterCodec) + + executor, err := remotecommand.NewSPDYExecutor(kubeconfig, "POST", req.URL()) + if err != nil { + return "", "", fmt.Errorf("create SPDY executor for %s/%s[%s]: %w", + namespace, pod, container, err) + } + + var stdoutBuf, stderrBuf bytes.Buffer + err = executor.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdout: &stdoutBuf, + Stderr: &stderrBuf, + }) + stdout = stdoutBuf.String() + stderr = stderrBuf.String() + if err != nil { + return stdout, stderr, fmt.Errorf("exec %v in %s/%s[%s]: %w (stderr=%q)", + cmd, namespace, pod, container, err, stderr) + } + return stdout, stderr, nil +} + +// ReadFileFromPod cat's `path` from inside `container` of pod +// `namespace/pod`. Equivalent to `kubectl exec -c container -- cat +// path`, with stderr surfaced as part of the error if non-empty. +// +// Requires the container image to ship cat. For distroless / scratch +// images, use ReadFileFromDistrolessPod. +func ReadFileFromPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, container, path string, +) (string, error) { + stdout, stderr, err := ExecInPod(ctx, kubeconfig, namespace, pod, container, []string{"cat", path}) + if err != nil { + return stdout, err + } + if stderr != "" { + return stdout, fmt.Errorf("cat %s in %s/%s[%s] reported stderr: %s", + path, namespace, pod, container, stderr) + } + return stdout, nil +} + +// ReadFileFromDistrolessPod reads `path` from inside `targetContainer` +// of pod `namespace/pod` even when targetContainer ships no shell, no +// cat and no tar — i.e. a distroless or scratch image like +// csi-controller. It does so by injecting a short-lived ephemeral +// container (TargetContainerName=targetContainer, which gives it a +// shared PID namespace with the target) and then catting +// /proc/1/root. /proc/1 is PID 1 inside the target container's +// PID namespace, and /proc//root is the well-known kernel-exposed +// view of that process's filesystem root. +// +// Why this does NOT restart the target pod or any of its containers: +// +// - Ephemeral containers are added through the dedicated +// /pods//ephemeralcontainers subresource (UpdateEphemeralContainers +// in client-go). The apiserver explicitly allows this mutation on a +// running pod; the ordinary pod PUT/PATCH path that would trigger +// re-creation is bypassed entirely. Without this dedicated path, +// adding a container to a live pod would be flat-out forbidden. +// - metadata.generation, spec.containers, the pod sandbox UID and the +// ReplicaSet/DaemonSet observation all stay intact. 
The kubelet +// simply launches the new container in the existing pod sandbox +// without disturbing existing containers. Workload-controller +// rollouts and pod-template `checksum/...` annotations are not +// affected, so e2e suites that subsequently assert on rollout +// state see a clean signal — the FS read does not contaminate it. +// - Ephemeral containers are forbidden from declaring ports, probes, +// lifecycle hooks or resources, which guarantees the inject is a +// cheap no-op for the pod's lifecycle. +// +// Caveat: ephemeral containers cannot be removed once added. The cat +// process exits with the container after `sleep`, but the entry remains +// in pod.spec.ephemeralContainers and +// pod.status.ephemeralContainerStatuses (state=Terminated). For +// long-running suites those entries simply pile up until the next pod +// recycle. Each invocation here generates a unique container name, so +// repeat calls against the same pod are safe. +// +// For polling loops or any scenario that reads the same pod multiple +// times, prefer OpenDistrolessReader: each ReadFileFromDistrolessPod +// call pays the full ephemeral-container cold-start cost (~10–20 s for +// kubelet to launch a new container in the existing pod sandbox), and +// that cost dominates the runtime of a Eventually-style poll. +func ReadFileFromDistrolessPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, targetContainer, path string, + opts ReadFileOptions, +) (string, error) { + r, err := OpenDistrolessReader(ctx, kubeconfig, namespace, pod, targetContainer, opts) + if err != nil { + return "", err + } + return r.ReadFile(ctx, path) +} + +// DistrolessReader is a long-lived ephemeral-container reader session +// against a single distroless pod. Open one with OpenDistrolessReader, +// then call ReadFile as many times as you need — each ReadFile is just +// an exec into the already-running ephemeral container (cheap), so a +// polling loop pays the ephemeral-container cold start ONCE instead of +// per-iteration. +// +// The session expires when the ephemeral container's `sleep` +// (opts.SessionTTL, default DefaultDistrolessSessionTTL) elapses; there +// is no Close — Kubernetes does not allow removing an ephemeral +// container — but the inert "Terminated" status entry has no effect on +// the pod. Callers that need fresh sessions across pod identities +// (e.g. after a workload rollout) should re-open against the new pod. +type DistrolessReader struct { + kubeconfig *rest.Config + namespace string + podName string + targetContainer string + ephemeralName string +} + +// PodName returns the name of the pod this reader is bound to. Useful +// for callers that need to detect rollouts (the pod name changes when +// the workload-controller recycles the pod) and re-open the session. +func (r *DistrolessReader) PodName() string { return r.podName } + +// EphemeralName returns the auto-generated name of the injected +// ephemeral container, mostly for logging. +func (r *DistrolessReader) EphemeralName() string { return r.ephemeralName } + +// ReadFile cat's `path` from inside the target container's filesystem +// (resolved through the ephemeral container's view of /proc/1/root). +// Cheap — just a pods/exec round-trip; no apiserver mutations. 
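+//
+// Open-once / read-many sketch (namespace, pod, container and path are
+// illustrative):
+//
+//	reader, err := OpenDistrolessReader(ctx, kubeconfig, "d8-csi-ceph", podName, "csi-controller", ReadFileOptions{})
+//	if err != nil {
+//		return err
+//	}
+//	// Each ReadFile below is a plain exec round-trip; no new ephemeral container.
+//	conf, err := reader.ReadFile(ctx, "/etc/ceph/ceph.conf")
+//	if err != nil {
+//		return err
+//	}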
+func (r *DistrolessReader) ReadFile(ctx context.Context, path string) (string, error) { + stdout, stderr, err := ExecInPod(ctx, r.kubeconfig, r.namespace, r.podName, r.ephemeralName, + []string{"cat", "/proc/1/root" + path}) + if err != nil { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: %w", + path, r.namespace, r.podName, r.targetContainer, r.ephemeralName, err) + } + if stderr != "" { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: stderr=%s", + path, r.namespace, r.podName, r.targetContainer, r.ephemeralName, stderr) + } + return stdout, nil +} + +// OpenDistrolessReader injects a long-lived ephemeral container into +// the target pod and waits for it to become Running. The returned +// DistrolessReader can then be used for arbitrarily many cheap +// ReadFile calls until opts.SessionTTL elapses (default 30 minutes). +// +// Failure modes (returned as errors): pod not found, ephemeral +// container terminates before Running, image pull failure, startup +// timeout. On any of these no usable reader is returned. +// +// See ReadFileFromDistrolessPod for the rationale on why this does +// not restart the target pod or any of its existing containers. +func OpenDistrolessReader( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, targetContainer string, + opts ReadFileOptions, +) (*DistrolessReader, error) { + if opts.DebugImage == "" { + opts.DebugImage = DefaultDebugImage + } + if opts.StartupTimeout <= 0 { + opts.StartupTimeout = DefaultEphemeralStartupTimeout + } + if opts.SessionTTL <= 0 { + opts.SessionTTL = DefaultDistrolessSessionTTL + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return nil, fmt.Errorf("create clientset: %w", err) + } + pods := clientset.CoreV1().Pods(namespace) + + ecName, err := randomEphemeralName("filereader-") + if err != nil { + return nil, fmt.Errorf("generate ephemeral container name: %w", err) + } + + livePod, err := pods.Get(ctx, pod, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("get pod %s/%s: %w", namespace, pod, err) + } + sleepSeconds := int64(opts.SessionTTL.Seconds()) + if sleepSeconds < 1 { + sleepSeconds = 1 + } + livePod.Spec.EphemeralContainers = append(livePod.Spec.EphemeralContainers, corev1.EphemeralContainer{ + EphemeralContainerCommon: corev1.EphemeralContainerCommon{ + Name: ecName, + Image: opts.DebugImage, + Command: []string{"sleep", fmt.Sprintf("%d", sleepSeconds)}, + ImagePullPolicy: corev1.PullIfNotPresent, + TerminationMessagePolicy: corev1.TerminationMessageReadFile, + }, + TargetContainerName: targetContainer, + }) + if _, err := pods.UpdateEphemeralContainers(ctx, pod, livePod, metav1.UpdateOptions{}); err != nil { + return nil, fmt.Errorf("inject ephemeral container %q into %s/%s: %w", + ecName, namespace, pod, err) + } + + if err := waitEphemeralContainerRunning(ctx, pods, pod, ecName, opts.StartupTimeout); err != nil { + return nil, err + } + + return &DistrolessReader{ + kubeconfig: kubeconfig, + namespace: namespace, + podName: pod, + targetContainer: targetContainer, + ephemeralName: ecName, + }, nil +} + +// waitEphemeralContainerRunning polls pod.status.ephemeralContainerStatuses +// until the container with name ecName reports state.Running != nil. +// Returns immediately on Terminated / hard pull failures so tests don't +// have to sit through the full timeout when the debug image is +// unreachable. 
+func waitEphemeralContainerRunning( + ctx context.Context, + pods typedcorev1.PodInterface, + podName, ecName string, + timeout time.Duration, +) error { + deadlineCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + ticker := time.NewTicker(ephemeralPollInterval) + defer ticker.Stop() + + for { + p, getErr := pods.Get(deadlineCtx, podName, metav1.GetOptions{}) + switch { + case apierrors.IsNotFound(getErr): + return fmt.Errorf("pod %s disappeared while waiting for ephemeral container %q", + podName, ecName) + case getErr == nil: + for _, st := range p.Status.EphemeralContainerStatuses { + if st.Name != ecName { + continue + } + if st.State.Running != nil { + return nil + } + if st.State.Terminated != nil { + return fmt.Errorf("ephemeral container %q in pod %s terminated before exec: reason=%s exitCode=%d", + ecName, podName, + st.State.Terminated.Reason, st.State.Terminated.ExitCode) + } + if w := st.State.Waiting; w != nil && (w.Reason == "ImagePullBackOff" || w.Reason == "ErrImagePull") { + return fmt.Errorf("ephemeral container %q in pod %s cannot start: %s: %s", + ecName, podName, w.Reason, w.Message) + } + } + } + + select { + case <-deadlineCtx.Done(): + return fmt.Errorf("timeout (%s) waiting for ephemeral container %q in pod %s to be Running", + timeout, ecName, podName) + case <-ticker.C: + } + } +} + +// randomEphemeralName returns prefix + 8 hex chars from crypto/rand. +// Sufficient entropy for uniqueness across a single test run; we don't +// need cryptographic strength but crypto/rand keeps us out of math/rand +// seeding pitfalls. +func randomEphemeralName(prefix string) (string, error) { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + return "", err + } + return prefix + hex.EncodeToString(b[:]), nil +} diff --git a/pkg/kubernetes/poll.go b/pkg/kubernetes/poll.go new file mode 100644 index 0000000..4fc833f --- /dev/null +++ b/pkg/kubernetes/poll.go @@ -0,0 +1,339 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// PollGetTimeout caps a single Get call inside readiness pollers. Without +// this cap a hung TCP connect (e.g. SSH tunnel that died after a Wi-Fi flap +// on the developer's laptop) eats the entire parent timeout silently — the +// poller appears to "hang" until the per-resource ReadyTimeout fires 15-20 +// minutes later. With a 30s cap each Get fails fast, so we surface the +// network problem early via the WARN log emitted by pollResourceUntilReady. +const PollGetTimeout = 30 * time.Second + +// PollTickInterval is the default tick interval between Get attempts when +// waiting for a Kubernetes resource to reach a ready state. 
+const PollTickInterval = 5 * time.Second + +// pollResourceUntilReady polls a single namespaced unstructured resource +// until isReady returns (true, "") or the parent timeout expires. +// +// It centralizes three behaviors that all of our Wait*Ready helpers want: +// - per-call deadline (PollGetTimeout) on every Get, so a dead network +// surfaces in seconds instead of after the readiness timeout; +// - WARN logs with a counter when consecutive network errors happen — silent +// pollers were the root cause of "test hangs forever after Wi-Fi flap"; +// - tolerance of NotFound (the resource may not have been seen by the +// watch cache yet) and of `isReady=false` (still progressing). +// +// Parameters: +// +// - kubeconfig: rest config used to construct the dynamic client. +// - gvr: GroupVersionResource of the resource being polled. +// - namespace, name: scope of the resource. Must both be non-empty. +// - readyTimeout: overall budget. Returns timeout error after this. +// - tickInterval: gap between Get attempts. Pass PollTickInterval if +// unsure; resources with slow reconcilers can use longer intervals. +// - resourceLabel: string used in log lines (e.g. "CephCluster"). Keep +// short — the namespace/name is appended for context. +// - isReady: decider over the unstructured object. Returns +// (ready, humanReason). If ready is true, pollResourceUntilReady +// prints a Success log including the reason and returns nil. +func pollResourceUntilReady( + ctx context.Context, + kubeconfig *rest.Config, + gvr schema.GroupVersionResource, + namespace, name string, + readyTimeout time.Duration, + tickInterval time.Duration, + resourceLabel string, + isReady func(obj *unstructured.Unstructured) (ready bool, reason string), +) error { + if name == "" { + return fmt.Errorf("name is required") + } + if isReady == nil { + return fmt.Errorf("isReady is required") + } + if tickInterval <= 0 { + tickInterval = PollTickInterval + } + + ref := formatRef(namespace, name) + logger.Debug("Waiting for %s %s to become Ready (timeout: %v)", resourceLabel, ref, readyTimeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadlineCtx, cancel := context.WithTimeout(ctx, readyTimeout) + defer cancel() + + ticker := time.NewTicker(tickInterval) + defer ticker.Stop() + + var consecutiveErrs int + for { + obj, err := getWithTimeout(deadlineCtx, dynamicClient, gvr, namespace, name, PollGetTimeout) + switch { + case err == nil: + consecutiveErrs = 0 + // Refuse to wait for Ready on a Terminating object. Without this + // short-circuit a stale `Deleting` CR (e.g. CephCluster left over + // by a previous run that didn't finish teardown) would keep us + // polling for the full readyTimeout: phase=Deleting never matches + // any "Ready" condition. Failing fast here gives the operator a + // chance to clean up (or strip finalizers) instead of hiding the + // real state of the cluster behind a 15-20 minute timeout. 
+ if dt := obj.GetDeletionTimestamp(); dt != nil { + return fmt.Errorf( + "%s %s is being deleted (deletionTimestamp=%s, finalizers=%v); "+ + "refusing to wait for Ready on a Terminating object", + resourceLabel, ref, + dt.Format(time.RFC3339), obj.GetFinalizers(), + ) + } + if ready, reason := isReady(obj); ready { + if reason != "" { + logger.Success("%s %s is Ready (%s)", resourceLabel, ref, reason) + } else { + logger.Success("%s %s is Ready", resourceLabel, ref) + } + return nil + } + case apierrors.IsNotFound(err): + // Resource hasn't propagated yet. Treat as "still progressing" + // without warning so we don't spam logs on healthy clusters that + // just haven't observed the create yet. + consecutiveErrs = 0 + logger.Debug("%s %s not found yet", resourceLabel, ref) + default: + consecutiveErrs++ + // Quiet the first two failures (spurious 5xx, leader re-election), + // loud after that. Loud == WARN at every iteration so the user + // can see the cluster connection is dying instead of waiting for + // the readyTimeout to fire. + if consecutiveErrs >= 3 { + logger.Warn( + "%s %s GET failed for %d consecutive iterations: %v", + resourceLabel, ref, consecutiveErrs, err, + ) + } else { + logger.Debug("Error getting %s %s: %v", resourceLabel, ref, err) + } + } + + select { + case <-deadlineCtx.Done(): + return fmt.Errorf("timeout waiting for %s %s: %w", resourceLabel, ref, deadlineCtx.Err()) + case <-ticker.C: + } + } +} + +// PollGoneProgressEvery controls how often pollResourceUntilGone emits a +// progress INFO line while the resource is still alive. We don't want a log +// per tick (chatty) but we also don't want long stretches of silence when a +// finalizer is stuck for minutes — every ~30s strikes a balance. +const PollGoneProgressEvery = 30 * time.Second + +// pollResourceUntilGone polls a single namespaced unstructured resource +// until a GET returns NotFound (i.e. the API server has GC'd the object) or +// the parent timeout expires. +// +// Mirrors pollResourceUntilReady but with inverted success criterion. Three +// behaviors worth calling out: +// - per-call deadline (PollGetTimeout) on every Get; +// - WARN logs after a few consecutive non-NotFound errors so a dropped +// SSH tunnel surfaces in seconds rather than at the timeout; +// - periodic INFO progress log including the object's deletionTimestamp +// and finalizers — that's exactly the diagnostic info you need to know +// why Rook hasn't finished tearing the resource down. We avoid logging +// this on every tick (chatty) and instead emit at most once per +// PollGoneProgressEvery. 
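+//
+// Every public Wait*Gone helper in this package is a thin wrapper over this
+// function; WaitForCephFilesystemGone, for example, reduces to:
+//
+//	return pollResourceUntilGone(
+//		ctx, kubeconfig, CephFilesystemGVR, namespace, name,
+//		timeout, PollTickInterval, "CephFilesystem",
+//	)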
+func pollResourceUntilGone( + ctx context.Context, + kubeconfig *rest.Config, + gvr schema.GroupVersionResource, + namespace, name string, + goneTimeout time.Duration, + tickInterval time.Duration, + resourceLabel string, +) error { + if name == "" { + return fmt.Errorf("name is required") + } + if tickInterval <= 0 { + tickInterval = PollTickInterval + } + + ref := formatRef(namespace, name) + logger.Debug("Waiting for %s %s to be gone (timeout: %v)", resourceLabel, ref, goneTimeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadlineCtx, cancel := context.WithTimeout(ctx, goneTimeout) + defer cancel() + + ticker := time.NewTicker(tickInterval) + defer ticker.Stop() + + var ( + consecutiveErrs int + lastProgress time.Time + lastFinalizers []string + lastDeletionTS string + ) + for { + obj, err := getWithTimeout(deadlineCtx, dynamicClient, gvr, namespace, name, PollGetTimeout) + switch { + case apierrors.IsNotFound(err): + logger.Success("%s %s is gone", resourceLabel, ref) + return nil + case err == nil: + consecutiveErrs = 0 + finalizers := obj.GetFinalizers() + deletionTS := "" + if dt := obj.GetDeletionTimestamp(); dt != nil { + deletionTS = dt.Format(time.RFC3339) + } + // Surface progress periodically OR whenever the visible state + // changes (finalizers list shrunk, deletionTimestamp finally + // appeared after a Delete request was missed, ...). + stateChanged := deletionTS != lastDeletionTS || !sameFinalizers(finalizers, lastFinalizers) + if stateChanged || time.Since(lastProgress) >= PollGoneProgressEvery { + if deletionTS == "" { + logger.Info("%s %s still alive (no deletionTimestamp yet, finalizers=%v)", + resourceLabel, ref, finalizers) + } else { + logger.Info("%s %s still terminating (deletionTimestamp=%s, finalizers=%v)", + resourceLabel, ref, deletionTS, finalizers) + } + lastProgress = time.Now() + lastFinalizers = append(lastFinalizers[:0], finalizers...) + lastDeletionTS = deletionTS + } + default: + consecutiveErrs++ + if consecutiveErrs >= 3 { + logger.Warn( + "%s %s GET failed for %d consecutive iterations: %v", + resourceLabel, ref, consecutiveErrs, err, + ) + } else { + logger.Debug("Error getting %s %s: %v", resourceLabel, ref, err) + } + } + + select { + case <-deadlineCtx.Done(): + // Surface the last observed state in the timeout error so the + // caller (and the dev reading the test log) can immediately tell + // whether they're stuck on a finalizer, on a missing + // deletionTimestamp, or on a network issue. + lastSeen := "no observation yet" + if lastDeletionTS != "" || len(lastFinalizers) > 0 { + lastSeen = fmt.Sprintf("deletionTimestamp=%q, finalizers=%v", lastDeletionTS, lastFinalizers) + } + return fmt.Errorf("timeout waiting for %s %s to be gone (%s): %w", + resourceLabel, ref, lastSeen, deadlineCtx.Err()) + case <-ticker.C: + } + } +} + +// formatRef renders a resource reference as either "name" (cluster-scoped) +// or "namespace/name" (namespaced) for log lines and error messages. +func formatRef(namespace, name string) string { + if namespace == "" { + return name + } + return namespace + "/" + name +} + +// errIfTerminating returns a descriptive error if obj has a non-nil +// metadata.deletionTimestamp. 
Used by Create* helpers to fail-fast in the +// IsAlreadyExists branch when an existing CR is in `Terminating` state — +// updating its spec would be a no-op (the controller is busy unwinding the +// finalizer), and a follow-up Wait*Ready would hang forever because phase +// transitions never reach a Ready state on a Terminating object. +// +// `kind` is the human-readable kind ("CephCluster") and `ref` is the +// formatted "[namespace/]name" identifier. +func errIfTerminating(obj *unstructured.Unstructured, kind, ref string) error { + dt := obj.GetDeletionTimestamp() + if dt == nil { + return nil + } + return fmt.Errorf( + "%s %s exists but is being deleted (deletionTimestamp=%s, finalizers=%v); "+ + "wait for it to disappear or remove finalizers manually before re-running", + kind, ref, dt.Format(time.RFC3339), obj.GetFinalizers(), + ) +} + +// sameFinalizers returns true when both slices contain the same strings in +// the same order. Used by pollResourceUntilGone to decide if the visible +// state has changed. +func sameFinalizers(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// getWithTimeout wraps dynamicClient.Get with a per-call deadline derived +// from the parent context. The wrapper avoids leaking goroutines blocked on +// a dead TCP connection. An empty namespace selects the cluster-scoped +// path (used by csi-ceph CRs like CephClusterConnection). +func getWithTimeout( + parent context.Context, + dynamicClient dynamic.Interface, + gvr schema.GroupVersionResource, + namespace, name string, + perCallTimeout time.Duration, +) (*unstructured.Unstructured, error) { + callCtx, cancel := context.WithTimeout(parent, perCallTimeout) + defer cancel() + if namespace == "" { + return dynamicClient.Resource(gvr).Get(callCtx, name, metav1.GetOptions{}) + } + return dynamicClient.Resource(gvr).Namespace(namespace).Get(callCtx, name, metav1.GetOptions{}) +} diff --git a/pkg/kubernetes/rookconfigoverride.go b/pkg/kubernetes/rookconfigoverride.go new file mode 100644 index 0000000..dab8aad --- /dev/null +++ b/pkg/kubernetes/rookconfigoverride.go @@ -0,0 +1,140 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "sort" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// RookConfigOverrideName is the well-known ConfigMap name Rook reads Ceph +// config overrides from (see Rook docs: "Advanced Configuration – Custom +// ceph.conf Settings"). Rook watches this ConfigMap in its operator namespace +// and injects the `config` key into `/etc/ceph/ceph.conf` of every Ceph daemon. 
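+//
+// Usage sketch via SetRookConfigOverride (namespace and the override
+// key/value are illustrative; any [global] ceph.conf setting works):
+//
+//	err := SetRookConfigOverride(ctx, kubeconfig, "d8-sds-elastic", map[string]string{
+//		"osd_pool_default_size": "1",
+//	})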
+const RookConfigOverrideName = "rook-config-override" + +// SetRookConfigOverride creates or updates the `rook-config-override` ConfigMap +// in the given Rook operator namespace so that Ceph daemons pick up the +// provided global settings. +// +// The ConfigMap format expected by Rook is: +// +// apiVersion: v1 +// kind: ConfigMap +// metadata: +// name: rook-config-override +// namespace: +// data: +// config: | +// [global] +// key1 = value1 +// key2 = value2 +// +// `globals` is rendered under `[global]`. Keys are sorted for a stable output. +// Passing an empty/nil `globals` map produces an empty `[global]` section, +// which effectively clears previously-set overrides. +func SetRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespace string, globals map[string]string) error { + if namespace == "" { + return fmt.Errorf("namespace is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + cfg := RenderCephGlobalConfig(globals) + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: RookConfigOverrideName, + Namespace: namespace, + }, + Data: map[string]string{ + "config": cfg, + }, + } + + existing, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, RookConfigOverrideName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + logger.Info("Creating ConfigMap %s/%s with Ceph global overrides (%d keys)", namespace, RookConfigOverrideName, len(globals)) + if _, err := clientset.CoreV1().ConfigMaps(namespace).Create(ctx, cm, metav1.CreateOptions{}); err != nil { + return fmt.Errorf("failed to create ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + return nil + } + return fmt.Errorf("failed to get ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + + logger.Info("Updating ConfigMap %s/%s with Ceph global overrides (%d keys)", namespace, RookConfigOverrideName, len(globals)) + existing.Data = cm.Data + if _, err := clientset.CoreV1().ConfigMaps(namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + return nil +} + +// DeleteRookConfigOverride removes the `rook-config-override` ConfigMap. It +// is safe to call when the ConfigMap does not exist. +func DeleteRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + if namespace == "" { + return fmt.Errorf("namespace is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + if err := clientset.CoreV1().ConfigMaps(namespace).Delete(ctx, RookConfigOverrideName, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + logger.Info("Deleted ConfigMap %s/%s", namespace, RookConfigOverrideName) + return nil +} + +// RenderCephGlobalConfig renders a `[global]` section for ceph.conf from the +// provided key/value pairs. Keys are sorted so the rendered output is stable +// across calls with logically-equivalent maps (avoids unnecessary CM updates). 
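+//
+// For example (illustrative), a call like
+//
+//	RenderCephGlobalConfig(map[string]string{"osd_pool_default_size": "1", "ms_crc_data": "false"})
+//
+// renders:
+//
+//	[global]
+//	ms_crc_data = false
+//	osd_pool_default_size = 1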
+func RenderCephGlobalConfig(globals map[string]string) string { + var b strings.Builder + b.WriteString("[global]\n") + + keys := make([]string, 0, len(globals)) + for k := range globals { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Fprintf(&b, "%s = %s\n", k, globals[k]) + } + return b.String() +} diff --git a/pkg/kubernetes/storageclass_manage.go b/pkg/kubernetes/storageclass_manage.go new file mode 100644 index 0000000..bb7fb94 --- /dev/null +++ b/pkg/kubernetes/storageclass_manage.go @@ -0,0 +1,100 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +type StorageClassCreateConfig struct { + Name string + Provisioner string + Parameters map[string]string + VolumeBindingMode storagev1.VolumeBindingMode + ReclaimPolicy corev1.PersistentVolumeReclaimPolicy + AllowExpansion bool + MakeDefault bool + AdditionalLabels map[string]string + AdditionalAnnot map[string]string +} + +func CreateStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg StorageClassCreateConfig) error { + if cfg.Name == "" { + return fmt.Errorf("storage class name is required") + } + if cfg.Provisioner == "" { + return fmt.Errorf("provisioner is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + annotations := map[string]string{} + for k, v := range cfg.AdditionalAnnot { + annotations[k] = v + } + if cfg.MakeDefault { + annotations["storageclass.kubernetes.io/is-default-class"] = "true" + annotations["storageclass.beta.kubernetes.io/is-default-class"] = "true" + } + + labels := map[string]string{} + for k, v := range cfg.AdditionalLabels { + labels[k] = v + } + + sc := &storagev1.StorageClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "StorageClass", + APIVersion: "storage.k8s.io/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: cfg.Name, + Labels: labels, + Annotations: annotations, + }, + Provisioner: cfg.Provisioner, + Parameters: cfg.Parameters, + ReclaimPolicy: &cfg.ReclaimPolicy, + AllowVolumeExpansion: &cfg.AllowExpansion, + VolumeBindingMode: &cfg.VolumeBindingMode, + } + + logger.Info("Creating StorageClass %s (provisioner=%s)", cfg.Name, cfg.Provisioner) + _, err = clientset.StorageV1().StorageClasses().Create(ctx, sc, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Info("StorageClass %s already exists, skipping create", cfg.Name) + return nil + } + return fmt.Errorf("failed to create StorageClass %s: %w", cfg.Name, err) + } + logger.Success("StorageClass %s created", cfg.Name) + return nil +} + diff --git a/pkg/kubernetes/volumesnapshotclass.go b/pkg/kubernetes/volumesnapshotclass.go new file mode 100644 
index 0000000..9307615 --- /dev/null +++ b/pkg/kubernetes/volumesnapshotclass.go @@ -0,0 +1,125 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +var VolumeSnapshotClassGVR = schema.GroupVersionResource{ + Group: "snapshot.storage.k8s.io", + Version: "v1", + Resource: "volumesnapshotclasses", +} + +type VolumeSnapshotClassConfig struct { + Name string + Driver string + DeletionPolicy string // "Delete" or "Retain" + Parameters map[string]string + MakeDefault bool +} + +func CreateVolumeSnapshotClass(ctx context.Context, kubeconfig *rest.Config, cfg VolumeSnapshotClassConfig) error { + if cfg.Name == "" { + return fmt.Errorf("volume snapshot class name is required") + } + if cfg.Driver == "" { + return fmt.Errorf("driver is required") + } + if cfg.DeletionPolicy == "" { + cfg.DeletionPolicy = "Delete" + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + annotations := map[string]interface{}{} + if cfg.MakeDefault { + annotations["snapshot.storage.kubernetes.io/is-default-class"] = "true" + } + + parameters := map[string]interface{}{} + for k, v := range cfg.Parameters { + parameters[k] = v + } + + vsc := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "snapshot.storage.k8s.io/v1", + "kind": "VolumeSnapshotClass", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "annotations": annotations, + }, + "driver": cfg.Driver, + "deletionPolicy": cfg.DeletionPolicy, + "parameters": parameters, + }, + } + + logger.Info("Creating VolumeSnapshotClass %s (driver=%s, deletionPolicy=%s)", cfg.Name, cfg.Driver, cfg.DeletionPolicy) + _, err = dynamicClient.Resource(VolumeSnapshotClassGVR).Create(ctx, vsc, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Info("VolumeSnapshotClass %s already exists, skipping create", cfg.Name) + return nil + } + return fmt.Errorf("failed to create VolumeSnapshotClass %s: %w", cfg.Name, err) + } + logger.Success("VolumeSnapshotClass %s created", cfg.Name) + return nil +} + +func WaitForVolumeSnapshotClass(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + logger.Debug("Waiting for VolumeSnapshotClass %s to become available (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadline := time.Now().Add(timeout) + for { + if ctx.Err() != nil { + return ctx.Err() + } + if time.Now().After(deadline) { + return fmt.Errorf("timeout waiting for VolumeSnapshotClass %s", 
name) + } + + _, err := dynamicClient.Resource(VolumeSnapshotClassGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + logger.Success("VolumeSnapshotClass %s is available", name) + return nil + } + + time.Sleep(5 * time.Second) + } +} diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go new file mode 100644 index 0000000..7427967 --- /dev/null +++ b/pkg/testkit/ceph.go @@ -0,0 +1,622 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "time" + + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// Re-exports of the supported CephStorageClass types so callers don't have +// to import the lower-level pkg/kubernetes package just to set cfg.Type. +const ( + CephStorageClassTypeRBD = kubernetes.CephStorageClassTypeRBD + CephStorageClassTypeCephFS = kubernetes.CephStorageClassTypeCephFS +) + +// CephStorageClassConfig controls the end-to-end provisioning of a +// Rook-managed Ceph cluster plus a csi-ceph-backed k8s StorageClass: +// +// 1. Enables Deckhouse modules required for the stack: +// sds-node-configurator, sds-elastic (Rook), csi-ceph. +// 2. (Optional) Falls back to EnsureDefaultStorageClass to produce a +// sds-local-volume StorageClass for backing OSD PVCs. +// 3. Seeds `rook-config-override` with per-test global Ceph settings +// (e.g. `ms_crc_data = false` for the PR #131 scenario). +// 4. Creates a CephCluster (Rook) and waits until it is Created. +// 5. Creates a CephBlockPool and waits until it is Ready. +// 6. Reads fsid / monitors / CephX admin key from Rook-managed secrets +// and wires them into CephClusterConnection + CephClusterAuthentication +// CRs so csi-ceph can talk to the cluster. +// 7. Creates a CephStorageClass CR and waits for the csi-ceph controller +// to materialize a core storage.k8s.io/v1 StorageClass. +// +// Only StorageClassName is strictly required; everything else has sensible +// defaults tuned for single-node / tiny test clusters. +type CephStorageClassConfig struct { + // --- Top-level identity --- + + // StorageClassName is the name of the CephStorageClass CR (and of the + // resulting k8s StorageClass). Required. + StorageClassName string + + // Namespace is the Rook / sds-elastic namespace. Default: "d8-sds-elastic". + Namespace string + + // --- sds-elastic / Rook CephCluster --- + + // CephClusterName is the Rook CephCluster name. Default: "ceph-cluster". + CephClusterName string + + // CephImage is the Ceph container image tag. Default: "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // MonCount / MgrCount are the Rook mon/mgr replica counts. + // Defaults: 1 / 1 (good for 1..3 node test clusters). + MonCount int + MgrCount int + + // NetworkProvider: "" for CNI (default), "host" for host networking. 
+ NetworkProvider string + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // GlobalCephConfigOverrides populates `rook-config-override` under + // `[global]`, e.g. {"ms_crc_data": "false"}. A nil / empty map still writes + // the ConfigMap, just with an empty `[global]` section (clearing any + // previously-set overrides). + GlobalCephConfigOverrides map[string]string + + // --- OSD backing --- + + // OSDStorageClass is a block-capable StorageClass used to back OSD PVCs. + // When empty, EnsureDefaultStorageClass is invoked with + // OSDBackingStorageClass* to provision a sds-local-volume SC. + OSDStorageClass string + + // OSDCount is the number of OSDs. Default: 1. + OSDCount int + + // OSDSize is the size of each OSD PVC. Default: "10Gi". + OSDSize string + + // --- Fallback SC provisioning via sds-local-volume (when OSDStorageClass is empty) --- + + // OSDBackingStorageClassName names the sds-local-volume SC that we + // auto-provision for OSDs. Default: "sds-local-volume-thick-ceph-osd". + OSDBackingStorageClassName string + + // OSDBackingLVMType is passed to EnsureDefaultStorageClass ("Thick"/"Thin"). + // Default: "Thick" (simpler for block-mode PVCs used as Ceph OSDs). + OSDBackingLVMType string + + // OSDBackingIncludeMasters exposes EnsureDefaultStorageClass.IncludeMasters. + OSDBackingIncludeMasters bool + + // OSDBackingBaseKubeconfig/VMNamespace/BaseStorageClassName are plumbed + // through to EnsureDefaultStorageClass to enable automatic VirtualDisk + // attachment on nested-VM clusters. + OSDBackingBaseKubeconfig *rest.Config + OSDBackingVMNamespace string + OSDBackingBaseStorageClassName string + + // MasterSSH is optional SSH access to the control plane. Not used by + // EnsureCephStorageClass in this revision; callers may set it for + // follow-up bootstrap or diagnostics hooks. + MasterSSH ssh.SSHClient + + // --- CephBlockPool --- + + // PoolName is the Rook CephBlockPool name (also becomes the Ceph pool + // name referenced by CephStorageClass.spec.rbd.pool). + // Default: "ceph-rbd-r<ReplicaSize>". + PoolName string + + // ReplicaSize is the CephBlockPool replication factor. Default: 1. + ReplicaSize int + + // FailureDomain is the CRUSH failure domain: "host" or "osd". + // Default: "osd" when ReplicaSize==1, "host" otherwise. + FailureDomain string + + // --- Pool kind --- + + // Type selects the backing Ceph primitive: "RBD" (default) provisions a + // CephBlockPool; "CephFS" provisions a CephFilesystem. The resulting + // csi-ceph CephStorageClass CR mirrors this choice via spec.type. + Type string + + // --- CephFilesystem (used only when Type == "CephFS") --- + + // CephFSName is the Rook CephFilesystem name. Default: "ceph-fs". + CephFSName string + + // CephFSDataPoolName is the per-filesystem data pool name (Rook-side, + // not the full Ceph pool name). Default: "data0". + CephFSDataPoolName string + + // CephFSMetadataReplicas is the metadata pool replication factor. + // Default: ReplicaSize. + CephFSMetadataReplicas int + + // CephFSDataReplicas is the data pool replication factor. + // Default: ReplicaSize. + CephFSDataReplicas int + + // CephFSActiveMDSCount is the number of active MDS daemons. Default: 1. + CephFSActiveMDSCount int + + // --- csi-ceph wiring --- + + // ClusterConnectionName and ClusterAuthenticationName point at the + // CephClusterConnection / CephClusterAuthentication CRs we create. + // Defaults: both "<StorageClassName>-conn". + ClusterConnectionName string + ClusterAuthenticationName string + + // RBDDefaultFSType picks the mkfs used on attach. Default: "ext4".
+ RBDDefaultFSType string + + // --- Modules --- + + // SkipModuleEnablement disables the module-enable step (useful when the + // caller has already configured ModuleConfig on the cluster). + SkipModuleEnablement bool + + // SkipClusterTeardown leaves the underlying Rook CephCluster and the + // rook-config-override ConfigMap in place during TeardownCephStorageClass. + // Use it when several StorageClasses share a single CephCluster — the + // "owning" call should leave the flag false and tear the cluster down + // last, while every other teardown sets it to true and only removes its + // SC-specific resources (CephStorageClass / connection / auth / pool / + // filesystem). + SkipClusterTeardown bool + + // SdsElasticSettings overrides `spec.settings` of the sds-elastic + // ModuleConfig. Defaults to the minimal set that makes sense on a + // single-node test cluster. + SdsElasticSettings map[string]interface{} + + // CsiCephSettings overrides `spec.settings` of the csi-ceph ModuleConfig. + CsiCephSettings map[string]interface{} + + // CsiCephModulePullOverride pins a specific csi-ceph image tag (dev + // registry only). Useful for testing PRs that haven't been released yet. + CsiCephModulePullOverride string + + // --- Timeouts --- + + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m + CephFilesystemReadyTimeout time.Duration // default 10m + CredentialsTimeout time.Duration // default 10m + CSICephPhaseTimeout time.Duration // default 5m + StorageClassWaitTimeout time.Duration // default 2m +} + +func (c *CephStorageClassConfig) applyDefaults() { + if c.Namespace == "" { + c.Namespace = kubernetes.DefaultRookNamespace + } + if c.CephClusterName == "" { + c.CephClusterName = kubernetes.DefaultCephClusterName + } + if c.CephImage == "" { + c.CephImage = kubernetes.DefaultCephImage + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = kubernetes.DefaultOSDStorageClassSize + } + if c.OSDBackingStorageClassName == "" { + c.OSDBackingStorageClassName = "sds-local-volume-thick-ceph-osd" + } + if c.OSDBackingLVMType == "" { + c.OSDBackingLVMType = "Thick" + } + if c.ReplicaSize <= 0 { + c.ReplicaSize = 1 + } + if c.PoolName == "" { + c.PoolName = fmt.Sprintf("ceph-rbd-r%d", c.ReplicaSize) + } + if c.FailureDomain == "" { + if c.ReplicaSize == 1 { + c.FailureDomain = "osd" + } else { + c.FailureDomain = "host" + } + } + if c.ClusterConnectionName == "" { + c.ClusterConnectionName = c.StorageClassName + "-conn" + } + if c.ClusterAuthenticationName == "" { + c.ClusterAuthenticationName = c.StorageClassName + "-conn" + } + if c.RBDDefaultFSType == "" { + c.RBDDefaultFSType = "ext4" + } + if c.Type == "" { + c.Type = kubernetes.CephStorageClassTypeRBD + } + if c.CephFSName == "" { + c.CephFSName = "ceph-fs" + } + if c.CephFSDataPoolName == "" { + c.CephFSDataPoolName = "data0" + } + if c.CephFSMetadataReplicas <= 0 { + c.CephFSMetadataReplicas = c.ReplicaSize + } + if c.CephFSDataReplicas <= 0 { + c.CephFSDataReplicas = c.ReplicaSize + } + if c.CephFSActiveMDSCount <= 0 { + c.CephFSActiveMDSCount = 1 + } + if c.ModulesReadyTimeout == 0 { + c.ModulesReadyTimeout = 15 * time.Minute + } + if c.CephClusterReadyTimeout == 0 { + c.CephClusterReadyTimeout = 20 * time.Minute + } + if c.CephPoolReadyTimeout == 0 { + c.CephPoolReadyTimeout = 10 * time.Minute + } + if 
c.CephFilesystemReadyTimeout == 0 { + c.CephFilesystemReadyTimeout = 10 * time.Minute + } + if c.CredentialsTimeout == 0 { + c.CredentialsTimeout = 10 * time.Minute + } + if c.CSICephPhaseTimeout == 0 { + c.CSICephPhaseTimeout = 5 * time.Minute + } + if c.StorageClassWaitTimeout == 0 { + c.StorageClassWaitTimeout = 2 * time.Minute + } +} + +// EnsureCephStorageClass is the high-level entry point that turns an empty +// cluster into one with a working csi-ceph StorageClass. See +// CephStorageClassConfig for the step-by-step flow. +// +// The function is idempotent: re-running it picks up the existing Rook +// CephCluster / pool / csi-ceph CRs and only fills in whatever is still +// missing. Returns the name of the resulting k8s StorageClass. +func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) (string, error) { + cfg.applyDefaults() + + if cfg.StorageClassName == "" { + return "", fmt.Errorf("StorageClassName is required") + } + + logger.Step(1, "Enabling Deckhouse modules for csi-ceph (sds-node-configurator, sds-elastic, csi-ceph)") + if !cfg.SkipModuleEnablement { + if err := ensureCephModules(ctx, kubeconfig, cfg); err != nil { + return "", fmt.Errorf("enable ceph modules: %w", err) + } + } + logger.StepComplete(1, "Modules enabled") + + logger.Step(2, "Resolving OSD backing StorageClass") + osdSC, err := ensureOSDBackingStorageClass(ctx, kubeconfig, &cfg) + if err != nil { + return "", fmt.Errorf("resolve OSD backing StorageClass: %w", err) + } + logger.StepComplete(2, "OSD backing StorageClass: %s", osdSC) + + logger.Step(3, "Seeding rook-config-override ConfigMap") + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, cfg.Namespace, cfg.GlobalCephConfigOverrides); err != nil { + return "", fmt.Errorf("set rook-config-override: %w", err) + } + logger.StepComplete(3, "rook-config-override ready (%d global key(s))", len(cfg.GlobalCephConfigOverrides)) + + logger.Step(4, "Creating Rook CephCluster %s/%s", cfg.Namespace, cfg.CephClusterName) + if err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{ + Name: cfg.CephClusterName, + Namespace: cfg.Namespace, + CephImage: cfg.CephImage, + MonCount: cfg.MonCount, + MgrCount: cfg.MgrCount, + NetworkProvider: cfg.NetworkProvider, + PublicNetworkCIDRs: cfg.PublicNetworkCIDRs, + ClusterNetworkCIDRs: cfg.ClusterNetworkCIDRs, + OSDStorageClass: osdSC, + OSDCount: cfg.OSDCount, + OSDSize: cfg.OSDSize, + }); err != nil { + return "", fmt.Errorf("create CephCluster: %w", err) + } + if err := kubernetes.WaitForCephClusterReady(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, cfg.CephClusterReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephCluster: %w", err) + } + logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) + + switch cfg.Type { + case kubernetes.CephStorageClassTypeRBD: + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + 
logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + case kubernetes.CephStorageClassTypeCephFS: + logger.Step(5, "Creating CephFilesystem %s/%s (metadata replica=%d, data pool %q replica=%d, failureDomain=%s, activeMDS=%d)", + cfg.Namespace, cfg.CephFSName, + cfg.CephFSMetadataReplicas, cfg.CephFSDataPoolName, cfg.CephFSDataReplicas, + cfg.FailureDomain, cfg.CephFSActiveMDSCount) + if err := kubernetes.CreateCephFilesystem(ctx, kubeconfig, kubernetes.CephFilesystemConfig{ + Name: cfg.CephFSName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + MetadataPoolReplicas: cfg.CephFSMetadataReplicas, + DataPoolName: cfg.CephFSDataPoolName, + DataPoolReplicas: cfg.CephFSDataReplicas, + MetadataServerActiveCount: cfg.CephFSActiveMDSCount, + }); err != nil { + return "", fmt.Errorf("create CephFilesystem: %w", err) + } + if err := kubernetes.WaitForCephFilesystemReady(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName, cfg.CephFilesystemReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephFilesystem: %w", err) + } + logger.StepComplete(5, "CephFilesystem %s/%s is Ready", cfg.Namespace, cfg.CephFSName) + default: + return "", fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + + logger.Step(6, "Extracting Rook-managed Ceph credentials (fsid, monitors, admin key)") + creds, err := kubernetes.WaitForCephCredentials(ctx, kubeconfig, cfg.Namespace, cfg.CredentialsTimeout) + if err != nil { + return "", fmt.Errorf("wait ceph credentials: %w", err) + } + logger.StepComplete(6, "Ceph credentials: fsid=%s, user=%s, %d monitor(s): %v", + creds.FSID, creds.AdminUser, len(creds.Monitors), creds.Monitors) + + logger.Step(7, "Wiring csi-ceph: CephClusterAuthentication %q + CephClusterConnection %q", + cfg.ClusterAuthenticationName, cfg.ClusterConnectionName) + if err := kubernetes.CreateCephClusterAuthentication(ctx, kubeconfig, kubernetes.CephClusterAuthenticationConfig{ + Name: cfg.ClusterAuthenticationName, + UserID: creds.AdminUser, + UserKey: creds.AdminKey, + }); err != nil { + return "", fmt.Errorf("create CephClusterAuthentication: %w", err) + } + if err := kubernetes.CreateCephClusterConnection(ctx, kubeconfig, kubernetes.CephClusterConnectionConfig{ + Name: cfg.ClusterConnectionName, + ClusterID: creds.FSID, + Monitors: creds.Monitors, + UserID: creds.AdminUser, + UserKey: creds.AdminKey, + }); err != nil { + return "", fmt.Errorf("create CephClusterConnection: %w", err) + } + if err := kubernetes.WaitForCephClusterConnectionCreated(ctx, kubeconfig, cfg.ClusterConnectionName, cfg.CSICephPhaseTimeout); err != nil { + return "", fmt.Errorf("wait CephClusterConnection: %w", err) + } + logger.StepComplete(7, "csi-ceph wired against Ceph cluster %s", creds.FSID) + + logger.Step(8, "Creating CephStorageClass %q (type=%s) → StorageClass", cfg.StorageClassName, cfg.Type) + cscCfg := kubernetes.CephStorageClassConfig{ + Name: cfg.StorageClassName, + ClusterConnectionName: cfg.ClusterConnectionName, + ClusterAuthenticationName: cfg.ClusterAuthenticationName, + Type: cfg.Type, + } + switch cfg.Type { + case kubernetes.CephStorageClassTypeRBD: + cscCfg.RBDPool = cfg.PoolName + cscCfg.RBDDefaultFSType = cfg.RBDDefaultFSType + case kubernetes.CephStorageClassTypeCephFS: + cscCfg.CephFSName = cfg.CephFSName + cscCfg.CephFSPool = kubernetes.CephFSDataPoolFullName(cfg.CephFSName, cfg.CephFSDataPoolName) + default: + return "", fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + if err := kubernetes.CreateCephStorageClass(ctx, 
kubeconfig, cscCfg); err != nil { + return "", fmt.Errorf("create CephStorageClass: %w", err) + } + if err := kubernetes.WaitForCephStorageClassCreated(ctx, kubeconfig, cfg.StorageClassName, cfg.CSICephPhaseTimeout); err != nil { + return "", fmt.Errorf("wait CephStorageClass: %w", err) + } + if err := kubernetes.WaitForStorageClass(ctx, kubeconfig, cfg.StorageClassName, cfg.StorageClassWaitTimeout); err != nil { + return "", fmt.Errorf("wait core StorageClass: %w", err) + } + logger.StepComplete(8, "StorageClass %s is available", cfg.StorageClassName) + + switch cfg.Type { + case kubernetes.CephStorageClassTypeCephFS: + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + filesystem %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.CephFSName, cfg.StorageClassName) + default: + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + pool %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName, cfg.StorageClassName) + } + return cfg.StorageClassName, nil +} + +// TeardownCephStorageClass removes the csi-ceph wiring + Rook CephCluster + +// pool + rook-config-override produced by EnsureCephStorageClass. Safe to +// call on partial state (missing resources are skipped — the first error is +// returned but subsequent deletions are still attempted). +// +// Each Delete is followed by a Wait*Gone that waits for the apiserver to +// actually GC the CR. Without this synchronization the next test run (in +// alwaysUseExisting mode, or a fresh bootstrap that re-creates the same +// namespace) would race against Rook's finalizer and either: +// - find the CR still in Terminating and try to update its spec (no-op +// while the controller unwinds the finalizer); +// - delete the parent CephCluster while a child CephBlockPool / +// CephFilesystem is still alive — Rook then sets `DeletionIsBlocked / +// ObjectHasDependents` and the CephCluster sticks in `phase=Deleting` +// forever. +// +// On a Wait*Gone timeout we DO NOT auto-strip finalizers: the failure is +// surfaced as an aggregated error so the operator can investigate the +// cluster (typical reasons: HEALTH_ERR Ceph, stuck OSD prepare, dead mgr). +// +// It deliberately does NOT disable the Deckhouse modules: they may be owned +// by the cluster admin, and re-bootstrapping is cheaper than a full +// module-disable → module-enable cycle. +func TeardownCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + cfg.applyDefaults() + + var firstErr error + note := func(err error, what string) { + if err == nil { + return + } + logger.Warn("teardown: %s: %v", what, err) + if firstErr == nil { + firstErr = fmt.Errorf("%s: %w", what, err) + } + } + + logger.Info("Tearing down csi-ceph StorageClass %q (type=%s)", cfg.StorageClassName, cfg.Type) + + // 1. CephStorageClass: leaf, no finalizer dependency on the rest. + note(kubernetes.DeleteCephStorageClass(ctx, kubeconfig, cfg.StorageClassName), "delete CephStorageClass") + note(kubernetes.WaitForCephStorageClassGone(ctx, kubeconfig, cfg.StorageClassName, 0), "wait CephStorageClass gone") + + // 2. CephClusterConnection / CephClusterAuthentication: csi-ceph CRs. + // Order between conn and auth doesn't matter — neither depends on the + // other. 
+ note(kubernetes.DeleteCephClusterConnection(ctx, kubeconfig, cfg.ClusterConnectionName), "delete CephClusterConnection") + note(kubernetes.WaitForCephClusterConnectionGone(ctx, kubeconfig, cfg.ClusterConnectionName, 0), "wait CephClusterConnection gone") + + note(kubernetes.DeleteCephClusterAuthentication(ctx, kubeconfig, cfg.ClusterAuthenticationName), "delete CephClusterAuthentication") + note(kubernetes.WaitForCephClusterAuthenticationGone(ctx, kubeconfig, cfg.ClusterAuthenticationName, 0), "wait CephClusterAuthentication gone") + + // 3. Pool / Filesystem: must be fully gone before deleting CephCluster, + // otherwise Rook records DeletionIsBlocked / ObjectHasDependents. + switch cfg.Type { + case kubernetes.CephStorageClassTypeCephFS: + note(kubernetes.DeleteCephFilesystem(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName), "delete CephFilesystem") + note(kubernetes.WaitForCephFilesystemGone(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName, 0), "wait CephFilesystem gone") + default: + note(kubernetes.DeleteCephBlockPool(ctx, kubeconfig, cfg.Namespace, cfg.PoolName), "delete CephBlockPool") + note(kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, 0), "wait CephBlockPool gone") + } + + // 4. CephCluster: only when this teardown call owns it (the other + // TeardownCephStorageClass call shares the same Rook cluster — see + // SkipClusterTeardown doc-comment). + if !cfg.SkipClusterTeardown { + note(kubernetes.DeleteCephCluster(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName), "delete CephCluster") + note(kubernetes.WaitForCephClusterGone(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, 0), "wait CephCluster gone") + note(kubernetes.DeleteRookConfigOverride(ctx, kubeconfig, cfg.Namespace), "delete rook-config-override") + } else { + logger.Info("Skipping CephCluster + rook-config-override teardown (SkipClusterTeardown=true)") + } + return firstErr +} + +// EnsureDefaultCephStorageClass is EnsureCephStorageClass + SetGlobalDefaultStorageClass. +// After this call new PVCs without an explicit storageClassName will use the +// freshly-provisioned Ceph RBD class. +func EnsureDefaultCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) (string, error) { + scName, err := EnsureCephStorageClass(ctx, kubeconfig, cfg) + if err != nil { + return "", err + } + if err := kubernetes.SetGlobalDefaultStorageClass(ctx, kubeconfig, scName); err != nil { + return "", fmt.Errorf("set %s as default in global ModuleConfig: %w", scName, err) + } + logger.Success("StorageClass %s set as cluster default", scName) + return scName, nil +} + +// ensureCephModules enables sds-node-configurator + sds-elastic + csi-ceph +// and waits for their Ready phase. 
+func ensureCephModules(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + sdsElasticSettings := cfg.SdsElasticSettings + if sdsElasticSettings == nil { + sdsElasticSettings = map[string]interface{}{} + } + + csiCephSettings := cfg.CsiCephSettings + if csiCephSettings == nil { + csiCephSettings = map[string]interface{}{} + } + + modules := []kubernetes.ModuleSpec{ + { + Name: "sds-node-configurator", + Version: 1, + Enabled: true, + }, + { + Name: "sds-elastic", + Version: 1, + Enabled: true, + Settings: sdsElasticSettings, + Dependencies: []string{"sds-node-configurator"}, + }, + { + Name: "csi-ceph", + Version: 1, + Enabled: true, + Settings: csiCephSettings, + Dependencies: []string{"sds-elastic"}, + ModulePullOverride: cfg.CsiCephModulePullOverride, + }, + } + return kubernetes.EnableModulesAndWait(ctx, kubeconfig, nil, nil, modules, cfg.ModulesReadyTimeout) +} + +// ensureOSDBackingStorageClass returns an already-existing SC name (if the +// caller supplied OSDStorageClass) or delegates to EnsureDefaultStorageClass +// to provision a sds-local-volume SC on the fly. +func ensureOSDBackingStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg *CephStorageClassConfig) (string, error) { + if cfg.OSDStorageClass != "" { + logger.Info("Using pre-existing OSD backing StorageClass %s", cfg.OSDStorageClass) + return cfg.OSDStorageClass, nil + } + + localCfg := DefaultStorageClassConfig{ + StorageClassName: cfg.OSDBackingStorageClassName, + LVMType: cfg.OSDBackingLVMType, + IncludeMasters: cfg.OSDBackingIncludeMasters, + BaseKubeconfig: cfg.OSDBackingBaseKubeconfig, + VMNamespace: cfg.OSDBackingVMNamespace, + BaseStorageClassName: cfg.OSDBackingBaseStorageClassName, + } + return EnsureDefaultStorageClass(ctx, kubeconfig, localCfg) +} diff --git a/pkg/testkit/ceph_cluster.go b/pkg/testkit/ceph_cluster.go new file mode 100644 index 0000000..cf683f2 --- /dev/null +++ b/pkg/testkit/ceph_cluster.go @@ -0,0 +1,295 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "time" + + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// RookCephClusterConfig configures EnsureCephCluster — the "just bring up +// a Rook-managed Ceph cluster + pool" variant of EnsureCephStorageClass. +// +// Unlike EnsureCephStorageClass, EnsureCephCluster does NOT: +// - enable the `csi-ceph` Deckhouse module; +// - create CephClusterConnection / CephClusterAuthentication CRs; +// - create a CephStorageClass CR / materialize a core StorageClass. +// +// It stops once the Rook CephCluster is Created and the CephBlockPool is +// Ready. Use this when the test suite needs a live Ceph backend to exercise +// (e.g. to run rbd / ceph CLI against it, or to hook some other client) but +// deliberately does NOT want csi-ceph in the picture. 
+type RookCephClusterConfig struct { + // --- Namespacing / naming --- + + // Namespace is the Rook / sds-elastic namespace. Default: "d8-sds-elastic". + Namespace string + + // CephClusterName is the Rook CephCluster name. Default: "ceph-cluster". + CephClusterName string + + // CephImage is the Ceph container image. Default: + // "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // MonCount / MgrCount are the Rook mon/mgr replica counts. + // Defaults: 1 / 1 (appropriate for 1..3-node test clusters). + MonCount int + MgrCount int + + // NetworkProvider: "" for CNI (default), "host" for host networking. + NetworkProvider string + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // GlobalCephConfigOverrides populates `rook-config-override` under + // `[global]`, e.g. {"ms_crc_data": "false"} for the csi-ceph + // msCrcData matrix. nil leaves the ConfigMap otherwise empty. + GlobalCephConfigOverrides map[string]string + + // --- OSD backing --- + + // OSDStorageClass is a block-capable StorageClass used to back OSD PVCs. + // When empty, EnsureDefaultStorageClass is invoked with + // OSDBacking* to provision a sds-local-volume SC on the fly. + OSDStorageClass string + + // OSDCount is the number of OSDs. Default: 1. + OSDCount int + + // OSDSize is the size of each OSD PVC. Default: kubernetes.DefaultOSDStorageClassSize. + OSDSize string + + // --- Fallback SC provisioning via sds-local-volume --- + + // OSDBackingStorageClassName names the sds-local-volume SC we auto- + // provision for OSDs. Default: "sds-local-volume-thick-ceph-osd". + OSDBackingStorageClassName string + + // OSDBackingLVMType ("Thick"/"Thin"). Default: "Thick". + OSDBackingLVMType string + + OSDBackingIncludeMasters bool + OSDBackingBaseKubeconfig *rest.Config + OSDBackingVMNamespace string + OSDBackingBaseStorageClassName string + + // --- CephBlockPool --- + + // PoolName is the Rook CephBlockPool name. Default: + // "ceph-rbd-r". + PoolName string + + // ReplicaSize is the CephBlockPool replication factor. Default: 1. + ReplicaSize int + + // FailureDomain: "host" or "osd". Default: "osd" when ReplicaSize==1, + // "host" otherwise. + FailureDomain string + + // --- Modules --- + + // SkipModuleEnablement disables the module-enable step (useful when + // the caller has already enabled sds-node-configurator + sds-elastic + // through other means). + SkipModuleEnablement bool + + // SdsElasticSettings overrides `spec.settings` of the sds-elastic + // ModuleConfig. Defaults to an empty map. 
+ SdsElasticSettings map[string]interface{} + + // --- Timeouts --- + + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m +} + +func (c *RookCephClusterConfig) applyDefaults() { + if c.Namespace == "" { + c.Namespace = kubernetes.DefaultRookNamespace + } + if c.CephClusterName == "" { + c.CephClusterName = kubernetes.DefaultCephClusterName + } + if c.CephImage == "" { + c.CephImage = kubernetes.DefaultCephImage + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = kubernetes.DefaultOSDStorageClassSize + } + if c.OSDBackingStorageClassName == "" { + c.OSDBackingStorageClassName = "sds-local-volume-thick-ceph-osd" + } + if c.OSDBackingLVMType == "" { + c.OSDBackingLVMType = "Thick" + } + if c.ReplicaSize <= 0 { + c.ReplicaSize = 1 + } + if c.PoolName == "" { + c.PoolName = fmt.Sprintf("ceph-rbd-r%d", c.ReplicaSize) + } + if c.FailureDomain == "" { + if c.ReplicaSize == 1 { + c.FailureDomain = "osd" + } else { + c.FailureDomain = "host" + } + } + if c.ModulesReadyTimeout == 0 { + c.ModulesReadyTimeout = 15 * time.Minute + } + if c.CephClusterReadyTimeout == 0 { + c.CephClusterReadyTimeout = 20 * time.Minute + } + if c.CephPoolReadyTimeout == 0 { + c.CephPoolReadyTimeout = 10 * time.Minute + } +} + +// EnsureCephCluster brings up (or reuses) a Rook-managed Ceph cluster plus +// a CephBlockPool via sds-elastic — without touching csi-ceph. +// +// Flow: +// 1. Enable Deckhouse modules: sds-node-configurator + sds-elastic. +// 2. Resolve an OSD backing StorageClass (re-using EnsureDefaultStorageClass +// when none is pre-provided). +// 3. Seed `rook-config-override` with per-test global Ceph settings. +// 4. Create the Rook CephCluster and wait until it is Created. +// 5. Create the CephBlockPool and wait until it is Ready. +// +// Idempotent: re-running picks up existing resources. Returns the pool +// name (same one callers would reference as Ceph pool, e.g. for a +// subsequent `rbd create`/`CephStorageClass.rbd.pool`). 
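+//
+// A minimal call (illustrative sketch; assumes kubeconfig is a working
+// *rest.Config for the target cluster):
+//
+//	poolName, err := EnsureCephCluster(ctx, kubeconfig, RookCephClusterConfig{
+//		GlobalCephConfigOverrides: map[string]string{"ms_crc_data": "false"},
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	// poolName is "ceph-rbd-r1" with the default ReplicaSize of 1.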
+func EnsureCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg RookCephClusterConfig) (string, error) { + cfg.applyDefaults() + + logger.Step(1, "Enabling Deckhouse modules for Rook (sds-node-configurator, sds-elastic)") + if !cfg.SkipModuleEnablement { + if err := ensureRookModules(ctx, kubeconfig, cfg.SdsElasticSettings, cfg.ModulesReadyTimeout); err != nil { + return "", fmt.Errorf("enable rook modules: %w", err) + } + } + logger.StepComplete(1, "Modules enabled") + + logger.Step(2, "Resolving OSD backing StorageClass") + osdSC := cfg.OSDStorageClass + if osdSC == "" { + local := DefaultStorageClassConfig{ + StorageClassName: cfg.OSDBackingStorageClassName, + LVMType: cfg.OSDBackingLVMType, + IncludeMasters: cfg.OSDBackingIncludeMasters, + BaseKubeconfig: cfg.OSDBackingBaseKubeconfig, + VMNamespace: cfg.OSDBackingVMNamespace, + BaseStorageClassName: cfg.OSDBackingBaseStorageClassName, + } + name, err := EnsureDefaultStorageClass(ctx, kubeconfig, local) + if err != nil { + return "", fmt.Errorf("resolve OSD backing StorageClass: %w", err) + } + osdSC = name + } else { + logger.Info("Using pre-existing OSD backing StorageClass %s", osdSC) + } + logger.StepComplete(2, "OSD backing StorageClass: %s", osdSC) + + logger.Step(3, "Seeding rook-config-override ConfigMap") + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, cfg.Namespace, cfg.GlobalCephConfigOverrides); err != nil { + return "", fmt.Errorf("set rook-config-override: %w", err) + } + logger.StepComplete(3, "rook-config-override ready (%d global key(s))", len(cfg.GlobalCephConfigOverrides)) + + logger.Step(4, "Creating Rook CephCluster %s/%s", cfg.Namespace, cfg.CephClusterName) + if err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{ + Name: cfg.CephClusterName, + Namespace: cfg.Namespace, + CephImage: cfg.CephImage, + MonCount: cfg.MonCount, + MgrCount: cfg.MgrCount, + NetworkProvider: cfg.NetworkProvider, + PublicNetworkCIDRs: cfg.PublicNetworkCIDRs, + ClusterNetworkCIDRs: cfg.ClusterNetworkCIDRs, + OSDStorageClass: osdSC, + OSDCount: cfg.OSDCount, + OSDSize: cfg.OSDSize, + }); err != nil { + return "", fmt.Errorf("create CephCluster: %w", err) + } + if err := kubernetes.WaitForCephClusterReady(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, cfg.CephClusterReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephCluster: %w", err) + } + logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) + + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + + logger.Success("Ceph cluster ready: CephCluster %s/%s + pool %s (no csi-ceph wiring)", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName) + return cfg.PoolName, nil +} + +// ensureRookModules enables sds-node-configurator + sds-elastic (and nothing +// else). 
Used by EnsureCephCluster and as the Rook-only step of +// EnsureCephStorageClass's module list. +func ensureRookModules(ctx context.Context, kubeconfig *rest.Config, sdsElasticSettings map[string]interface{}, readyTimeout time.Duration) error { + if sdsElasticSettings == nil { + sdsElasticSettings = map[string]interface{}{} + } + modules := []kubernetes.ModuleSpec{ + { + Name: "sds-node-configurator", + Version: 1, + Enabled: true, + }, + { + Name: "sds-elastic", + Version: 1, + Enabled: true, + Settings: sdsElasticSettings, + Dependencies: []string{"sds-node-configurator"}, + }, + } + return kubernetes.EnableModulesAndWait(ctx, kubeconfig, nil, nil, modules, readyTimeout) +} diff --git a/pkg/testkit/ceph_crc.go b/pkg/testkit/ceph_crc.go new file mode 100644 index 0000000..39fb9a7 --- /dev/null +++ b/pkg/testkit/ceph_crc.go @@ -0,0 +1,347 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// EnableServerCRC is the readable counterpart of +// `SetMsCrcDataOnServer(..., ptr.To(true))`. It writes +// `ms_crc_data = true` into rook-config-override and rolling-restarts +// mon/mgr/osd so the override is live on every daemon before returning. +// +// Useful for tests that want the Ceph cluster in an explicit CRC-on state +// (the default Ceph behaviour, but pinned in the ConfigMap so the test +// can assert on it). +func EnableServerCRC(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + enabled := true + return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, &enabled) +} + +// DisableServerCRC flips Ceph into the "CRC off" state: +// `ms_crc_data = false` in rook-config-override + rolling-restart of +// mon/mgr/osd. Paired with a csi-ceph client that still defaults to +// `msCrcData=true`, this reproduces the msCrcData matrix mismatch case. +func DisableServerCRC(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + enabled := false + return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, &enabled) +} + +// ResetServerCRCToDefault removes `ms_crc_data` from rook-config-override +// (rendered `[global]` section becomes empty). Ceph falls back to its +// compile-time default (ms_crc_data = true), matching a freshly-installed +// cluster. Convenient for AfterAll / AfterEach restoration. +func ResetServerCRCToDefault(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, nil) +} + +// SetMsCrcDataOnServer rewrites `rook-config-override` so that only +// `ms_crc_data = ` ends up under `[global]` (nil removes the key +// entirely, falling back to Ceph's compile-time default = true). 
+// +// After flipping the ConfigMap, it force-restarts mon/mgr/osd Deployments +// in the Rook namespace and waits for them to converge. Idempotent: when +// the ConfigMap already encodes the desired state, nothing is restarted. +// +// Prefer EnableServerCRC / DisableServerCRC / ResetServerCRCToDefault at +// call sites for readability; this lower-level primitive exists so a +// boolean test parameter (e.g. a CRC compatibility matrix) doesn't have to branch. +func SetMsCrcDataOnServer(ctx context.Context, kubeconfig *rest.Config, namespace string, enabled *bool) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + + overrides := renderMsCrcDataOverrides(enabled) + wantConfig := kubernetes.RenderCephGlobalConfig(overrides) + + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + existing, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, kubernetes.RookConfigOverrideName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("get %s/%s: %w", namespace, kubernetes.RookConfigOverrideName, err) + } + currentConfig := "" + if existing != nil { + currentConfig = existing.Data["config"] + } + + if currentConfig == wantConfig { + logger.Info("rook-config-override already has ms_crc_data=%s, skipping daemon restart", + msCrcDataString(enabled)) + return nil + } + + logger.Info("Setting server-side ms_crc_data=%s in rook-config-override", msCrcDataString(enabled)) + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, namespace, overrides); err != nil { + return fmt.Errorf("set rook-config-override: %w", err) + } + + // Rook operator notices CM changes on its next reconcile loop; force + // a rolling restart of the core Ceph daemons so the new + // `/etc/ceph/ceph.conf` takes effect right now. + if err := RestartCephDaemons(ctx, kubeconfig, namespace, 10*time.Minute); err != nil { + return fmt.Errorf("restart ceph daemons: %w", err) + } + + // The operator pod is itself a Ceph admin client: it talks to mons + // to update CephCluster.status, evaluate CephFilesystem health, + // etc. Its in-pod ceph.conf was rendered at startup, so until it + // restarts it keeps using the old `ms_crc_data` value and can't + // connect to the freshly-bounced mons. Symptom: cephcluster CR + // flips to phase=Ready/state=Error with `failed to get status. . + // timed out` until the next reconcile after operator pod recycle. + // Bounce it now so the operator's view of the cluster lines up + // with reality before we return. + if err := RestartRookOperator(ctx, kubeconfig, namespace, 5*time.Minute); err != nil { + return fmt.Errorf("restart rook-ceph-operator: %w", err) + } + + // Final sanity check: any CephFilesystem in the namespace must be + // Ready before we consider the flip "live". This is the gate that + // catches the MDS-stuck-on-old-CRC class of bug — if the MDS + // daemons we just bounced fail to rejoin the mons, the CR will + // linger in a non-Ready phase and we'd rather surface that here + // than have a downstream csi-cephfs PVC hang for minutes. 
+ if err := waitCephFilesystemsReady(ctx, kubeconfig, namespace, 5*time.Minute); err != nil { + return fmt.Errorf("wait CephFilesystem ready after CRC flip: %w", err) + } + + logger.Success("Server-side ms_crc_data=%s is now live on all Ceph daemons", msCrcDataString(enabled)) + return nil +} + +// RestartRookOperator rollout-restarts the rook-operator Deployment +// in the given namespace and waits for the new pod to become Ready. +// +// The operator runs as a Ceph admin client (uses the cluster admin +// keyring + a baked-in ceph.conf to query mon/osd state). When tests +// flip a global wire-protocol knob like `ms_crc_data` and bounce the +// daemons, the operator's existing connections become invalid — but +// without a pod restart it'll keep retrying with the stale ceph.conf +// and the cephcluster CR ends up reporting `HEALTH_ERR` / +// `state: Error` until the next operator reconcile cycle. +// +// Deckhouse packages the rook-operator binary inside a Deployment +// named after the Helm release, which conventionally equals the +// namespace minus the leading `d8-` prefix (`d8-sds-elastic` → +// `sds-elastic`, `d8-sds-replicated-volume` → `sds-replicated-volume`, +// etc.). storage-e2e targets that flavor exclusively — vanilla Rook +// (`rook-ceph-operator` Deployment in `rook-ceph` namespace) is not +// supported here. +func RestartRookOperator(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + operatorName, ok := strings.CutPrefix(namespace, "d8-") + if !ok || operatorName == "" { + return fmt.Errorf("namespace %q is not a deckhouse module namespace (expected d8- prefix); cannot derive rook-operator Deployment name", namespace) + } + if _, err := clientset.AppsV1().Deployments(namespace).Get(ctx, operatorName, metav1.GetOptions{}); err != nil { + return fmt.Errorf("get rook-operator Deployment %s/%s: %w", namespace, operatorName, err) + } + + logger.Info("Rolling-restarting %s/%s so its Ceph admin client picks up the new ceph.conf", namespace, operatorName) + stamp := time.Now().UTC().Format(time.RFC3339Nano) + patch := []byte(fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"storage-e2e/restarted-at":%q}}}}}`, stamp)) + if _, err := clientset.AppsV1().Deployments(namespace).Patch( + ctx, operatorName, types.StrategicMergePatchType, patch, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("annotate Deployment %s/%s for rollout: %w", namespace, operatorName, err) + } + + waitCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + d, err := clientset.AppsV1().Deployments(namespace).Get(waitCtx, operatorName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get Deployment %s/%s: %w", namespace, operatorName, err) + } + desired := int32(1) + if d.Spec.Replicas != nil { + desired = *d.Spec.Replicas + } + if d.Status.ObservedGeneration >= d.Generation && d.Status.UpdatedReplicas >= desired && d.Status.AvailableReplicas >= desired { + logger.Success("%s/%s is Ready after rollout", namespace, operatorName) + return nil + } + select { + case <-waitCtx.Done(): + return fmt.Errorf("timed out after %s waiting for Deployment %s/%s to become ready", timeout, namespace, operatorName) + case <-ticker.C: + } + } 
+} + +// waitCephFilesystemsReady lists every CephFilesystem CR in +// `namespace` and waits for each to reach `status.phase=Ready` (or a +// matching Ready condition). If the namespace has no CephFilesystem +// CRs (RBD-only cluster), the function is a no-op. +func waitCephFilesystemsReady(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + dynamicClient, err := kubernetes.NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + list, err := dynamicClient.Resource(kubernetes.CephFilesystemGVR).Namespace(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("list CephFilesystem in %s: %w", namespace, err) + } + if len(list.Items) == 0 { + return nil + } + + for i := range list.Items { + name := list.Items[i].GetName() + if err := kubernetes.WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout); err != nil { + return fmt.Errorf("CephFilesystem %s/%s did not become Ready after CRC flip: %w", namespace, name, err) + } + } + return nil +} + +// RestartCephDaemons rollout-restarts every Rook-managed Ceph daemon +// Deployment that consumes `/etc/ceph/ceph.conf` (mon, mgr, osd, mds, +// rgw) and waits for each to reach its desired Ready replica count. +// +// Why all five roles, not just mon/mgr/osd: a global ConfigMap knob +// like `ms_crc_data` lives in ceph.conf, which means every daemon +// needs to be restarted for it to take effect. If only mon/mgr/osd +// are bounced and an MDS keeps running with the old value, the +// resulting CRC mismatch silently severs the MDS↔mon messenger +// channel, CephFS goes degraded, and any csi-cephfs PVC hangs in +// Pending until somebody (often the human running the test) bounces +// MDS by hand. Including `rook-ceph-mds` here is what unblocks the +// CephFS half of the msCrcData matrix. +// +// The selector also covers `rook-ceph-rgw` for forward-compat with +// future S3 tests; if no rgw Deployments exist in the cluster, the +// match list is just smaller and the function continues. Operator +// restart is intentionally out of scope here — see RestartRookOperator. +func RestartCephDaemons(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + // Rook labels each Ceph daemon Deployment with `app=rook-ceph-`. 
+ labelSel := "app in (rook-ceph-mon,rook-ceph-mgr,rook-ceph-osd,rook-ceph-mds,rook-ceph-rgw)" + deployList, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSel}) + if err != nil { + return fmt.Errorf("list ceph daemon Deployments (%s): %w", labelSel, err) + } + if len(deployList.Items) == 0 { + return fmt.Errorf("no Ceph daemon Deployments matched %q in namespace %s — is Rook running?", labelSel, namespace) + } + + names := make([]string, 0, len(deployList.Items)) + for i := range deployList.Items { + names = append(names, deployList.Items[i].Name) + } + logger.Info("Rolling-restarting %d Ceph daemon Deployment(s): %v", len(names), names) + + stamp := time.Now().UTC().Format(time.RFC3339Nano) + patch := []byte(fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"storage-e2e/restarted-at":%q}}}}}`, stamp)) + + for _, name := range names { + if _, err := clientset.AppsV1().Deployments(namespace).Patch( + ctx, name, types.StrategicMergePatchType, patch, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("annotate Deployment %s/%s for rollout: %w", namespace, name, err) + } + } + + waitCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + ready := 0 + for _, name := range names { + d, err := clientset.AppsV1().Deployments(namespace).Get(waitCtx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get Deployment %s/%s: %w", namespace, name, err) + } + desired := int32(1) + if d.Spec.Replicas != nil { + desired = *d.Spec.Replicas + } + if d.Status.ObservedGeneration >= d.Generation && d.Status.UpdatedReplicas >= desired && d.Status.AvailableReplicas >= desired { + ready++ + } + } + if ready == len(names) { + logger.Success("All %d Ceph daemon Deployment(s) report Ready after rollout", len(names)) + return nil + } + select { + case <-waitCtx.Done(): + return fmt.Errorf("timed out after %s waiting for %d Ceph daemon Deployments to become ready (%d/%d)", + timeout, len(names), ready, len(names)) + case <-ticker.C: + } + } +} + +// renderMsCrcDataOverrides turns a *bool into the minimal rook-config-override +// key/value map used by the msCrcData test matrix. +func renderMsCrcDataOverrides(enabled *bool) map[string]string { + if enabled == nil { + return nil + } + return map[string]string{ + "ms_crc_data": strconv.FormatBool(*enabled), + } +} + +func msCrcDataString(enabled *bool) string { + if enabled == nil { + return "" + } + return strconv.FormatBool(*enabled) +}
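+
+// Typical msCrcData-matrix flow built from the helpers above (illustrative
+// sketch; assumes kubeconfig targets a cluster provisioned via
+// EnsureCephStorageClass):
+//
+//	// Flip the server side to CRC off; csi-ceph clients keep their default.
+//	if err := DisableServerCRC(ctx, kubeconfig, kubernetes.DefaultRookNamespace); err != nil {
+//		return err
+//	}
+//	// ... provision a PVC against the Ceph-backed StorageClass and run IO ...
+//
+//	// Restore the compile-time default so later specs start from a clean state.
+//	if err := ResetServerCRCToDefault(ctx, kubeconfig, kubernetes.DefaultRookNamespace); err != nil {
+//		return err
+//	}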