diff --git a/.github/workflows/e2e-ginkgo.yaml b/.github/workflows/e2e-ginkgo.yaml index a24d3a23ea..fb5616564f 100644 --- a/.github/workflows/e2e-ginkgo.yaml +++ b/.github/workflows/e2e-ginkgo.yaml @@ -257,6 +257,7 @@ jobs: ginkgo-label: "${{ needs.parse_label-filter.outputs.label-filter }} || pr" additional-args: "--vcluster-image=${{ env.REPOSITORY_NAME }}:${{ env.TAG_NAME }} --teardown=false" additional-ginkgo-flags: "-v --show-node-events" + timeout: 90m continue-on-error: true - name: Upload Ginkgo JSON report diff --git a/e2e-next/constants/timeouts.go b/e2e-next/constants/timeouts.go index 94ef383882..a05f840e7e 100644 --- a/e2e-next/constants/timeouts.go +++ b/e2e-next/constants/timeouts.go @@ -9,4 +9,5 @@ const ( PollingTimeout = time.Second * 60 PollingTimeoutLong = time.Second * 120 PollingTimeoutVeryLong = time.Second * 300 + PollingTimeoutExtraLong = time.Minute * 90 ) diff --git a/e2e-next/labels/labels.go b/e2e-next/labels/labels.go index 0c4419eb64..334f921b82 100644 --- a/e2e-next/labels/labels.go +++ b/e2e-next/labels/labels.go @@ -20,19 +20,20 @@ var ( Security = Label("security") // Resource-specific labels for targeted filtering inside sync tests. - PriorityClasses = Label("priorityclasses") - RuntimeClasses = Label("runtimeclasses") - StorageClasses = Label("storageclasses") - IngressClasses = Label("ingressclasses") - ConfigMaps = Label("configmaps") - Secrets = Label("secrets") - NetworkPolicies = Label("networkpolicies") - Pods = Label("pods") - PVCs = Label("pvcs") - Events = Label("events") - CoreDNS = Label("coredns") - Webhooks = Label("webhooks") - Snapshots = Label("snapshots") + PriorityClasses = Label("priorityclasses") + RuntimeClasses = Label("runtimeclasses") + StorageClasses = Label("storageclasses") + IngressClasses = Label("ingressclasses") + ConfigMaps = Label("configmaps") + Secrets = Label("secrets") + NetworkPolicies = Label("networkpolicies") + Pods = Label("pods") + PVCs = Label("pvcs") + Events = Label("events") + CoreDNS = Label("coredns") + Webhooks = Label("webhooks") + Snapshots = Label("snapshots") + SnapshotLargeRestore = Label("snapshot-large-restore") // Suite-primary labels (one per opt-in suite). Scheduler = Label("scheduler") diff --git a/e2e-next/suite_snapshot_large_restore_test.go b/e2e-next/suite_snapshot_large_restore_test.go new file mode 100644 index 0000000000..d72e208806 --- /dev/null +++ b/e2e-next/suite_snapshot_large_restore_test.go @@ -0,0 +1,35 @@ +package e2e_next + +import ( + "context" + _ "embed" + + "github.com/loft-sh/e2e-framework/pkg/setup/cluster" + "github.com/loft-sh/vcluster/e2e-next/clusters" + "github.com/loft-sh/vcluster/e2e-next/labels" + "github.com/loft-sh/vcluster/e2e-next/setup" + "github.com/loft-sh/vcluster/e2e-next/setup/lazyvcluster" + "github.com/loft-sh/vcluster/e2e-next/test_storage/snapshot" + . "github.com/onsi/ginkgo/v2" +) + +const snapshotLargeRestoreVClusterName = "large-restore-vcluster" + +func init() { suiteSnapshotLargeRestore() } + +func suiteSnapshotLargeRestore() { + Describe("large-restore-vcluster", labels.SnapshotLargeRestore, Ordered, + cluster.Use(clusters.HostCluster), + func() { + BeforeAll(func(ctx context.Context) context.Context { + return lazyvcluster.LazyVCluster(ctx, + snapshotLargeRestoreVClusterName, + snapshotVClusterYAML, + lazyvcluster.WithPreSetup(setup.SnapshotPreSetup(snapshotLargeRestoreVClusterName)), + ) + }) + + snapshot.SnapshotLargeRestoreSpec() + }, + ) +} diff --git a/e2e-next/test_storage/snapshot/helpers.go b/e2e-next/test_storage/snapshot/helpers.go index 30f252a07b..ed51cfdc80 100644 --- a/e2e-next/test_storage/snapshot/helpers.go +++ b/e2e-next/test_storage/snapshot/helpers.go @@ -169,3 +169,26 @@ func toJSON(obj any) string { b, _ := json.Marshal(obj) return string(b) } + +// countConfigMapsByLabel counts all ConfigMaps matching the given label selector in namespace, +// paginating through the full result set to handle large counts. +func countConfigMapsByLabel(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) (int, error) { + var count int + var continueToken string + for { + list, err := client.CoreV1().ConfigMaps(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + Limit: 1000, + Continue: continueToken, + }) + if err != nil { + return 0, err + } + count += len(list.Items) + if list.Continue == "" { + break + } + continueToken = list.Continue + } + return count, nil +} diff --git a/e2e-next/test_storage/snapshot/test_snapshot.go b/e2e-next/test_storage/snapshot/test_snapshot.go index b4de8f8cc6..7784d817f2 100644 --- a/e2e-next/test_storage/snapshot/test_snapshot.go +++ b/e2e-next/test_storage/snapshot/test_snapshot.go @@ -7,6 +7,8 @@ import ( "strings" "time" + "golang.org/x/sync/errgroup" + "github.com/ghodss/yaml" snapshotsv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/clientset/versioned" "github.com/loft-sh/e2e-framework/pkg/setup/cluster" @@ -800,6 +802,164 @@ func deletePVC(ctx context.Context, vClusterClient, _ kubernetes.Interface, _, _ }).WithPolling(constants.PollingInterval).WithTimeout(constants.PollingTimeout).Should(Succeed()) } +// SnapshotLargeRestoreSpec registers the large-object snapshot and restore test. +// Call this from a dedicated suite that provisions its own vCluster. +func SnapshotLargeRestoreSpec() { + var s snapshotCtx + BeforeAll(func(ctx context.Context) { + s = *newSnapshotCtx(ctx) + }) + describeSnapshotLargeObjectRestore(&s) +} + +func describeSnapshotLargeObjectRestore(s *snapshotCtx) { + const ( + objectCount = 100_000 + nsCount = 100 + objectsPerNS = objectCount / nsCount // 1_000 per namespace + deleteNSCount = 4 + createWorkers = 50 + labelKey = "large-restore" + ) + + // Ordered: write objects → take snapshot → delete objects → restore → verify. + Describe("snapshot and restore with 100,000 objects", Ordered, func() { + var ( + nsPrefix string + snapshotPath string + labelVal string + ) + + BeforeAll(func(ctx context.Context) { + nsPrefix = "large-restore-" + random.String(6) + snapshotPath = "container:///snapshot-data/" + nsPrefix + ".tar.gz" + labelVal = nsPrefix + cleanupAllSnapshotArtifacts(ctx, s.hostClient, s.vClusterNS) + + for i := range nsCount { + ns := fmt.Sprintf("%s-%d", nsPrefix, i) + _, err := s.vClusterClient.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: ns}, + }, metav1.CreateOptions{}) + Expect(err).To(Or(Succeed(), Satisfy(kerrors.IsAlreadyExists))) + } + }) + + It("Writes 100,000 configmaps into the tenant cluster", func(ctx context.Context) { + By("Creating 1,000 configmaps in each of 100 namespaces concurrently", func() { + // The default client-go rate limiter (5 QPS, burst 10) would make 100k creates + // take ~20,000 seconds. Use a high-QPS client so the goroutines actually run in + // parallel and we hit the server's throughput ceiling instead. + currentClusterName := cluster.CurrentClusterNameFrom(ctx) + bulkRestConfig := *cluster.From(ctx, currentClusterName).KubernetesRestConfig() + bulkRestConfig.QPS = 500 + bulkRestConfig.Burst = 1000 + bulkClient, err := kubernetes.NewForConfig(&bulkRestConfig) + Expect(err).To(Succeed()) + + eg, egCtx := errgroup.WithContext(ctx) + ch := make(chan int, objectCount) + for i := range objectCount { + ch <- i + } + close(ch) + + for range createWorkers { + eg.Go(func() error { + for i := range ch { + // To avoid creating a channel for each namespace, we calculate the namespace and object name from the index. + // eg: object 0 goes to ns-0, object 1 to ns-0, ..., object 999 to ns-0, object 1000 to ns-1, etc. + ns := fmt.Sprintf("%s-%d", nsPrefix, i/objectsPerNS) + name := fmt.Sprintf("obj-%d", i%objectsPerNS) + _, createErr := bulkClient.CoreV1().ConfigMaps(ns).Create(egCtx, &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + Labels: map[string]string{labelKey: labelVal}, + }, + }, metav1.CreateOptions{}) + if createErr != nil { + return fmt.Errorf("create configmap %s/%s: %w", ns, name, createErr) + } + } + return nil + }) + } + Expect(eg.Wait()).To(Succeed()) + }) + }) + + It("Takes a snapshot of the tenant cluster", func(ctx context.Context) { + createSnapshot(s.vClusterName, s.vClusterNS, true, snapshotPath, false) + waitForRequestToFinish(ctx, s.hostClient, s.vClusterNS, + pkgconstants.SnapshotRequestLabel, snapshot.UnmarshalSnapshotRequest, + constants.PollingTimeoutExtraLong) + }) + + It("Deletes the first 4 namespaces and waits for them to be removed", func(ctx context.Context) { + By("Deleting namespaces 0 through 3", func() { + for i := range deleteNSCount { + ns := fmt.Sprintf("%s-%d", nsPrefix, i) + err := s.vClusterClient.CoreV1().Namespaces().Delete(ctx, ns, metav1.DeleteOptions{}) + Expect(err).To(Or(Succeed(), Satisfy(kerrors.IsNotFound))) + } + }) + + By("Waiting for the 4 namespaces to be fully terminated", func() { + Eventually(func(g Gomega, ctx context.Context) { + for i := range deleteNSCount { + ns := fmt.Sprintf("%s-%d", nsPrefix, i) + _, err := s.vClusterClient.CoreV1().Namespaces().Get(ctx, ns, metav1.GetOptions{}) + g.Expect(kerrors.IsNotFound(err)).To(BeTrue(), + "namespace %s should be deleted but still exists", ns) + } + }).WithContext(ctx).WithPolling(constants.PollingInterval).WithTimeout(constants.PollingTimeoutExtraLong).Should(Succeed()) + }) + }) + + It("Restores snapshot and verifies the deleted namespaces are back", func(ctx context.Context) { + By("Restoring the tenant cluster from snapshot", func() { + restoreVCluster(ctx, s.hostClient, s.vClusterName, s.vClusterNS, snapshotPath, true, false) + s.refreshClient(ctx) + }) + + By("Verifying the deleted namespaces and their configmaps are restored", func() { + Eventually(func(g Gomega, ctx context.Context) { + for i := range deleteNSCount { + ns := fmt.Sprintf("%s-%d", nsPrefix, i) + _, err := s.vClusterClient.CoreV1().Namespaces().Get(ctx, ns, metav1.GetOptions{}) + g.Expect(err).To(Succeed(), "namespace %s should exist after restore", ns) + + count, err := countConfigMapsByLabel(ctx, s.vClusterClient, ns, labelKey+"="+labelVal) + g.Expect(err).To(Succeed()) + g.Expect(count).To(Equal(objectsPerNS), + "namespace %s should have %d configmaps after restore, got %d", ns, objectsPerNS, count) + } + }).WithContext(ctx).WithPolling(constants.PollingInterval).WithTimeout(constants.PollingTimeoutExtraLong).Should(Succeed()) + }) + + By("Spot-checking non-deleted namespaces survived restore", func() { + for _, i := range []int{nsCount / 4, nsCount / 2, nsCount - 1} { + ns := fmt.Sprintf("%s-%d", nsPrefix, i) + count, err := countConfigMapsByLabel(ctx, s.vClusterClient, ns, labelKey+"="+labelVal) + Expect(err).To(Succeed()) + Expect(count).To(Equal(objectsPerNS), + "non-deleted namespace %s should have %d configmaps after restore, got %d", ns, objectsPerNS, count) + } + }) + }) + + AfterAll(func(ctx context.Context) { + for i := range nsCount { + ns := fmt.Sprintf("%s-%d", nsPrefix, i) + err := s.vClusterClient.CoreV1().Namespaces().Delete(ctx, ns, metav1.DeleteOptions{}) + Expect(err).To(Or(Succeed(), Satisfy(kerrors.IsNotFound))) + } + deleteSnapshotRequestConfigMaps(ctx, s.hostClient, s.vClusterNS) + }) + }) +} + func getTwoSnapshotRequests(g Gomega, ctx context.Context, hostClient kubernetes.Interface, vClusterNamespace string) (*snapshot.Request, *snapshot.Request) { configMaps, err := hostClient.CoreV1().ConfigMaps(vClusterNamespace).List(ctx, metav1.ListOptions{ LabelSelector: pkgconstants.SnapshotRequestLabel, diff --git a/e2e-next/vcluster-snapshot.yaml b/e2e-next/vcluster-snapshot.yaml index 6c22ff92b5..f3ee963f2a 100644 --- a/e2e-next/vcluster-snapshot.yaml +++ b/e2e-next/vcluster-snapshot.yaml @@ -1,4 +1,9 @@ controlPlane: + distro: + k8s: + apiServer: + extraArgs: + - "--etcd-compaction-interval=1h" statefulSet: image: registry: ""