-
Notifications
You must be signed in to change notification settings - Fork 577
test: e2e test for large snapshot restore #3912
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -9,4 +9,5 @@ const ( | |||||
| PollingTimeout = time.Second * 60 | ||||||
| PollingTimeoutLong = time.Second * 120 | ||||||
| PollingTimeoutVeryLong = time.Second * 300 | ||||||
| PollingTimeoutExtraLong = time.Minute * 90 | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| ) | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| package e2e_next | ||
|
|
||
| import ( | ||
| "context" | ||
| _ "embed" | ||
|
|
||
| "github.com/loft-sh/e2e-framework/pkg/setup/cluster" | ||
| "github.com/loft-sh/vcluster/e2e-next/clusters" | ||
| "github.com/loft-sh/vcluster/e2e-next/labels" | ||
| "github.com/loft-sh/vcluster/e2e-next/setup" | ||
| "github.com/loft-sh/vcluster/e2e-next/setup/lazyvcluster" | ||
| "github.com/loft-sh/vcluster/e2e-next/test_storage/snapshot" | ||
| . "github.com/onsi/ginkgo/v2" | ||
| ) | ||
|
|
||
| const snapshotLargeRestoreVClusterName = "large-restore-vcluster" | ||
|
|
||
| func init() { suiteSnapshotLargeRestore() } | ||
|
|
||
| func suiteSnapshotLargeRestore() { | ||
| Describe("large-restore-vcluster", labels.SnapshotLargeRestore, Ordered, | ||
| cluster.Use(clusters.HostCluster), | ||
| func() { | ||
| BeforeAll(func(ctx context.Context) context.Context { | ||
| return lazyvcluster.LazyVCluster(ctx, | ||
| snapshotLargeRestoreVClusterName, | ||
| snapshotVClusterYAML, | ||
| lazyvcluster.WithPreSetup(setup.SnapshotPreSetup(snapshotLargeRestoreVClusterName)), | ||
| ) | ||
| }) | ||
|
|
||
| snapshot.SnapshotLargeRestoreSpec() | ||
| }, | ||
| ) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,8 @@ import ( | |
| "strings" | ||
| "time" | ||
|
|
||
| "golang.org/x/sync/errgroup" | ||
|
|
||
| "github.com/ghodss/yaml" | ||
| snapshotsv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/clientset/versioned" | ||
| "github.com/loft-sh/e2e-framework/pkg/setup/cluster" | ||
|
|
@@ -800,6 +802,164 @@ func deletePVC(ctx context.Context, vClusterClient, _ kubernetes.Interface, _, _ | |
| }).WithPolling(constants.PollingInterval).WithTimeout(constants.PollingTimeout).Should(Succeed()) | ||
| } | ||
|
|
||
| // SnapshotLargeRestoreSpec registers the large-object snapshot and restore test. | ||
| // Call this from a dedicated suite that provisions its own vCluster. | ||
| func SnapshotLargeRestoreSpec() { | ||
| var s snapshotCtx | ||
| BeforeAll(func(ctx context.Context) { | ||
| s = *newSnapshotCtx(ctx) | ||
| }) | ||
| describeSnapshotLargeObjectRestore(&s) | ||
| } | ||
|
|
||
| func describeSnapshotLargeObjectRestore(s *snapshotCtx) { | ||
|
jjaferson marked this conversation as resolved.
|
||
| const ( | ||
| objectCount = 100_000 | ||
| nsCount = 100 | ||
| objectsPerNS = objectCount / nsCount // 1_000 per namespace | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| deleteNSCount = 4 | ||
| createWorkers = 50 | ||
| labelKey = "large-restore" | ||
| ) | ||
|
|
||
| // Ordered: write objects → take snapshot → delete objects → restore → verify. | ||
| Describe("snapshot and restore with 100,000 objects", Ordered, func() { | ||
| var ( | ||
| nsPrefix string | ||
| snapshotPath string | ||
| labelVal string | ||
| ) | ||
|
|
||
| BeforeAll(func(ctx context.Context) { | ||
| nsPrefix = "large-restore-" + random.String(6) | ||
| snapshotPath = "container:///snapshot-data/" + nsPrefix + ".tar.gz" | ||
| labelVal = nsPrefix | ||
| cleanupAllSnapshotArtifacts(ctx, s.hostClient, s.vClusterNS) | ||
|
|
||
| for i := range nsCount { | ||
|
jjaferson marked this conversation as resolved.
|
||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i) | ||
| _, err := s.vClusterClient.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ | ||
| ObjectMeta: metav1.ObjectMeta{Name: ns}, | ||
| }, metav1.CreateOptions{}) | ||
| Expect(err).To(Or(Succeed(), Satisfy(kerrors.IsAlreadyExists))) | ||
| } | ||
| }) | ||
|
|
||
| It("Writes 100,000 configmaps into the tenant cluster", func(ctx context.Context) { | ||
| By("Creating 1,000 configmaps in each of 100 namespaces concurrently", func() { | ||
| // The default client-go rate limiter (5 QPS, burst 10) would make 100k creates | ||
| // take ~20,000 seconds. Use a high-QPS client so the goroutines actually run in | ||
| // parallel and we hit the server's throughput ceiling instead. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you roughly know how this performs on GH runners? I'm a little concerned about it taking forever there or just straight up failing.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've tested it locally. The goal is to run the requests in parallel without being blocked by the rate limiter in client-go: with the default settings, the first 10 requests go through immediately, and after that only one request goes out every ~200 ms. |
||
| currentClusterName := cluster.CurrentClusterNameFrom(ctx) | ||
| bulkRestConfig := *cluster.From(ctx, currentClusterName).KubernetesRestConfig() | ||
| bulkRestConfig.QPS = 500 | ||
| bulkRestConfig.Burst = 1000 | ||
| bulkClient, err := kubernetes.NewForConfig(&bulkRestConfig) | ||
| Expect(err).To(Succeed()) | ||
|
|
||
| eg, egCtx := errgroup.WithContext(ctx) | ||
| ch := make(chan int, objectCount) | ||
| for i := range objectCount { | ||
| ch <- i | ||
| } | ||
| close(ch) | ||
|
|
||
| for range createWorkers { | ||
| eg.Go(func() error { | ||
| for i := range ch { | ||
| // To avoid creating a channel for each namespace, we calculate the namespace and object name from the index. | ||
| // eg: object 0 goes to ns-0, object 1 to ns-0, ..., object 999 to ns-0, object 1000 to ns-1, etc. | ||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i/objectsPerNS) | ||
| name := fmt.Sprintf("obj-%d", i%objectsPerNS) | ||
| _, createErr := bulkClient.CoreV1().ConfigMaps(ns).Create(egCtx, &corev1.ConfigMap{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: name, | ||
| Namespace: ns, | ||
| Labels: map[string]string{labelKey: labelVal}, | ||
| }, | ||
| }, metav1.CreateOptions{}) | ||
| if createErr != nil { | ||
| return fmt.Errorf("create configmap %s/%s: %w", ns, name, createErr) | ||
| } | ||
| } | ||
| return nil | ||
| }) | ||
| } | ||
| Expect(eg.Wait()).To(Succeed()) | ||
| }) | ||
| }) | ||
|
|
||
| It("Takes a snapshot of the tenant cluster", func(ctx context.Context) { | ||
| createSnapshot(s.vClusterName, s.vClusterNS, true, snapshotPath, false) | ||
| waitForRequestToFinish(ctx, s.hostClient, s.vClusterNS, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This open-codes a |
||
| pkgconstants.SnapshotRequestLabel, snapshot.UnmarshalSnapshotRequest, | ||
| constants.PollingTimeoutExtraLong) | ||
| }) | ||
|
|
||
| It("Deletes the first 4 namespaces and waits for them to be removed", func(ctx context.Context) { | ||
| By("Deleting namespaces 0 through 3", func() { | ||
| for i := range deleteNSCount { | ||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i) | ||
| err := s.vClusterClient.CoreV1().Namespaces().Delete(ctx, ns, metav1.DeleteOptions{}) | ||
| Expect(err).To(Or(Succeed(), Satisfy(kerrors.IsNotFound))) | ||
| } | ||
| }) | ||
|
|
||
| By("Waiting for the 4 namespaces to be fully terminated", func() { | ||
| Eventually(func(g Gomega, ctx context.Context) { | ||
| for i := range deleteNSCount { | ||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i) | ||
| _, err := s.vClusterClient.CoreV1().Namespaces().Get(ctx, ns, metav1.GetOptions{}) | ||
| g.Expect(kerrors.IsNotFound(err)).To(BeTrue(), | ||
| "namespace %s should be deleted but still exists", ns) | ||
| } | ||
| }).WithContext(ctx).WithPolling(constants.PollingInterval).WithTimeout(constants.PollingTimeoutExtraLong).Should(Succeed()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using a 90-minute polling timeout for namespace deletion conflates two concerns: the overall test budget and the expected operation latency. A real 90m wait on namespace termination usually masks a stuck finalizer rather than legitimately taking that long. Consider scoping each |
||
| }) | ||
| }) | ||
|
|
||
| It("Restores snapshot and verifies the deleted namespaces are back", func(ctx context.Context) { | ||
| By("Restoring the tenant cluster from snapshot", func() { | ||
| restoreVCluster(ctx, s.hostClient, s.vClusterName, s.vClusterNS, snapshotPath, true, false) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The large-restore suite uses |
||
| s.refreshClient(ctx) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| }) | ||
|
|
||
| By("Verifying the deleted namespaces and their configmaps are restored", func() { | ||
| Eventually(func(g Gomega, ctx context.Context) { | ||
| for i := range deleteNSCount { | ||
|
jjaferson marked this conversation as resolved.
|
||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i) | ||
| _, err := s.vClusterClient.CoreV1().Namespaces().Get(ctx, ns, metav1.GetOptions{}) | ||
| g.Expect(err).To(Succeed(), "namespace %s should exist after restore", ns) | ||
|
|
||
| count, err := countConfigMapsByLabel(ctx, s.vClusterClient, ns, labelKey+"="+labelVal) | ||
| g.Expect(err).To(Succeed()) | ||
| g.Expect(count).To(Equal(objectsPerNS), | ||
| "namespace %s should have %d configmaps after restore, got %d", ns, objectsPerNS, count) | ||
| } | ||
| }).WithContext(ctx).WithPolling(constants.PollingInterval).WithTimeout(constants.PollingTimeoutExtraLong).Should(Succeed()) | ||
| }) | ||
|
|
||
| By("Spot-checking non-deleted namespaces survived restore", func() { | ||
| for _, i := range []int{nsCount / 4, nsCount / 2, nsCount - 1} { | ||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i) | ||
| count, err := countConfigMapsByLabel(ctx, s.vClusterClient, ns, labelKey+"="+labelVal) | ||
| Expect(err).To(Succeed()) | ||
| Expect(count).To(Equal(objectsPerNS), | ||
| "non-deleted namespace %s should have %d configmaps after restore, got %d", ns, objectsPerNS, count) | ||
| } | ||
| }) | ||
| }) | ||
|
|
||
| AfterAll(func(ctx context.Context) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this go into a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question, I am using AfterAll: splits creation and cleanup across two functions.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case
➕ here, for the sake of consistency, it's probably better to use The benefit of having All this being said, we could refactor cleanup code in tests and use
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the whole |
||
| for i := range nsCount { | ||
| ns := fmt.Sprintf("%s-%d", nsPrefix, i) | ||
| err := s.vClusterClient.CoreV1().Namespaces().Delete(ctx, ns, metav1.DeleteOptions{}) | ||
| Expect(err).To(Or(Succeed(), Satisfy(kerrors.IsNotFound))) | ||
| } | ||
| deleteSnapshotRequestConfigMaps(ctx, s.hostClient, s.vClusterNS) | ||
| }) | ||
| }) | ||
| } | ||
|
|
||
| func getTwoSnapshotRequests(g Gomega, ctx context.Context, hostClient kubernetes.Interface, vClusterNamespace string) (*snapshot.Request, *snapshot.Request) { | ||
| configMaps, err := hostClient.CoreV1().ConfigMaps(vClusterNamespace).List(ctx, metav1.ListOptions{ | ||
| LabelSelector: pkgconstants.SnapshotRequestLabel, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,9 @@ | ||
| controlPlane: | ||
| distro: | ||
| k8s: | ||
| apiServer: | ||
| extraArgs: | ||
| - "--etcd-compaction-interval=1h" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding |
||
| statefulSet: | ||
| image: | ||
| registry: "" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The matching `timeout: 90m` was not mirrored in `.github/workflows/e2e-ginkgo-nightly.yaml:117`, which still passes `timeout: "60m"` to `run-ginkgo`. If the `snapshot-large-restore` label is ever included in a nightly run (e.g. via `workflow_dispatch` or once added to a default label set), Ginkgo will abort after 60 minutes — before the three 90-minute `Eventually`/`waitForRequestToFinish` calls in `test_snapshot.go` (lines 894, 916, 938) can complete. Consider bumping the nightly workflow's timeout too, or documenting why it intentionally stays at 60m.