-
Notifications
You must be signed in to change notification settings - Fork 211
CNTR-1: fix stale gNOI connection post reboot + Implement PushConfig for static bind. #5367
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -991,15 +991,43 @@ func TestContainerPersistenceAfterColdReboot(t *testing.T) { | |||||
|
|
||||||
| t.Run("VerifyPersistence", func(t *testing.T) { | ||||||
| t.Log("Waiting for DUT to reboot and reconnect...") | ||||||
|
|
||||||
| // Wait for reboot. | ||||||
| time.Sleep(8 * time.Minute) | ||||||
| maxRebootTime := 8 * time.Minute | ||||||
| ticker := time.NewTicker(30 * time.Second) | ||||||
| defer ticker.Stop() | ||||||
| timeout := time.After(maxRebootTime) | ||||||
| var deviceWentDown bool | ||||||
|
|
||||||
| rebootLoop: | ||||||
| for { | ||||||
| select { | ||||||
| case <-timeout: | ||||||
| t.Fatalf("Timeout exceeded: DUT did not reboot within %v seconds.", maxRebootTime) | ||||||
|
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error message 'DUT did not reboot' is slightly misleading if the device actually went down but failed to become reachable again. Additionally, using %v seconds with a time.Duration results in a redundant unit (e.g., '8m0s seconds'). Using t.Fatalf is preferred here as the failure makes subsequent test steps meaningless.
Suggested change
References
|
||||||
| case <-ticker.C: | ||||||
| // use GNOI to refresh the stale cached connection post reboot. | ||||||
| sysClient := dut.RawAPIs().GNOI(t).System() | ||||||
| _, err := sysClient.Time(ctx, &gspb.TimeRequest{}) | ||||||
|
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Time RPC call uses the test context ctx, which has a long timeout (8 minutes). If the device is in a state where it accepts connections but hangs on RPCs, this call could block the polling loop for a long time. It's safer to use a shorter timeout for each individual reachability check:
```go
tctx, cancel := context.WithTimeout(ctx, 10*time.Second)
_, err := sysClient.Time(tctx, &gspb.TimeRequest{})
cancel()
```
|
||||||
| if err != nil { | ||||||
| if !deviceWentDown { | ||||||
| t.Logf("Device is now unreachable. Waiting for it to come back up.") | ||||||
| deviceWentDown = true | ||||||
| } | ||||||
| } else { | ||||||
| if deviceWentDown { | ||||||
| t.Logf("Device rebooted successfully.") | ||||||
| break rebootLoop | ||||||
| } | ||||||
| t.Logf("Device is still reachable; reboot hasn't started yet.") | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Poll for container state. | ||||||
| cli = containerztest.Client(t, dut) | ||||||
|
|
||||||
| // Use a generous timeout for the device to come back up and the container to start. | ||||||
| timeout := 5 * time.Minute | ||||||
| if err := containerztest.WaitForRunning(ctx, t, cli, instanceName, timeout); err != nil { | ||||||
| if err := containerztest.WaitForRunning(ctx, t, cli, instanceName, 5*time.Minute); err != nil { | ||||||
| t.Errorf("Container persistence failed: %v", err) | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -175,6 +175,40 @@ func (d *staticDUT) reset(ctx context.Context) error { | |||||||||||||||||||||||||||||||
| return resetGRIBI(ctx, d) | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| func (d *staticDUT) PushConfig(ctx context.Context, config string, reset bool) error { | ||||||||||||||||||||||||||||||||
| if reset { | ||||||||||||||||||||||||||||||||
| if err := resetGNMI(ctx, d); err != nil { | ||||||||||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| if config == "" { | ||||||||||||||||||||||||||||||||
| return nil | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| setRequest := &gpb.SetRequest{Update: []*gpb.Update{ | ||||||||||||||||||||||||||||||||
| { | ||||||||||||||||||||||||||||||||
| Path: &gpb.Path{ | ||||||||||||||||||||||||||||||||
| Origin: "cli", | ||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||
| Val: &gpb.TypedValue{ | ||||||||||||||||||||||||||||||||
| Value: &gpb.TypedValue_AsciiVal{ | ||||||||||||||||||||||||||||||||
| AsciiVal: config, | ||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||
| }} | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| gnmiClient, err := d.DialGNMI(ctx) | ||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| if _, err := gnmiClient.Set(ctx, setRequest); err != nil { | ||||||||||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
Comment on lines
+201
to
+207
Contributor
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The call to d.DialGNMI(ctx) creates a new gRPC connection that is never closed, leading to a resource leak. Since PushConfig is an internal method of staticDUT, you should use dialConn directly to obtain the connection and ensure it is closed after the operation.
Suggested change
|
||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| return nil | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| func (d *staticDUT) DialGNMI(ctx context.Context, opts ...grpc.DialOption) (gpb.GNMIClient, error) { | ||||||||||||||||||||||||||||||||
| conn, err := dialConn(ctx, d, introspect.GNMI, opts) | ||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A 30-second ticker interval might be too long for detecting a reboot, especially on virtual devices or fast-rebooting hardware. If the device reboots and comes back up within the 30-second window, the loop might miss the 'down' state and eventually time out. Consider reducing the interval to 5 or 10 seconds for better reliability.