diff --git a/cmd/tidb-operator/main.go b/cmd/tidb-operator/main.go index f6eaa40d947..03e69507ad9 100644 --- a/cmd/tidb-operator/main.go +++ b/cmd/tidb-operator/main.go @@ -294,6 +294,10 @@ func setup(ctx context.Context, mgr ctrl.Manager) error { tsocm.Start(ctx) rmcm.Start(ctx) + if err := metrics.RegisterAbnormalInstanceCleanup(ctx, mgr); err != nil { + return fmt.Errorf("register abnormal-instance cleanup: %w", err) + } + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { return fmt.Errorf("unable to set up health check: %w", err) } diff --git a/pkg/controllers/pd/builder.go b/pkg/controllers/pd/builder.go index 36d2bd41b10..0bca5f98b6b 100644 --- a/pkg/controllers/pd/builder.go +++ b/pkg/controllers/pd/builder.go @@ -39,11 +39,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task task.IfBreak(common.CondObjectIsDeleting[scope.PD](state), tasks.TaskDeleteMember(state), common.TaskInstanceFinalizerDel[scope.PD](state, r.Client, common.DefaultInstanceSubresourceLister), - // TODO(liubo02): if the finalizer has been removed, no need to update status - common.TaskInstanceConditionSynced[scope.PD](state), - common.TaskInstanceConditionReady[scope.PD](state), - common.TaskInstanceConditionRunning[scope.PD](state), - common.TaskStatusPersister[scope.PD](state, r.Client), ), common.TaskFinalizerAdd[scope.PD](state, r.Client), diff --git a/pkg/controllers/scheduling/builder.go b/pkg/controllers/scheduling/builder.go index 8f7ccb72902..1988af7605c 100644 --- a/pkg/controllers/scheduling/builder.go +++ b/pkg/controllers/scheduling/builder.go @@ -44,10 +44,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task // tasks.TaskContextInfoFromPD(state, r.PDClientManager), task.IfBreak(common.CondObjectIsDeleting[scope.Scheduling](state), common.TaskInstanceFinalizerDel[scope.Scheduling](state, r.Client, common.DefaultInstanceSubresourceLister), - common.TaskInstanceConditionSynced[scope.Scheduling](state), - common.TaskInstanceConditionReady[scope.Scheduling](state), - common.TaskInstanceConditionRunning[scope.Scheduling](state), - common.TaskStatusPersister[scope.Scheduling](state, r.Client), ), common.TaskFinalizerAdd[scope.Scheduling](state, r.Client), diff --git a/pkg/controllers/ticdc/builder.go b/pkg/controllers/ticdc/builder.go index 790301a83fa..6315669fbf4 100644 --- a/pkg/controllers/ticdc/builder.go +++ b/pkg/controllers/ticdc/builder.go @@ -42,11 +42,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task task.IfBreak(common.CondObjectIsDeleting[scope.TiCDC](state), common.TaskInstanceFinalizerDel[scope.TiCDC](state, r.Client, common.DefaultInstanceSubresourceLister), - // TODO(liubo02): if the finalizer has been removed, no need to update status - common.TaskInstanceConditionSynced[scope.TiCDC](state), - common.TaskInstanceConditionReady[scope.TiCDC](state), - common.TaskInstanceConditionRunning[scope.TiCDC](state), - common.TaskStatusPersister[scope.TiCDC](state, r.Client), ), common.TaskFinalizerAdd[scope.TiCDC](state, r.Client), diff --git a/pkg/controllers/tidb/builder.go b/pkg/controllers/tidb/builder.go index 3eba7b86b09..e09e1106ca8 100644 --- a/pkg/controllers/tidb/builder.go +++ b/pkg/controllers/tidb/builder.go @@ -48,11 +48,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task task.IfBreak(common.CondObjectIsDeleting[scope.TiDB](state), common.TaskInstanceFinalizerDel[scope.TiDB](state, r.Client, common.DefaultInstanceSubresourceLister), - // TODO(liubo02): if the finalizer has been removed, no need to update status - common.TaskInstanceConditionSynced[scope.TiDB](state), - common.TaskInstanceConditionReady[scope.TiDB](state), - common.TaskInstanceConditionRunning[scope.TiDB](state), - common.TaskStatusPersister[scope.TiDB](state, r.Client), ), common.TaskFinalizerAdd[scope.TiDB](state, r.Client), diff --git a/pkg/controllers/tikvworker/builder.go b/pkg/controllers/tikvworker/builder.go index 115abcdeec4..fbb9831bfa1 100644 --- a/pkg/controllers/tikvworker/builder.go +++ b/pkg/controllers/tikvworker/builder.go @@ -42,11 +42,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task task.IfBreak(common.CondObjectIsDeleting[scope.TiKVWorker](state), common.TaskInstanceFinalizerDel[scope.TiKVWorker](state, r.Client, common.DefaultInstanceSubresourceLister), - // TODO(liubo02): if the finalizer has been removed, no need to update status - common.TaskInstanceConditionSynced[scope.TiKVWorker](state), - common.TaskInstanceConditionReady[scope.TiKVWorker](state), - common.TaskInstanceConditionRunning[scope.TiKVWorker](state), - common.TaskStatusPersister[scope.TiKVWorker](state, r.Client), ), common.TaskFinalizerAdd[scope.TiKVWorker](state, r.Client), diff --git a/pkg/controllers/tiproxy/builder.go b/pkg/controllers/tiproxy/builder.go index 106219be402..3ad310bfb1d 100644 --- a/pkg/controllers/tiproxy/builder.go +++ b/pkg/controllers/tiproxy/builder.go @@ -44,11 +44,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task task.IfBreak(common.CondObjectIsDeleting[scope.TiProxy](state), common.TaskInstanceFinalizerDel[scope.TiProxy](state, r.Client, common.DefaultInstanceSubresourceLister), - // TODO(liubo02): if the finalizer has been removed, no need to update status - common.TaskInstanceConditionSynced[scope.TiProxy](state), - common.TaskInstanceConditionReady[scope.TiProxy](state), - common.TaskInstanceConditionRunning[scope.TiProxy](state), - common.TaskStatusPersister[scope.TiProxy](state, r.Client), ), common.TaskFinalizerAdd[scope.TiProxy](state, r.Client), diff --git a/pkg/controllers/tso/builder.go b/pkg/controllers/tso/builder.go index afc37fb6457..8e124abb501 100644 --- a/pkg/controllers/tso/builder.go +++ b/pkg/controllers/tso/builder.go @@ -44,11 +44,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task // tasks.TaskContextInfoFromPD(state, r.PDClientManager), task.IfBreak(common.CondObjectIsDeleting[scope.TSO](state), common.TaskInstanceFinalizerDel[scope.TSO](state, r.Client, common.DefaultInstanceSubresourceLister), - // TODO(liubo02): if the finalizer has been removed, no need to update status - common.TaskInstanceConditionSynced[scope.TSO](state), - common.TaskInstanceConditionReady[scope.TSO](state), - common.TaskInstanceConditionRunning[scope.TSO](state), - common.TaskStatusPersister[scope.TSO](state, r.Client), ), common.TaskFinalizerAdd[scope.TSO](state, r.Client), diff --git a/pkg/metrics/abnormal_instance.go b/pkg/metrics/abnormal_instance.go index 8fceb702afc..0abbb71f2c0 100644 --- a/pkg/metrics/abnormal_instance.go +++ b/pkg/metrics/abnormal_instance.go @@ -58,18 +58,21 @@ func ObserveCondition(obj client.Object, conds []metav1.Condition, condType stri } // ClearInstanceConditionMetrics removes every tracked-condition series for -// the given instance. +// the given instance. Two call sites keep the gauge honest: // -// Called from TaskInstanceFinalizerDel after the finalizer is removed, so -// every component that uses the standard finalize task is covered without -// per-builder wiring. Component builders short-circuit the deletion path -// with task.IfBreak around CondClusterIsDeleting / CondObjectIsDeleting, so -// the normal TaskInstanceConditionSynced / TaskInstanceConditionReady tasks -// (where ObserveCondition lives) never run during finalization; without -// this explicit cleanup, the gauge series would stay present at its last -// value forever, triggering false-positive `metric == 1 for: ` -// alerts on a non-existent instance and growing label cardinality across -// each cluster lifecycle. +// - TaskInstanceFinalizerDel, after the finalizer is removed. Covers the +// normal graceful delete path. The deletion branch of every instance +// builder runs only FinalizerDel (no CondSynced/Ready/Running afterwards), +// so the clear done here is not re-populated within the same reconcile. +// +// - RegisterAbnormalInstanceCleanup, on shared-informer DELETE events. +// Covers paths that bypass the finalizer entirely (force-delete via +// `patch metadata.finalizers=null`, operator down during delete, etc.). +// +// Without both paths, a leaked series stays present at its last value forever, +// triggering false-positive `metric == 1 for: ` alerts on a +// non-existent instance and growing label cardinality across each cluster +// lifecycle. func ClearInstanceConditionMetrics(obj client.Object) { base := instanceMetricBaseLabels(obj) for _, condType := range trackedConditions { diff --git a/pkg/metrics/cleanup_handler.go b/pkg/metrics/cleanup_handler.go new file mode 100644 index 00000000000..febcffa2ef4 --- /dev/null +++ b/pkg/metrics/cleanup_handler.go @@ -0,0 +1,80 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "context" + "fmt" + + toolscache "k8s.io/client-go/tools/cache" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/pingcap/tidb-operator/api/v2/core/v1alpha1" +) + +// abnormalInstanceKinds lists every instance CR that writes to the +// AbnormalInstance gauge via TaskInstanceConditionSynced / Ready. Keep this in +// sync with the controllers that wire those tasks into their builder. +var abnormalInstanceKinds = []client.Object{ + &v1alpha1.PD{}, + &v1alpha1.TiDB{}, + &v1alpha1.TiKV{}, + &v1alpha1.TiFlash{}, + &v1alpha1.TiCDC{}, + &v1alpha1.TSO{}, + &v1alpha1.Scheduling{}, + &v1alpha1.TiProxy{}, + &v1alpha1.TiKVWorker{}, +} + +// RegisterAbnormalInstanceCleanup attaches a watch-level delete handler on +// every instance CR so the AbnormalInstance gauge is cleared the moment the +// object disappears from the API server, even when the deletion bypasses the +// operator's finalizer (e.g. a manual `patch metadata.finalizers=null` used to +// unwedge a stuck instance). Without this, such paths skip +// TaskInstanceFinalizerDel entirely and leave the gauge at its last value +// forever, causing false-positive `metric == 1 for: ` alerts on a +// non-existent instance and unbounded label-cardinality growth. +// +// This complements, rather than replaces, the finalizer-time cleanup: the +// finalizer path covers graceful deletes and short-circuits the observe loop +// before it can re-populate the gauge, while this handler covers the paths +// that never reach the finalizer at all. +func RegisterAbnormalInstanceCleanup(ctx context.Context, mgr ctrl.Manager) error { + for _, obj := range abnormalInstanceKinds { + inf, err := mgr.GetCache().GetInformer(ctx, obj) + if err != nil { + return fmt.Errorf("get informer for %T: %w", obj, err) + } + if _, err := inf.AddEventHandler(toolscache.ResourceEventHandlerFuncs{ + DeleteFunc: clearOnDelete, + }); err != nil { + return fmt.Errorf("add delete handler for %T: %w", obj, err) + } + } + return nil +} + +func clearOnDelete(o interface{}) { + if t, ok := o.(toolscache.DeletedFinalStateUnknown); ok { + o = t.Obj + } + co, ok := o.(client.Object) + if !ok { + return + } + ClearInstanceConditionMetrics(co) +} diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 1f40b56f085..81208e430d4 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -26,6 +26,7 @@ import ( _ "github.com/pingcap/tidb-operator/v2/tests/e2e/br" _ "github.com/pingcap/tidb-operator/v2/tests/e2e/cluster" _ "github.com/pingcap/tidb-operator/v2/tests/e2e/example" + _ "github.com/pingcap/tidb-operator/v2/tests/e2e/metrics" _ "github.com/pingcap/tidb-operator/v2/tests/e2e/pd" _ "github.com/pingcap/tidb-operator/v2/tests/e2e/ticdc" _ "github.com/pingcap/tidb-operator/v2/tests/e2e/tidb" diff --git a/tests/e2e/metrics/abnormal_instance_leak.go b/tests/e2e/metrics/abnormal_instance_leak.go new file mode 100644 index 00000000000..407c9d3a2e2 --- /dev/null +++ b/tests/e2e/metrics/abnormal_instance_leak.go @@ -0,0 +1,145 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "context" + "fmt" + "time" + + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/pingcap/tidb-operator/api/v2/core/v1alpha1" + "github.com/pingcap/tidb-operator/v2/pkg/apicall" + "github.com/pingcap/tidb-operator/v2/pkg/runtime/scope" + "github.com/pingcap/tidb-operator/v2/tests/e2e/data" + "github.com/pingcap/tidb-operator/v2/tests/e2e/framework" + "github.com/pingcap/tidb-operator/v2/tests/e2e/label" + "github.com/pingcap/tidb-operator/v2/tests/e2e/utils/k8s" + metricutil "github.com/pingcap/tidb-operator/v2/tests/e2e/utils/metrics" +) + +const ( + operatorNamespace = "tidb-admin" + operatorDeploymentName = "tidb-operator" + abnormalInstanceMetric = "tidb_operator_abnormal_instance" +) + +var _ = ginkgo.Describe("Abnormal Instance Gauge Leak", label.TiKV, func() { + f := framework.New() + f.Setup() + + ginkgo.It("clears gauge series when the instance CR is deleted gracefully", label.P1, label.Delete, func(ctx context.Context) { + pdg := f.MustCreatePD(ctx) + kvg := f.MustCreateTiKV(ctx, data.WithReplicas[scope.TiKVGroup](1)) + f.WaitForPDGroupReady(ctx, pdg) + f.WaitForTiKVGroupReady(ctx, kvg) + + restConfig, err := k8s.LoadConfig() + f.Must(err) + + kvs, err := apicall.ListInstances[scope.TiKVGroup](ctx, f.Client, kvg) + f.Must(err) + gomega.Expect(kvs).To(gomega.HaveLen(1)) + target := kvs[0] + + ginkgo.By("waiting for the gauge series to be populated for the tikv instance") + gomega.Eventually(func() error { + return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, true) + }, 3*time.Minute, 5*time.Second).Should(gomega.Succeed()) + + ginkgo.By("deleting the tikv group to trigger graceful deletion of the instance") + f.Must(f.Client.Delete(ctx, kvg)) + + ginkgo.By("waiting for the gauge series to disappear") + gomega.Eventually(func() error { + return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, false) + }, 3*time.Minute, 5*time.Second).Should(gomega.Succeed()) + }) + + ginkgo.It("clears gauge series on force-delete that bypasses the finalizer", label.P1, label.Delete, func(ctx context.Context) { + pdg := f.MustCreatePD(ctx) + kvg := f.MustCreateTiKV(ctx, data.WithReplicas[scope.TiKVGroup](1)) + f.WaitForPDGroupReady(ctx, pdg) + f.WaitForTiKVGroupReady(ctx, kvg) + + restConfig, err := k8s.LoadConfig() + f.Must(err) + + kvs, err := apicall.ListInstances[scope.TiKVGroup](ctx, f.Client, kvg) + f.Must(err) + gomega.Expect(kvs).To(gomega.HaveLen(1)) + target := kvs[0] + + ginkgo.By("waiting for the gauge series to be populated for the tikv instance") + gomega.Eventually(func() error { + return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, true) + }, 3*time.Minute, 5*time.Second).Should(gomega.Succeed()) + + ginkgo.By("deleting the tikv CR then stripping its finalizers to force GC without waiting on the finalize task") + f.Must(f.Client.Delete(ctx, target)) + + // After Delete sets DeletionTimestamp, clearing finalizers causes the API + // server to GC the object immediately, skipping TaskInstanceFinalizerDel. + gomega.Eventually(func() error { + latest := &v1alpha1.TiKV{} + if err := f.Client.Get(ctx, client.ObjectKeyFromObject(target), latest); err != nil { + // Already gone; nothing more to do. + return nil + } + if len(latest.Finalizers) == 0 { + return nil + } + patch := client.MergeFrom(latest.DeepCopy()) + latest.Finalizers = nil + return f.Client.Patch(ctx, latest, patch) + }, 30*time.Second, time.Second).Should(gomega.Succeed()) + + ginkgo.By("waiting for the gauge series to be cleared by the informer DELETE handler") + gomega.Eventually(func() error { + return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, false) + }, 2*time.Minute, 5*time.Second).Should(gomega.Succeed()) + + // Stop the owner group so cleanup completes before the spec exits. + f.Must(f.Client.Delete(ctx, kvg)) + }) +}) + +// expectInstanceSample scrapes the operator pod and returns nil when the +// presence of a sample matching (namespace, instance) equals want. +func expectInstanceSample(ctx context.Context, restConfig *rest.Config, namespace, instance string, want bool) error { + samples, err := metricutil.FetchOperatorMetric(ctx, restConfig, + operatorNamespace, operatorDeploymentName, abnormalInstanceMetric) + if err != nil { + return err + } + if hasInstanceSample(samples, namespace, instance) != want { + return fmt.Errorf("sample presence for %s/%s: got %v, want %v", + namespace, instance, !want, want) + } + return nil +} + +func hasInstanceSample(samples []metricutil.MetricSample, namespace, instance string) bool { + for _, s := range samples { + if s.Labels["namespace"] == namespace && s.Labels["instance"] == instance { + return true + } + } + return false +} diff --git a/tests/e2e/utils/metrics/metrics.go b/tests/e2e/utils/metrics/metrics.go index f77f5a2f3a5..6058d2e74df 100644 --- a/tests/e2e/utils/metrics/metrics.go +++ b/tests/e2e/utils/metrics/metrics.go @@ -27,6 +27,8 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" + dto "github.com/prometheus/client_model/go" + "github.com/pingcap/tidb-operator/v2/tests/e2e/utils/portforwarder" ) @@ -144,3 +146,95 @@ func parseOperatorPanicMetric(metricsReader io.Reader) (float64, error) { // If metric not found, it means no panics have occurred (metric not yet emitted) return 0, nil } + +// MetricSample is one Prometheus time series scraped from the operator pod. +// Value carries the raw number for counter / gauge samples. +type MetricSample struct { + Labels map[string]string + Value float64 +} + +// FetchOperatorMetric scrapes the operator's /metrics endpoint and returns +// every sample of the requested metric name across all operator pods. With +// HA replicas, the same logical series can appear more than once. +func FetchOperatorMetric(ctx context.Context, restConfig *rest.Config, ns, deploy, metricName string) ([]MetricSample, error) { + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("failed to create kubernetes client: %w", err) + } + + d, err := clientset.AppsV1().Deployments(ns).Get(ctx, deploy, metav1.GetOptions{}) + if err != nil { + return nil, err + } + + pods, err := clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{ + LabelSelector: metav1.FormatLabelSelector(d.Spec.Selector), + }) + if err != nil { + return nil, fmt.Errorf("failed to list operator pods: %w", err) + } + if len(pods.Items) == 0 { + return nil, fmt.Errorf("no operator pod found in namespace %s with label %s", + ns, metav1.FormatLabelSelector(d.Spec.Selector)) + } + + pf := portforwarder.New(restConfig) + var samples []MetricSample + for i := range pods.Items { + pod := &pods.Items[i] + fam, err := fetchMetricFamiliesFromPod(ctx, pf, pod) + if err != nil { + return nil, fmt.Errorf("cannot get metrics of pod %v: %w", pod.Name, err) + } + mf, ok := fam[metricName] + if !ok { + continue + } + for _, m := range mf.GetMetric() { + s := MetricSample{Labels: map[string]string{}} + for _, lp := range m.GetLabel() { + s.Labels[lp.GetName()] = lp.GetValue() + } + switch { + case m.Gauge != nil: + s.Value = m.Gauge.GetValue() + case m.Counter != nil: + s.Value = m.Counter.GetValue() + } + samples = append(samples, s) + } + } + return samples, nil +} + +func fetchMetricFamiliesFromPod(ctx context.Context, pf portforwarder.PortForwarder, pod *corev1.Pod) (map[string]*dto.MetricFamily, error) { + forwarded, err := pf.ForwardPod(ctx, pod, fmt.Sprintf(":%d", OperatorMetricsPort)) + if err != nil { + return nil, fmt.Errorf("failed to port forward: %w", err) + } + defer forwarded.Cancel() + + localPort, ok := forwarded.Local(OperatorMetricsPort) + if !ok { + return nil, fmt.Errorf("failed to get local port for metrics") + } + + metricsURL := fmt.Sprintf("http://localhost:%d/metrics", localPort) + req, err := http.NewRequestWithContext(ctx, "GET", metricsURL, nil) + if err != nil { + return nil, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", metricsURL, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("metrics endpoint returned status %d", resp.StatusCode) + } + + parser := expfmt.TextParser{} + return parser.TextToMetricFamilies(resp.Body) +}