Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cmd/tidb-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,10 @@ func setup(ctx context.Context, mgr ctrl.Manager) error {
tsocm.Start(ctx)
rmcm.Start(ctx)

if err := metrics.RegisterAbnormalInstanceCleanup(ctx, mgr); err != nil {
return fmt.Errorf("register abnormal-instance cleanup: %w", err)
}

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up health check: %w", err)
}
Expand Down
5 changes: 0 additions & 5 deletions pkg/controllers/pd/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
task.IfBreak(common.CondObjectIsDeleting[scope.PD](state),
tasks.TaskDeleteMember(state),
common.TaskInstanceFinalizerDel[scope.PD](state, r.Client, common.DefaultInstanceSubresourceLister),
// TODO(liubo02): if the finalizer has been removed, no need to update status
common.TaskInstanceConditionSynced[scope.PD](state),
common.TaskInstanceConditionReady[scope.PD](state),
common.TaskInstanceConditionRunning[scope.PD](state),
common.TaskStatusPersister[scope.PD](state, r.Client),
),
common.TaskFinalizerAdd[scope.PD](state, r.Client),

Expand Down
4 changes: 0 additions & 4 deletions pkg/controllers/scheduling/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
// tasks.TaskContextInfoFromPD(state, r.PDClientManager),
task.IfBreak(common.CondObjectIsDeleting[scope.Scheduling](state),
common.TaskInstanceFinalizerDel[scope.Scheduling](state, r.Client, common.DefaultInstanceSubresourceLister),
common.TaskInstanceConditionSynced[scope.Scheduling](state),
common.TaskInstanceConditionReady[scope.Scheduling](state),
common.TaskInstanceConditionRunning[scope.Scheduling](state),
common.TaskStatusPersister[scope.Scheduling](state, r.Client),
),
common.TaskFinalizerAdd[scope.Scheduling](state, r.Client),

Expand Down
5 changes: 0 additions & 5 deletions pkg/controllers/ticdc/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task

task.IfBreak(common.CondObjectIsDeleting[scope.TiCDC](state),
common.TaskInstanceFinalizerDel[scope.TiCDC](state, r.Client, common.DefaultInstanceSubresourceLister),
// TODO(liubo02): if the finalizer has been removed, no need to update status
common.TaskInstanceConditionSynced[scope.TiCDC](state),
common.TaskInstanceConditionReady[scope.TiCDC](state),
common.TaskInstanceConditionRunning[scope.TiCDC](state),
common.TaskStatusPersister[scope.TiCDC](state, r.Client),
),
common.TaskFinalizerAdd[scope.TiCDC](state, r.Client),

Expand Down
5 changes: 0 additions & 5 deletions pkg/controllers/tidb/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task

task.IfBreak(common.CondObjectIsDeleting[scope.TiDB](state),
common.TaskInstanceFinalizerDel[scope.TiDB](state, r.Client, common.DefaultInstanceSubresourceLister),
// TODO(liubo02): if the finalizer has been removed, no need to update status
common.TaskInstanceConditionSynced[scope.TiDB](state),
common.TaskInstanceConditionReady[scope.TiDB](state),
common.TaskInstanceConditionRunning[scope.TiDB](state),
common.TaskStatusPersister[scope.TiDB](state, r.Client),
),
common.TaskFinalizerAdd[scope.TiDB](state, r.Client),

Expand Down
5 changes: 0 additions & 5 deletions pkg/controllers/tikvworker/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task

task.IfBreak(common.CondObjectIsDeleting[scope.TiKVWorker](state),
common.TaskInstanceFinalizerDel[scope.TiKVWorker](state, r.Client, common.DefaultInstanceSubresourceLister),
// TODO(liubo02): if the finalizer has been removed, no need to update status
common.TaskInstanceConditionSynced[scope.TiKVWorker](state),
common.TaskInstanceConditionReady[scope.TiKVWorker](state),
common.TaskInstanceConditionRunning[scope.TiKVWorker](state),
common.TaskStatusPersister[scope.TiKVWorker](state, r.Client),
),
common.TaskFinalizerAdd[scope.TiKVWorker](state, r.Client),

Expand Down
5 changes: 0 additions & 5 deletions pkg/controllers/tiproxy/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task

task.IfBreak(common.CondObjectIsDeleting[scope.TiProxy](state),
common.TaskInstanceFinalizerDel[scope.TiProxy](state, r.Client, common.DefaultInstanceSubresourceLister),
// TODO(liubo02): if the finalizer has been removed, no need to update status
common.TaskInstanceConditionSynced[scope.TiProxy](state),
common.TaskInstanceConditionReady[scope.TiProxy](state),
common.TaskInstanceConditionRunning[scope.TiProxy](state),
common.TaskStatusPersister[scope.TiProxy](state, r.Client),
),
common.TaskFinalizerAdd[scope.TiProxy](state, r.Client),

Expand Down
5 changes: 0 additions & 5 deletions pkg/controllers/tso/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,6 @@ func (r *Reconciler) NewRunner(state *tasks.ReconcileContext, reporter task.Task
// tasks.TaskContextInfoFromPD(state, r.PDClientManager),
task.IfBreak(common.CondObjectIsDeleting[scope.TSO](state),
common.TaskInstanceFinalizerDel[scope.TSO](state, r.Client, common.DefaultInstanceSubresourceLister),
// TODO(liubo02): if the finalizer has been removed, no need to update status
common.TaskInstanceConditionSynced[scope.TSO](state),
common.TaskInstanceConditionReady[scope.TSO](state),
common.TaskInstanceConditionRunning[scope.TSO](state),
common.TaskStatusPersister[scope.TSO](state, r.Client),
),
common.TaskFinalizerAdd[scope.TSO](state, r.Client),

Expand Down
25 changes: 14 additions & 11 deletions pkg/metrics/abnormal_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,21 @@ func ObserveCondition(obj client.Object, conds []metav1.Condition, condType stri
}

// ClearInstanceConditionMetrics removes every tracked-condition series for
// the given instance.
// the given instance. Two call sites keep the gauge honest:
//
// Called from TaskInstanceFinalizerDel after the finalizer is removed, so
// every component that uses the standard finalize task is covered without
// per-builder wiring. Component builders short-circuit the deletion path
// with task.IfBreak around CondClusterIsDeleting / CondObjectIsDeleting, so
// the normal TaskInstanceConditionSynced / TaskInstanceConditionReady tasks
// (where ObserveCondition lives) never run during finalization; without
// this explicit cleanup, the gauge series would stay present at its last
// value forever, triggering false-positive `metric == 1 for: <duration>`
// alerts on a non-existent instance and growing label cardinality across
// each cluster lifecycle.
// - TaskInstanceFinalizerDel, after the finalizer is removed. Covers the
// normal graceful delete path. The deletion branch of every instance
// builder runs only FinalizerDel (no CondSynced/Ready/Running afterwards),
// so the clear done here is not re-populated within the same reconcile.
//
// - RegisterAbnormalInstanceCleanup, on shared-informer DELETE events.
// Covers paths that bypass the finalizer entirely (force-delete via
// `patch metadata.finalizers=null`, operator down during delete, etc.).
//
// Without both paths, a leaked series stays present at its last value forever,
// triggering false-positive `metric == 1 for: <duration>` alerts on a
// non-existent instance and growing label cardinality across each cluster
// lifecycle.
func ClearInstanceConditionMetrics(obj client.Object) {
base := instanceMetricBaseLabels(obj)
for _, condType := range trackedConditions {
Expand Down
80 changes: 80 additions & 0 deletions pkg/metrics/cleanup_handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2026 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
"context"
"fmt"

toolscache "k8s.io/client-go/tools/cache"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
)

// abnormalInstanceKinds lists every instance CR that writes to the
// AbnormalInstance gauge via TaskInstanceConditionSynced / Ready. Keep this in
// sync with the controllers that wire those tasks into their builder.
var abnormalInstanceKinds = []client.Object{
&v1alpha1.PD{},
&v1alpha1.TiDB{},
&v1alpha1.TiKV{},
&v1alpha1.TiFlash{},
&v1alpha1.TiCDC{},
&v1alpha1.TSO{},
&v1alpha1.Scheduling{},
&v1alpha1.TiProxy{},
&v1alpha1.TiKVWorker{},
}

// RegisterAbnormalInstanceCleanup attaches a watch-level delete handler on
// every instance CR so the AbnormalInstance gauge is cleared the moment the
// object disappears from the API server, even when the deletion bypasses the
// operator's finalizer (e.g. a manual `patch metadata.finalizers=null` used to
// unwedge a stuck instance). Without this, such paths skip
// TaskInstanceFinalizerDel entirely and leave the gauge at its last value
// forever, causing false-positive `metric == 1 for: <duration>` alerts on a
// non-existent instance and unbounded label-cardinality growth.
//
// This complements, rather than replaces, the finalizer-time cleanup: the
// finalizer path covers graceful deletes and short-circuits the observe loop
// before it can re-populate the gauge, while this handler covers the paths
// that never reach the finalizer at all.
func RegisterAbnormalInstanceCleanup(ctx context.Context, mgr ctrl.Manager) error {
for _, obj := range abnormalInstanceKinds {
inf, err := mgr.GetCache().GetInformer(ctx, obj)
if err != nil {
return fmt.Errorf("get informer for %T: %w", obj, err)
}
if _, err := inf.AddEventHandler(toolscache.ResourceEventHandlerFuncs{
DeleteFunc: clearOnDelete,
}); err != nil {
return fmt.Errorf("add delete handler for %T: %w", obj, err)
}
}
return nil
}

func clearOnDelete(o interface{}) {
if t, ok := o.(toolscache.DeletedFinalStateUnknown); ok {
o = t.Obj
}
co, ok := o.(client.Object)
if !ok {
return
}
ClearInstanceConditionMetrics(co)
}
1 change: 1 addition & 0 deletions tests/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/br"
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/cluster"
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/example"
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/metrics"
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/pd"
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/ticdc"
_ "github.com/pingcap/tidb-operator/v2/tests/e2e/tidb"
Expand Down
145 changes: 145 additions & 0 deletions tests/e2e/metrics/abnormal_instance_leak.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Copyright 2026 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
"context"
"fmt"
"time"

"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
"github.com/pingcap/tidb-operator/v2/pkg/apicall"
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
"github.com/pingcap/tidb-operator/v2/tests/e2e/data"
"github.com/pingcap/tidb-operator/v2/tests/e2e/framework"
"github.com/pingcap/tidb-operator/v2/tests/e2e/label"
"github.com/pingcap/tidb-operator/v2/tests/e2e/utils/k8s"
metricutil "github.com/pingcap/tidb-operator/v2/tests/e2e/utils/metrics"
)

const (
operatorNamespace = "tidb-admin"
operatorDeploymentName = "tidb-operator"
abnormalInstanceMetric = "tidb_operator_abnormal_instance"
)

var _ = ginkgo.Describe("Abnormal Instance Gauge Leak", label.TiKV, func() {
f := framework.New()
f.Setup()

ginkgo.It("clears gauge series when the instance CR is deleted gracefully", label.P1, label.Delete, func(ctx context.Context) {
pdg := f.MustCreatePD(ctx)
kvg := f.MustCreateTiKV(ctx, data.WithReplicas[scope.TiKVGroup](1))
f.WaitForPDGroupReady(ctx, pdg)
f.WaitForTiKVGroupReady(ctx, kvg)

restConfig, err := k8s.LoadConfig()
f.Must(err)

kvs, err := apicall.ListInstances[scope.TiKVGroup](ctx, f.Client, kvg)
f.Must(err)
gomega.Expect(kvs).To(gomega.HaveLen(1))
target := kvs[0]

ginkgo.By("waiting for the gauge series to be populated for the tikv instance")
gomega.Eventually(func() error {
return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, true)
}, 3*time.Minute, 5*time.Second).Should(gomega.Succeed())

ginkgo.By("deleting the tikv group to trigger graceful deletion of the instance")
f.Must(f.Client.Delete(ctx, kvg))

ginkgo.By("waiting for the gauge series to disappear")
gomega.Eventually(func() error {
return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, false)
}, 3*time.Minute, 5*time.Second).Should(gomega.Succeed())
})

ginkgo.It("clears gauge series on force-delete that bypasses the finalizer", label.P1, label.Delete, func(ctx context.Context) {
pdg := f.MustCreatePD(ctx)
kvg := f.MustCreateTiKV(ctx, data.WithReplicas[scope.TiKVGroup](1))
f.WaitForPDGroupReady(ctx, pdg)
f.WaitForTiKVGroupReady(ctx, kvg)

restConfig, err := k8s.LoadConfig()
f.Must(err)

kvs, err := apicall.ListInstances[scope.TiKVGroup](ctx, f.Client, kvg)
f.Must(err)
gomega.Expect(kvs).To(gomega.HaveLen(1))
target := kvs[0]

ginkgo.By("waiting for the gauge series to be populated for the tikv instance")
gomega.Eventually(func() error {
return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, true)
}, 3*time.Minute, 5*time.Second).Should(gomega.Succeed())

ginkgo.By("deleting the tikv CR then stripping its finalizers to force GC without waiting on the finalize task")
f.Must(f.Client.Delete(ctx, target))

// After Delete sets DeletionTimestamp, clearing finalizers causes the API
// server to GC the object immediately, skipping TaskInstanceFinalizerDel.
gomega.Eventually(func() error {
latest := &v1alpha1.TiKV{}
if err := f.Client.Get(ctx, client.ObjectKeyFromObject(target), latest); err != nil {
// Already gone; nothing more to do.
return nil
}
if len(latest.Finalizers) == 0 {
return nil
}
patch := client.MergeFrom(latest.DeepCopy())
latest.Finalizers = nil
return f.Client.Patch(ctx, latest, patch)
}, 30*time.Second, time.Second).Should(gomega.Succeed())

ginkgo.By("waiting for the gauge series to be cleared by the informer DELETE handler")
gomega.Eventually(func() error {
return expectInstanceSample(ctx, restConfig, target.Namespace, target.Name, false)
}, 2*time.Minute, 5*time.Second).Should(gomega.Succeed())

// Stop the owner group so cleanup completes before the spec exits.
f.Must(f.Client.Delete(ctx, kvg))
})
})

// expectInstanceSample scrapes the operator pod and returns nil when the
// presence of a sample matching (namespace, instance) equals want.
func expectInstanceSample(ctx context.Context, restConfig *rest.Config, namespace, instance string, want bool) error {
samples, err := metricutil.FetchOperatorMetric(ctx, restConfig,
operatorNamespace, operatorDeploymentName, abnormalInstanceMetric)
if err != nil {
return err
}
if hasInstanceSample(samples, namespace, instance) != want {
return fmt.Errorf("sample presence for %s/%s: got %v, want %v",
namespace, instance, !want, want)
}
return nil
}

func hasInstanceSample(samples []metricutil.MetricSample, namespace, instance string) bool {
for _, s := range samples {
if s.Labels["namespace"] == namespace && s.Labels["instance"] == instance {
return true
}
}
return false
}
Loading
Loading