diff --git a/pkg/controllers/common/task_finalizer.go b/pkg/controllers/common/task_finalizer.go index 8400b5edfba..9198e1e292b 100644 --- a/pkg/controllers/common/task_finalizer.go +++ b/pkg/controllers/common/task_finalizer.go @@ -24,6 +24,7 @@ import ( utilerr "k8s.io/apimachinery/pkg/util/errors" "github.com/pingcap/tidb-operator/v2/pkg/client" + "github.com/pingcap/tidb-operator/v2/pkg/metrics" "github.com/pingcap/tidb-operator/v2/pkg/runtime" "github.com/pingcap/tidb-operator/v2/pkg/runtime/scope" "github.com/pingcap/tidb-operator/v2/pkg/utils/k8s" @@ -226,6 +227,12 @@ func TaskInstanceFinalizerDel[ return task.Fail().With("cannot remove finalizer: %v", err) } + // Finalize succeeded; drop per-instance metric series so deleted + // instances do not leave stale samples in Prometheus. Doing this here + // (rather than as a separate task in every component builder) ensures + // new instance controllers cannot forget the cleanup. + metrics.ClearInstanceConditionMetrics(state.Object()) + return task.Complete().With("finalizer is removed") }) } diff --git a/pkg/controllers/common/task_status.go b/pkg/controllers/common/task_status.go index 35c8a744508..42c2bc54ba2 100644 --- a/pkg/controllers/common/task_status.go +++ b/pkg/controllers/common/task_status.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/tidb-operator/api/v2/core/v1alpha1" coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1" "github.com/pingcap/tidb-operator/v2/pkg/client" + "github.com/pingcap/tidb-operator/v2/pkg/metrics" "github.com/pingcap/tidb-operator/v2/pkg/runtime" "github.com/pingcap/tidb-operator/v2/pkg/runtime/scope" "github.com/pingcap/tidb-operator/v2/pkg/utils/podutil" @@ -220,6 +221,11 @@ func TaskInstanceConditionReady[ state.SetStatusChanged() } + // Refresh the abnormal-instance gauge so a long-running Ready=False + // (e.g. pod up but cannot serve) becomes alertable without any + // per-component glue. + metrics.ObserveCondition(instance, coreutil.StatusConditions[S](instance), v1alpha1.CondReady) + if !isReady { return task.Wait().With("instance is unready: %s", coreutil.SprintCondition(cond)) } @@ -280,6 +286,11 @@ func TaskInstanceConditionSynced[ state.SetStatusChanged() } + // Refresh the abnormal-instance gauge so a long-running Synced=False + // (rolling restart stuck, scale-in stuck, ...) becomes alertable + // without any per-component glue. + metrics.ObserveCondition(instance, coreutil.StatusConditions[S](instance), v1alpha1.CondSynced) + if !isSynced { return task.Wait().With("instance is unsynced: %s", coreutil.SprintCondition(cond)) } diff --git a/pkg/metrics/abnormal_instance.go b/pkg/metrics/abnormal_instance.go new file mode 100644 index 00000000000..8fceb702afc --- /dev/null +++ b/pkg/metrics/abnormal_instance.go @@ -0,0 +1,78 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/pingcap/tidb-operator/api/v2/core/v1alpha1" +) + +// trackedConditions are the condition types observed by ObserveInstance* and +// cleared by ClearInstanceConditionMetrics. Add new types here when extending +// the AbnormalInstance gauge to additional condition signals. +var trackedConditions = []string{v1alpha1.CondSynced, v1alpha1.CondReady} + +// instanceMetricBaseLabels reads the standard well-known labels carried by +// every managed instance (LabelKeyCluster / LabelKeyComponent / LabelKeyGroup) +// plus its namespace and name. The caller appends the per-condition label. +func instanceMetricBaseLabels(obj client.Object) []string { + ls := obj.GetLabels() + return []string{ + obj.GetNamespace(), + ls[v1alpha1.LabelKeyCluster], + ls[v1alpha1.LabelKeyComponent], + ls[v1alpha1.LabelKeyGroup], + obj.GetName(), + } +} + +// ObserveCondition writes 1 to the abnormal-instance gauge when the named +// condition is False; 0 otherwise (True or absent are treated as healthy). +// The series stays present so PromQL `for:` alerts can fire reliably without +// gaps, and so dashboards never see missing samples for managed instances. +// +// condType must be one of trackedConditions so the finalize-time cleanup in +// ClearInstanceConditionMetrics covers the same set of series this writes. +func ObserveCondition(obj client.Object, conds []metav1.Condition, condType string) { + labels := append(instanceMetricBaseLabels(obj), condType) + value := 0.0 + if cond := meta.FindStatusCondition(conds, condType); cond != nil && cond.Status == metav1.ConditionFalse { + value = 1 + } + AbnormalInstance.WithLabelValues(labels...).Set(value) +} + +// ClearInstanceConditionMetrics removes every tracked-condition series for +// the given instance. +// +// Called from TaskInstanceFinalizerDel after the finalizer is removed, so +// every component that uses the standard finalize task is covered without +// per-builder wiring. Component builders short-circuit the deletion path +// with task.IfBreak around CondClusterIsDeleting / CondObjectIsDeleting, so +// the normal TaskInstanceConditionSynced / TaskInstanceConditionReady tasks +// (where ObserveCondition lives) never run during finalization; without +// this explicit cleanup, the gauge series would stay present at its last +// value forever, triggering false-positive `metric == 1 for: ` +// alerts on a non-existent instance and growing label cardinality across +// each cluster lifecycle. +func ClearInstanceConditionMetrics(obj client.Object) { + base := instanceMetricBaseLabels(obj) + for _, condType := range trackedConditions { + AbnormalInstance.DeleteLabelValues(append(base, condType)...) + } +} diff --git a/pkg/metrics/abnormal_instance_test.go b/pkg/metrics/abnormal_instance_test.go new file mode 100644 index 00000000000..a29de512595 --- /dev/null +++ b/pkg/metrics/abnormal_instance_test.go @@ -0,0 +1,149 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/pingcap/tidb-operator/api/v2/core/v1alpha1" +) + +func newTiKVForMetricTest(name string) *v1alpha1.TiKV { + return &v1alpha1.TiKV{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-ns", + Name: name, + Labels: map[string]string{ + v1alpha1.LabelKeyCluster: "test-cluster", + v1alpha1.LabelKeyComponent: "tikv", + v1alpha1.LabelKeyGroup: "test-group", + }, + }, + } +} + +// abnormalGaugeValue reads the AbnormalInstance gauge for the given instance +// + condition. Returns (value, present). present is false when the series has +// not been touched (collector returns the default zero metric without a Gauge +// payload set). +func abnormalGaugeValue(t *testing.T, instance, condition string) (float64, bool) { + t.Helper() + g, err := AbnormalInstance.GetMetricWithLabelValues( + "test-ns", "test-cluster", "tikv", "test-group", instance, condition, + ) + require.NoError(t, err) + m := &dto.Metric{} + require.NoError(t, g.Write(m)) + if m.Gauge == nil { + return 0, false + } + return m.Gauge.GetValue(), true +} + +func gaugeSeriesExists(t *testing.T, instance, condition string) bool { + t.Helper() + ch := make(chan prometheus.Metric, 16) + AbnormalInstance.Collect(ch) + close(ch) + want := []string{"test-ns", "test-cluster", "tikv", "test-group", instance, condition} + for m := range ch { + dm := &dto.Metric{} + if err := m.Write(dm); err != nil { + continue + } + if labelsEqual(dm.GetLabel(), want) { + return true + } + } + return false +} + +func labelsEqual(got []*dto.LabelPair, wantValues []string) bool { + if len(got) != len(InstanceAbnormalMetricLabels) { + return false + } + want := map[string]string{} + for i, k := range InstanceAbnormalMetricLabels { + want[k] = wantValues[i] + } + for _, p := range got { + if want[p.GetName()] != p.GetValue() { + return false + } + } + return true +} + +func TestObserveCondition(t *testing.T) { + cases := []struct { + name string + condType string + status metav1.ConditionStatus + omitCond bool + wantValue float64 + }{ + {name: "Synced=False sets 1", condType: v1alpha1.CondSynced, status: metav1.ConditionFalse, wantValue: 1}, + {name: "Synced=True sets 0", condType: v1alpha1.CondSynced, status: metav1.ConditionTrue, wantValue: 0}, + {name: "Synced absent treated as healthy", condType: v1alpha1.CondSynced, omitCond: true, wantValue: 0}, + {name: "Ready=False sets 1", condType: v1alpha1.CondReady, status: metav1.ConditionFalse, wantValue: 1}, + {name: "Ready=True sets 0", condType: v1alpha1.CondReady, status: metav1.ConditionTrue, wantValue: 0}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + instanceName := "tikv-" + tc.name + defer AbnormalInstance.DeleteLabelValues( + "test-ns", "test-cluster", "tikv", "test-group", instanceName, tc.condType, + ) + + obj := newTiKVForMetricTest(instanceName) + var conds []metav1.Condition + if !tc.omitCond { + conds = []metav1.Condition{{Type: tc.condType, Status: tc.status}} + } + ObserveCondition(obj, conds, tc.condType) + + val, present := abnormalGaugeValue(t, instanceName, tc.condType) + require.True(t, present) + assert.InDelta(t, tc.wantValue, val, 1e-9) + assert.True(t, gaugeSeriesExists(t, instanceName, tc.condType), + "series must remain present so PromQL `for:` alerts have continuous samples") + }) + } +} + +func TestClearInstanceConditionMetrics(t *testing.T) { + name := "tikv-clear" + obj := newTiKVForMetricTest(name) + + // Pre-populate both tracked conditions. + ObserveCondition(obj, []metav1.Condition{{Type: v1alpha1.CondSynced, Status: metav1.ConditionFalse}}, v1alpha1.CondSynced) + ObserveCondition(obj, []metav1.Condition{{Type: v1alpha1.CondReady, Status: metav1.ConditionFalse}}, v1alpha1.CondReady) + require.True(t, gaugeSeriesExists(t, name, v1alpha1.CondSynced)) + require.True(t, gaugeSeriesExists(t, name, v1alpha1.CondReady)) + + ClearInstanceConditionMetrics(obj) + + assert.False(t, gaugeSeriesExists(t, name, v1alpha1.CondSynced), + "Synced series must be removed on clear") + assert.False(t, gaugeSeriesExists(t, name, v1alpha1.CondReady), + "Ready series must be removed on clear") +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 2ac0b573ca6..f1791c45cb1 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -19,6 +19,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/metrics" ) +// InstanceAbnormalMetricLabels is the canonical label order for the +// per-instance abnormal-condition gauge. Keep in sync with WithLabelValues / +// DeleteLabelValues callers. +var InstanceAbnormalMetricLabels = []string{"namespace", "cluster", "component", "group", "instance", "condition"} + var ( // ControllerPanic is a counter to record the number of panics in the controller. @@ -30,9 +35,26 @@ var ( Help: "The total number of panics in the controller", }, []string{}, ) + + // AbnormalInstance is 1 when the named condition on the instance is False + // (abnormal), 0 otherwise. The series stays present while the operator + // manages the instance and is removed only when the instance is finalized. + // + // Use `metric == 1` together with PromQL `for: ` to alert on + // instances stuck in an abnormal state, e.g. a rolling restart that cannot + // converge or a pod that is up but cannot serve. + AbnormalInstance = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "tidb_operator", + Name: "abnormal_instance", + Help: "1 when the named condition on the instance is False, 0 otherwise. " + + "Use `metric == 1` with PromQL `for: ` to alert on stuck state.", + }, InstanceAbnormalMetricLabels, + ) ) func init() { // Register custom metrics with the global prometheus registry metrics.Registry.MustRegister(ControllerPanic) + metrics.Registry.MustRegister(AbnormalInstance) }