Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pkg/controllers/common/task_finalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
utilerr "k8s.io/apimachinery/pkg/util/errors"

"github.com/pingcap/tidb-operator/v2/pkg/client"
"github.com/pingcap/tidb-operator/v2/pkg/metrics"
"github.com/pingcap/tidb-operator/v2/pkg/runtime"
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
"github.com/pingcap/tidb-operator/v2/pkg/utils/k8s"
Expand Down Expand Up @@ -226,6 +227,12 @@ func TaskInstanceFinalizerDel[
return task.Fail().With("cannot remove finalizer: %v", err)
}

// Finalize succeeded; drop per-instance metric series so deleted
// instances do not leave stale samples in Prometheus. Doing this here
// (rather than as a separate task in every component builder) ensures
// new instance controllers cannot forget the cleanup.
metrics.ClearInstanceConditionMetrics(state.Object())

return task.Complete().With("finalizer is removed")
})
}
11 changes: 11 additions & 0 deletions pkg/controllers/common/task_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
coreutil "github.com/pingcap/tidb-operator/v2/pkg/apiutil/core/v1alpha1"
"github.com/pingcap/tidb-operator/v2/pkg/client"
"github.com/pingcap/tidb-operator/v2/pkg/metrics"
"github.com/pingcap/tidb-operator/v2/pkg/runtime"
"github.com/pingcap/tidb-operator/v2/pkg/runtime/scope"
"github.com/pingcap/tidb-operator/v2/pkg/utils/podutil"
Expand Down Expand Up @@ -220,6 +221,11 @@ func TaskInstanceConditionReady[
state.SetStatusChanged()
}

// Refresh the abnormal-instance gauge so a long-running Ready=False
// (e.g. pod up but cannot serve) becomes alertable without any
// per-component glue.
metrics.ObserveCondition(instance, coreutil.StatusConditions[S](instance), v1alpha1.CondReady)

if !isReady {
return task.Wait().With("instance is unready: %s", coreutil.SprintCondition(cond))
}
Expand Down Expand Up @@ -280,6 +286,11 @@ func TaskInstanceConditionSynced[
state.SetStatusChanged()
}

// Refresh the abnormal-instance gauge so a long-running Synced=False
// (rolling restart stuck, scale-in stuck, ...) becomes alertable
// without any per-component glue.
metrics.ObserveCondition(instance, coreutil.StatusConditions[S](instance), v1alpha1.CondSynced)

if !isSynced {
return task.Wait().With("instance is unsynced: %s", coreutil.SprintCondition(cond))
}
Expand Down
78 changes: 78 additions & 0 deletions pkg/metrics/abnormal_instance.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
)

// trackedConditions are the condition types observed by ObserveInstance* and
// cleared by ClearInstanceConditionMetrics. Add new types here when extending
// the AbnormalInstance gauge to additional condition signals.
var trackedConditions = []string{v1alpha1.CondSynced, v1alpha1.CondReady}

// instanceMetricBaseLabels reads the standard well-known labels carried by
// every managed instance (LabelKeyCluster / LabelKeyComponent / LabelKeyGroup)
// plus its namespace and name. The caller appends the per-condition label.
func instanceMetricBaseLabels(obj client.Object) []string {
ls := obj.GetLabels()
return []string{
obj.GetNamespace(),
ls[v1alpha1.LabelKeyCluster],
ls[v1alpha1.LabelKeyComponent],
ls[v1alpha1.LabelKeyGroup],
obj.GetName(),
}
}

// ObserveCondition writes 1 to the abnormal-instance gauge when the named
// condition is False; 0 otherwise (True or absent are treated as healthy).
// The series stays present so PromQL `for:` alerts can fire reliably without
// gaps, and so dashboards never see missing samples for managed instances.
//
// condType must be one of trackedConditions so the finalize-time cleanup in
// ClearInstanceConditionMetrics covers the same set of series this writes.
func ObserveCondition(obj client.Object, conds []metav1.Condition, condType string) {
labels := append(instanceMetricBaseLabels(obj), condType)
value := 0.0
if cond := meta.FindStatusCondition(conds, condType); cond != nil && cond.Status == metav1.ConditionFalse {
value = 1
}
AbnormalInstance.WithLabelValues(labels...).Set(value)
}

// ClearInstanceConditionMetrics removes every tracked-condition series for
// the given instance.
//
// Called from TaskInstanceFinalizerDel after the finalizer is removed, so
// every component that uses the standard finalize task is covered without
// per-builder wiring. Component builders short-circuit the deletion path
// with task.IfBreak around CondClusterIsDeleting / CondObjectIsDeleting, so
// the normal TaskInstanceConditionSynced / TaskInstanceConditionReady tasks
// (where ObserveCondition lives) never run during finalization; without
// this explicit cleanup, the gauge series would stay present at its last
// value forever, triggering false-positive `metric == 1 for: <duration>`
// alerts on a non-existent instance and growing label cardinality across
// each cluster lifecycle.
func ClearInstanceConditionMetrics(obj client.Object) {
base := instanceMetricBaseLabels(obj)
for _, condType := range trackedConditions {
AbnormalInstance.DeleteLabelValues(append(base, condType)...)
}
}
149 changes: 149 additions & 0 deletions pkg/metrics/abnormal_instance_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
"testing"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/pingcap/tidb-operator/api/v2/core/v1alpha1"
)

func newTiKVForMetricTest(name string) *v1alpha1.TiKV {
return &v1alpha1.TiKV{
ObjectMeta: metav1.ObjectMeta{
Namespace: "test-ns",
Name: name,
Labels: map[string]string{
v1alpha1.LabelKeyCluster: "test-cluster",
v1alpha1.LabelKeyComponent: "tikv",
v1alpha1.LabelKeyGroup: "test-group",
},
},
}
}

// abnormalGaugeValue reads the AbnormalInstance gauge for the given instance
// + condition. Returns (value, present). present is false when the series has
// not been touched (collector returns the default zero metric without a Gauge
// payload set).
func abnormalGaugeValue(t *testing.T, instance, condition string) (float64, bool) {
t.Helper()
g, err := AbnormalInstance.GetMetricWithLabelValues(
"test-ns", "test-cluster", "tikv", "test-group", instance, condition,
)
require.NoError(t, err)
m := &dto.Metric{}
require.NoError(t, g.Write(m))
if m.Gauge == nil {
return 0, false
}
return m.Gauge.GetValue(), true
}

func gaugeSeriesExists(t *testing.T, instance, condition string) bool {
t.Helper()
ch := make(chan prometheus.Metric, 16)
AbnormalInstance.Collect(ch)
close(ch)
want := []string{"test-ns", "test-cluster", "tikv", "test-group", instance, condition}
for m := range ch {
dm := &dto.Metric{}
if err := m.Write(dm); err != nil {
continue
}
if labelsEqual(dm.GetLabel(), want) {
return true
}
}
return false
}

func labelsEqual(got []*dto.LabelPair, wantValues []string) bool {
if len(got) != len(InstanceAbnormalMetricLabels) {
return false
}
want := map[string]string{}
for i, k := range InstanceAbnormalMetricLabels {
want[k] = wantValues[i]
}
for _, p := range got {
if want[p.GetName()] != p.GetValue() {
return false
}
}
return true
}

func TestObserveCondition(t *testing.T) {
cases := []struct {
name string
condType string
status metav1.ConditionStatus
omitCond bool
wantValue float64
}{
{name: "Synced=False sets 1", condType: v1alpha1.CondSynced, status: metav1.ConditionFalse, wantValue: 1},
{name: "Synced=True sets 0", condType: v1alpha1.CondSynced, status: metav1.ConditionTrue, wantValue: 0},
{name: "Synced absent treated as healthy", condType: v1alpha1.CondSynced, omitCond: true, wantValue: 0},
{name: "Ready=False sets 1", condType: v1alpha1.CondReady, status: metav1.ConditionFalse, wantValue: 1},
{name: "Ready=True sets 0", condType: v1alpha1.CondReady, status: metav1.ConditionTrue, wantValue: 0},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
instanceName := "tikv-" + tc.name
defer AbnormalInstance.DeleteLabelValues(
"test-ns", "test-cluster", "tikv", "test-group", instanceName, tc.condType,
)

obj := newTiKVForMetricTest(instanceName)
var conds []metav1.Condition
if !tc.omitCond {
conds = []metav1.Condition{{Type: tc.condType, Status: tc.status}}
}
ObserveCondition(obj, conds, tc.condType)

val, present := abnormalGaugeValue(t, instanceName, tc.condType)
require.True(t, present)
assert.InDelta(t, tc.wantValue, val, 1e-9)
assert.True(t, gaugeSeriesExists(t, instanceName, tc.condType),
"series must remain present so PromQL `for:` alerts have continuous samples")
})
}
}

func TestClearInstanceConditionMetrics(t *testing.T) {
name := "tikv-clear"
obj := newTiKVForMetricTest(name)

// Pre-populate both tracked conditions.
ObserveCondition(obj, []metav1.Condition{{Type: v1alpha1.CondSynced, Status: metav1.ConditionFalse}}, v1alpha1.CondSynced)
ObserveCondition(obj, []metav1.Condition{{Type: v1alpha1.CondReady, Status: metav1.ConditionFalse}}, v1alpha1.CondReady)
require.True(t, gaugeSeriesExists(t, name, v1alpha1.CondSynced))
require.True(t, gaugeSeriesExists(t, name, v1alpha1.CondReady))

ClearInstanceConditionMetrics(obj)

assert.False(t, gaugeSeriesExists(t, name, v1alpha1.CondSynced),
"Synced series must be removed on clear")
assert.False(t, gaugeSeriesExists(t, name, v1alpha1.CondReady),
"Ready series must be removed on clear")
}
22 changes: 22 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ import (
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// InstanceAbnormalMetricLabels is the canonical label order for the
// per-instance abnormal-condition gauge. Keep in sync with WithLabelValues /
// DeleteLabelValues callers.
var InstanceAbnormalMetricLabels = []string{"namespace", "cluster", "component", "group", "instance", "condition"}

var (

// ControllerPanic is a counter to record the number of panics in the controller.
Expand All @@ -30,9 +35,26 @@ var (
Help: "The total number of panics in the controller",
}, []string{},
)

// AbnormalInstance is 1 when the named condition on the instance is False
// (abnormal), 0 otherwise. The series stays present while the operator
// manages the instance and is removed only when the instance is finalized.
//
// Use `metric == 1` together with PromQL `for: <duration>` to alert on
// instances stuck in an abnormal state, e.g. a rolling restart that cannot
// converge or a pod that is up but cannot serve.
AbnormalInstance = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "tidb_operator",
Name: "abnormal_instance",
Help: "1 when the named condition on the instance is False, 0 otherwise. " +
"Use `metric == 1` with PromQL `for: <duration>` to alert on stuck state.",
}, InstanceAbnormalMetricLabels,
)
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(ControllerPanic)
metrics.Registry.MustRegister(AbnormalInstance)
}
Loading