diff --git a/packages/activemq_otel/changelog.yml b/packages/activemq_otel/changelog.yml index cd89ac3296e..acdbb06070a 100644 --- a/packages/activemq_otel/changelog.yml +++ b/packages/activemq_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-blocked-sends-detected.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-blocked-sends-detected.json index 91a7b56c8ad..93762924b87 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-blocked-sends-detected.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-blocked-sends-detected.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when producer sends are blocked by broker memory pressure (`activemq.queue.blocked.sends` rate > 0). Any non-zero value means the broker is already out of memory headroom and is back-pressuring producers.", "name": "[ActiveMQ OTel] Blocked sends detected", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,31 @@ "alertDelay": { "active": 1 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-broker-health" + }, + { + "id": "activemq_otel-destinations" + }, + { + "id": "activemq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ blocked sends detected\n\n### What fired\nThe counter `activemq.queue.blocked.sends` registered a non-zero rate on at least one destination. Producers are being blocked while the broker waits for memory to free up.\n\n### Why it matters\nBlocked sends are the canary for broker memory exhaustion. 
When memory is saturated, producers stall instead of pushing new messages through; enqueue latency spikes and throughput drops. Left alone, the broker may stop accepting new persistent messages entirely.\n\n### Triage\n1. Identify the affected destinations and brokers from the alert context.\n2. On the Broker Health dashboard, check `activemq.memory.utilization` \u2014 it is almost certainly above 0.85.\n3. Compare enqueue vs dequeue rates (`activemq.message.enqueued` / `activemq.message.dequeued`) on the hot destinations to confirm consumer lag.\n4. Check `activemq.consumer.count` on affected queues \u2014 zero or insufficient consumers is a common root cause.\n5. Inspect JVM state (`activemq.jvm.memory.heap.used`, GC activity) \u2014 broker under JVM pressure will drag memory signals with it.\n\n### Remediation\n- Scale consumers or fix slow consumer code paths for the backed-up destinations.\n- Raise broker memory limit if the workload has legitimately grown.\n- Drain the DLQ or pressure-relief queues if they are consuming memory budget.\n\n### Tuning\n- The rule fires on *any* blocked send rate > 0, which matches the documented \"zero is the only healthy value\" stance. 
If your cluster tolerates brief bursts, require `blocked_rate > N` over a longer window.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "TS metrics-activemq.otel-*\n// activemq.queue.blocked.sends is counter_long; dimension: activemq.destination.name\n| WHERE activemq.queue.blocked.sends IS NOT NULL\n| STATS blocked_rate = SUM(RATE(activemq.queue.blocked.sends)) BY activemq.broker.name, activemq.destination.name\n// Any blocked sends indicate memory pressure — critical signal\n| WHERE blocked_rate > 0\n| SORT blocked_rate DESC\n| LIMIT 10" + "esql": "TS metrics-activemq.otel-*\n// activemq.queue.blocked.sends is counter_long; dimension: activemq.destination.name\n| WHERE activemq.queue.blocked.sends IS NOT NULL\n| STATS blocked_rate = SUM(RATE(activemq.queue.blocked.sends)) BY activemq.broker.name, activemq.destination.name\n// Any blocked sends indicate memory pressure \u2014 critical signal\n| WHERE blocked_rate > 0\n| SORT blocked_rate DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-broker-memory-utilization-high.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-broker-memory-utilization-high.json index 241ba2710f4..a11d2c36e35 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-broker-memory-utilization-high.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-broker-memory-utilization-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a broker's memory utilization sustains above 85% (`activemq.memory.utilization > 0.85`). 
At this level producers are likely to be blocked and the broker is one incident away from refusing writes.", "name": "[ActiveMQ OTel] Broker memory utilization high", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,28 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-broker-health" + }, + { + "id": "activemq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ broker memory utilization high\n\n### What fired\n`activemq.memory.utilization` averaged above 0.85 over the evaluation window.\n\n### Why it matters\nActiveMQ reserves a bounded pool of JVM heap for buffering in-flight messages. As the pool fills, the broker applies back-pressure \u2014 first by slowing producers, then by blocking sends outright. Approaching 100% puts the broker in a near-unavailable state.\n\n### Triage\n1. Overview dashboard: confirm which broker(s) are affected.\n2. Broker Health dashboard: inspect `activemq.memory.usage` vs `activemq.memory.limit` to see absolute pressure.\n3. Correlate with `activemq.queue.blocked.sends` \u2014 if > 0 the broker is already blocking producers.\n4. Check consumer counts on busy destinations; consumer starvation is the usual root cause.\n5. Review JVM heap (`activemq.jvm.memory.heap.used` vs `.max`) \u2014 a broker under GC pressure drives this metric too.\n\n### Remediation\n- Unblock consumers / add consumer capacity for the backed-up destinations.\n- Raise the broker's memory limit (`systemUsage/memoryUsage/limit`) if the workload has legitimately grown.\n- Investigate message producers bursting into topics/queues without matching consumption capacity.\n\n### Tuning\n- `> 0.85` with a 15-minute window balances responsiveness and noise. 
Tighten to `> 0.80` for latency-sensitive tiers; loosen to `> 0.90` if your workload sustains high utilization in steady state.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "TS metrics-activemq.otel-*\n// Broker memory utilization is a gauge (0–1 ratio)\n| WHERE activemq.memory.utilization IS NOT NULL\n| STATS max_util = MAX(AVG_OVER_TIME(activemq.memory.utilization)) BY activemq.broker.name, host.name\n// Alert when memory utilization exceeds 85%; producers may be blocked\n| WHERE max_util > 0.85\n| SORT max_util DESC\n| LIMIT 10" + "esql": "TS metrics-activemq.otel-*\n// Broker memory utilization is a gauge (0\u20131 ratio)\n| WHERE activemq.memory.utilization IS NOT NULL\n| STATS max_util = MAX(AVG_OVER_TIME(activemq.memory.utilization)) BY activemq.broker.name, host.name\n// Alert when memory utilization exceeds 85%; producers may be blocked\n| WHERE max_util > 0.85\n| SORT max_util DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-dlq-depth-high.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-dlq-depth-high.json index 7f78065cbc2..e0405723ff0 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-dlq-depth-high.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-dlq-depth-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the `ActiveMQ.DLQ` destination accumulates more than 100 unconsumed messages. 
Growth in the dead-letter queue means messages are failing delivery \u2014 poison payloads, serialization errors, or downstream failures.", "name": "[ActiveMQ OTel] Dead letter queue depth high", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,31 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-destinations" + }, + { + "id": "activemq_otel-overview" + }, + { + "id": "activemq_otel-broker-health" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ dead letter queue depth high\n\n### What fired\nThe gauge `activemq.message.current` for destination `ActiveMQ.DLQ` exceeded 100 in the evaluation window.\n\n### Why it matters\nMessages land in the DLQ after exhausting broker-side retries. DLQ growth is almost always an application-layer signal: a poison message format, a serializer bug, a downstream service outage, or a TTL that is too aggressive for current consumer throughput.\n\n### Triage\n1. Destinations dashboard: confirm the DLQ depth and rate of growth.\n2. Check which source destinations are feeding the DLQ \u2014 usually via broker logs or by inspecting message headers (`OriginalDestination`).\n3. Correlate with `activemq.message.expired` increases and downstream service health.\n4. 
Sample a few DLQ messages (via JMX/console/Jolokia) to inspect payload and failure reason.\n\n### Remediation\n- Fix the application or consumer that is rejecting messages.\n- Drain the DLQ after the root cause is addressed (reprocess or discard as business rules dictate).\n- If the DLQ is used as a feature for async retries, consider adding a separate queue rather than accumulating in `ActiveMQ.DLQ`.\n\n### Tuning\n- Threshold (`> 100`) and 15-minute window can be lowered for zero-tolerance environments; raise for noisy systems where small DLQ churn is expected.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-activemq.otel-*\n// activemq.message.current is gauge; destination name in activemq.destination.name\n| WHERE activemq.message.current IS NOT NULL AND activemq.destination.name == \"ActiveMQ.DLQ\"\n| STATS dlq_depth = MAX(LAST_OVER_TIME(activemq.message.current)) BY activemq.broker.name, activemq.destination.name\n// Alert when DLQ accumulates significant messages; adjust threshold for your tolerance\n| WHERE dlq_depth > 100\n| SORT dlq_depth DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-cpu.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-cpu.json index 6633e4e3702..7d8406dfcc9 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-cpu.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-cpu.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the ActiveMQ broker JVM CPU usage sustains above 85% (`activemq.jvm.cpu.process.usage > 0.85`). 
The broker is CPU-bound and will degrade message latency and throughput.", "name": "[ActiveMQ OTel] High JVM CPU utilization", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,28 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-broker-health" + }, + { + "id": "activemq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ high JVM CPU utilization\n\n### What fired\n`activemq.jvm.cpu.process.usage` averaged above 0.85 over the evaluation window.\n\n### Why it matters\nA CPU-bound broker cannot drain its destinations fast enough. Expect GC pressure to grow (compounded by high heap use), enqueue latency to spike, and queue depth to accumulate. Recovery without adding capacity is difficult once the broker is pegged.\n\n### Triage\n1. Broker Health dashboard: confirm which brokers and compare against system CPU (`activemq.jvm.cpu.system.usage`).\n2. Check GC activity (`activemq.jvm.gc.collections` / `activemq.jvm.gc.duration` rates) \u2014 frequent old-gen GCs waste CPU.\n3. Look at destination-level traffic \u2014 is one queue producing an outsized dispatch/forward rate?\n4. Inspect thread counts (`activemq.jvm.thread.count`) and file descriptor usage.\n\n### Remediation\n- Scale out brokers / use a network of brokers to distribute load.\n- Tune GC settings or raise heap size if GC is the dominant CPU consumer.\n- Optimise high-volume destinations (batching, compression, selector simplification).\n\n### Tuning\n- `> 0.85` over 15 minutes. 
Raise to 0.90 for CPU-tight clusters; lower for sensitive tiers.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "TS metrics-activemq.otel-*\n// JVM CPU is broker-level metric (ratio 0–1)\n| WHERE activemq.jvm.cpu.process.usage IS NOT NULL\n| STATS avg_cpu = MAX(AVG_OVER_TIME(activemq.jvm.cpu.process.usage)) BY activemq.broker.name, host.name\n// Alert when CPU exceeds 85%; adjust for your baseline\n| WHERE avg_cpu > 0.85\n| SORT avg_cpu DESC\n| LIMIT 10" + "esql": "TS metrics-activemq.otel-*\n// JVM CPU is broker-level metric (ratio 0\u20131)\n| WHERE activemq.jvm.cpu.process.usage IS NOT NULL\n| STATS avg_cpu = MAX(AVG_OVER_TIME(activemq.jvm.cpu.process.usage)) BY activemq.broker.name, host.name\n// Alert when CPU exceeds 85%; adjust for your baseline\n| WHERE avg_cpu > 0.85\n| SORT avg_cpu DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-heap-utilization.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-heap-utilization.json index e2861062107..066ffb9e06f 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-heap-utilization.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-high-jvm-heap-utilization.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the broker JVM heap utilization exceeds 85% (`heap.used / heap.max > 0.85`). 
Sustained high heap triggers GC thrashing and often precedes OutOfMemoryError.", "name": "[ActiveMQ OTel] High JVM heap utilization", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,28 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-broker-health" + }, + { + "id": "activemq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ JVM heap utilization high\n\n### What fired\n`activemq.jvm.memory.heap.used / activemq.jvm.memory.heap.max` peaked above 0.85 over the evaluation window.\n\n### Why it matters\nWhen the JVM cannot reclaim enough heap, GC pauses lengthen and become more frequent. Message processing stalls during major collections, destination memory fills up, and a full heap eventually crashes the broker with `OutOfMemoryError`.\n\n### Triage\n1. Broker Health dashboard: confirm heap trend vs `.committed` and `.max`.\n2. Correlate with broker memory utilization (`activemq.memory.utilization`) \u2014 these often move together.\n3. Look at GC metrics for a sustained upward step in `activemq.jvm.gc.collections` or `.duration` rates.\n4. Watch `activemq.jvm.thread.count` \u2014 thread leaks can drive heap growth.\n\n### Remediation\n- Raise `-Xmx` if the workload has legitimately grown.\n- Investigate leaks: unbounded DLQ, long-lived subscriptions, custom plugins holding references.\n- Ensure `storeCursor` / `fileCursor` is used for large queues instead of `vmCursor` which keeps everything in memory.\n\n### Tuning\n- Threshold `> 0.85` matches the documented \"warning\" band. 
Tighten to 0.80 for tight clusters.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-activemq.otel-*\n// JVM heap metrics are broker-level (no destination dimension)\n| WHERE activemq.jvm.memory.heap.used IS NOT NULL AND activemq.jvm.memory.heap.max IS NOT NULL AND activemq.jvm.memory.heap.max > 0\n| STATS heap_used = MAX(LAST_OVER_TIME(activemq.jvm.memory.heap.used)), heap_max = MAX(LAST_OVER_TIME(activemq.jvm.memory.heap.max)) BY activemq.broker.name, host.name\n| EVAL heap_util = heap_used / heap_max\n// Alert when heap utilization exceeds 85%; adjust threshold for your environment\n| WHERE heap_util > 0.85\n| SORT heap_util DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-queue-depth-high.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-queue-depth-high.json index b1b4909eff7..7910ead854d 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-queue-depth-high.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-queue-depth-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a non-advisory destination holds more than 1000 unconsumed messages. 
A deep queue indicates the consumer cohort cannot keep up with producer rate \u2014 the classic consumer-lag signal.", "name": "[ActiveMQ OTel] Queue depth high", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,28 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-destinations" + }, + { + "id": "activemq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ queue depth high\n\n### What fired\nThe gauge `activemq.message.current` on a user destination (DLQ and advisory destinations excluded) exceeded 1000 over the evaluation window.\n\n### Why it matters\nGrowing queue depth is the canonical signal that consumers are falling behind producers. Effects compound quickly: message wait times rise, broker memory usage climbs, and eventually producers are blocked or messages expire.\n\n### Triage\n1. Destinations dashboard: identify the affected destination(s) and trend.\n2. Check enqueue vs dequeue rates (`activemq.message.enqueued` rate / `activemq.message.dequeued` rate). A persistent gap confirms consumer lag.\n3. Check `activemq.consumer.count` on affected queues \u2014 zero or too few consumers is a common root cause.\n4. 
Inspect `activemq.queue.message.inflight` for slow/stuck consumers holding unacked messages.\n\n### Remediation\n- Add consumers or fix slow consumer code paths.\n- Shed load on the producer side if consumer capacity cannot grow.\n- For bursty workloads, consider topics with selectors or sharding.\n\n### Tuning\n- `> 1000` is a conservative default; tune to your expected steady state (deep queues are acceptable for batch-style workloads).\n- Lengthen the evaluation window to smooth bursty traffic.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-activemq.otel-*\n// activemq.message.current is gauge; exclude advisory/system destinations\n| WHERE activemq.message.current IS NOT NULL\n AND activemq.destination.name IS NOT NULL\n AND activemq.destination.name NOT IN (\"ActiveMQ.DLQ\", \"ActiveMQ.Advisory.MasterBroker\", \"ActiveMQ.Advisory.Queue\", \"ActiveMQ.Advisory.Topic\")\n| STATS queue_depth = MAX(LAST_OVER_TIME(activemq.message.current)) BY activemq.broker.name, activemq.destination.name\n// Alert when queue depth exceeds 1000; adjust for your expected throughput\n| WHERE queue_depth > 1000\n| SORT queue_depth DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-store-utilization-high.json b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-store-utilization-high.json index c2d35683032..f2f788a2c85 100644 --- a/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-store-utilization-high.json +++ b/packages/activemq_otel/kibana/alerting_rule_template/activemq_otel-store-utilization-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the broker persistent store utilization exceeds 90% (`activemq.store.utilization > 0.90`). 
Above this level the broker may halt accepting new persistent messages.", "name": "[ActiveMQ OTel] Persistent store utilization high", "ruleTypeId": ".es-query", "tags": [ @@ -16,13 +17,28 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "activemq_otel-broker-health" + }, + { + "id": "activemq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## ActiveMQ persistent store utilization high\n\n### What fired\n`activemq.store.utilization` averaged above 0.90 over the evaluation window.\n\n### Why it matters\nThe persistent store (KahaDB or JDBC-backed) is the durable buffer for messages that must survive broker restarts. When it nears capacity, ActiveMQ stops accepting new persistent messages \u2014 producers using `DeliveryMode.PERSISTENT` will block or fail.\n\n### Triage\n1. Broker Health dashboard: check trends on `activemq.store.utilization` and absolute `activemq.store.limit` / `activemq.disk.store_usage`.\n2. Is the issue cluster-wide (data growth) or isolated (slow consumer leaving deep queues)?\n3. Look at queue depth per destination \u2014 most often a handful of queues hold most of the persisted data.\n4. Check for paused/stopped consumers that are preventing message dequeue.\n\n### Remediation\n- Clear backlog on deep queues by restoring consumer capacity.\n- Expand the configured store size (`storeUsage/limit`).\n- Add broker storage or rotate to tiered storage backends.\n\n### Tuning\n- `> 0.90` with a 15-minute window is conservative. 
Tighten to 0.80 for deployments with long response times on provisioning new storage.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "TS metrics-activemq.otel-*\n// Store utilization is a gauge (0–1 ratio)\n| WHERE activemq.store.utilization IS NOT NULL\n| STATS max_util = MAX(AVG_OVER_TIME(activemq.store.utilization)) BY activemq.broker.name, host.name\n// Alert when persistent store exceeds 90%; broker may halt accepting new persistent messages\n| WHERE max_util > 0.90\n| SORT max_util DESC\n| LIMIT 10" + "esql": "TS metrics-activemq.otel-*\n// Store utilization is a gauge (0\u20131 ratio)\n| WHERE activemq.store.utilization IS NOT NULL\n| STATS max_util = MAX(AVG_OVER_TIME(activemq.store.utilization)) BY activemq.broker.name, host.name\n// Alert when persistent store exceeds 90%; broker may halt accepting new persistent messages\n| WHERE max_util > 0.90\n| SORT max_util DESC\n| LIMIT 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/activemq_otel/manifest.yml b/packages/activemq_otel/manifest.yml index 8de583e746e..4435a2954d2 100644 --- a/packages/activemq_otel/manifest.yml +++ b/packages/activemq_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: activemq_otel title: "ActiveMQ OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "ActiveMQ Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.3" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/airflow_otel/changelog.yml b/packages/airflow_otel/changelog.yml index 2a28fe7a967..63273b000a9 100644 --- a/packages/airflow_otel/changelog.yml +++ b/packages/airflow_otel/changelog.yml @@ -1,5 +1,10 @@ # newer versions go on top -- version: 0.1.0 +- version: "0.2.0" + changes: + - description: Add description and artifact 
fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 +- version: "0.1.0" changes: - description: initial release type: enhancement diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-file-queue-large.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-file-queue-large.json index 0515022c5b4..9d83c4bcc87 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-file-queue-large.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-file-queue-large.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the DAG file processing queue exceeds 20 (`airflow.dag_processing.file_path_queue_size > 20`), indicating the DAG processor cannot keep up with file parsing.", "name": "[Airflow OTel] DAG file processing queue large", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-dag-processing" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow DAG file processing queue large\n\n### What fired\n`airflow.dag_processing.file_path_queue_size` peaked above 20 over the evaluation window.\n\n### Why it matters\nThe DAG processor parses DAG files on a schedule. When the parse queue grows, DAG changes (new DAGs, edits) take longer to appear in the scheduler, and runs begin to lag relative to their schedule. Large backlogs often correlate with slow filesystem operations or heavy DAG files.\n\n### Triage\n1. DAG Processing dashboard: check queue trend and processing rate.\n2. Inspect `airflow.dag_processing.processor_timeouts` and `airflow.dag_processing.import_errors` for correlated issues.\n3. Look at DAG count \u2014 is this just a consequence of growth?\n4. 
Check filesystem I/O on DAG storage (NFS mounts are notorious for this).\n\n### Remediation\n- Increase `dag_dir_list_interval` and `parsing_processes` in `[scheduler]` config.\n- Move DAGs to faster storage (local disk, efficient remote file systems).\n- Reduce top-level Python work in DAG files (defer imports, avoid expensive calls during parse).\n\n### Tuning\n- Raise the threshold if you have a large DAG inventory; lower it for latency-sensitive DAG updates.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.dag_processing.file_path_queue_size IS NOT NULL\n// Gauge: DAG files pending processing; growing queue = processor falling behind\n| STATS max_queue = MAX(airflow.dag_processing.file_path_queue_size)\n// Adjust threshold (20) for your DAG count\n| WHERE max_queue > 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-import-errors.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-import-errors.json index db25c1eb620..36645f8aa92 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-import-errors.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-import-errors.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any DAG files fail to import (`airflow.dag_processing.import_errors > 0`). 
Import errors prevent the affected DAGs from running.", "name": "[Airflow OTel] DAG import errors present", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-dag-processing" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow DAG import errors present\n\n### What fired\n`airflow.dag_processing.import_errors` is greater than zero.\n\n### Why it matters\nA DAG that fails to import is invisible to the scheduler \u2014 its runs will not be created, and downstream systems that expect those runs will silently miss them.\n\n### Triage\n1. Check the Airflow webserver \"Import errors\" page for the traceback.\n2. Correlate with recent deploys or config pushes to the DAG folder.\n3. Look at `airflow.dag_processing.total_parse_time` and processor logs for environmental issues.\n\n### Remediation\n- Fix or revert the offending DAG file.\n- If an external dependency is missing in the scheduler environment, install it and restart the scheduler.\n\n### Tuning\n- This rule fires on any non-zero value. 
Do not raise the threshold \u2014 every import error should be triaged.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.dag_processing.import_errors IS NOT NULL\n// Use max gauge value in window; import_errors is a gauge\n| STATS max_errors = MAX(airflow.dag_processing.import_errors)\n| WHERE max_errors > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-processor-timeouts.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-processor-timeouts.json index 50a695137dd..dde40e4a9e3 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-processor-timeouts.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-dag-processor-timeouts.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the DAG processor records any timeout events (`airflow.dag_processing.processor_timeouts` counter increases). Timeouts mean specific DAG files are too slow to parse.", "name": "[Airflow OTel] DAG processor timeouts", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-dag-processing" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow DAG processor timeouts\n\n### What fired\nThe counter `airflow.dag_processing.processor_timeouts` increased during the evaluation window.\n\n### Why it matters\nWhen DAG parsing exceeds `dag_file_processor_timeout`, the processor aborts and retries, delaying visibility of the affected DAGs and consuming scheduler capacity on repeated retries.\n\n### Triage\n1. 
DAG Processing dashboard: confirm processor load (queue size, parse time trends).\n2. Identify slow DAG files via scheduler logs (`dag_processor_manager.log`).\n3. Check for expensive top-level code: blocking API calls, large list comprehensions, heavy imports.\n\n### Remediation\n- Refactor slow DAGs to defer work into tasks rather than top-level code.\n- Increase `dag_file_processor_timeout` as a short-term mitigation.\n- Consider sharding DAG files across multiple folders and processors.\n\n### Tuning\n- Fires on any timeout. For high-churn environments you can require `timeout_count > N`.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-airflow.otel-default*\n| WHERE airflow.dag_processing.processor_timeouts IS NOT NULL\n// Counter: detect new processor timeouts in the window\n| STATS timeout_count = SUM(INCREASE(airflow.dag_processing.processor_timeouts))\n| WHERE timeout_count > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-executor-slots-exhausted.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-executor-slots-exhausted.json index e9a01aa9c73..39111da3946 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-executor-slots-exhausted.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-executor-slots-exhausted.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the executor has zero open slots (`airflow.executor.open_slots <= 0`) at any point in the window, meaning tasks cannot be dispatched and the scheduler is back-pressured.", "name": "[Airflow OTel] Executor open slots exhausted", "ruleTypeId": ".es-query", "tags": [ @@ -21,13 +22,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + 
"dashboards": [ + { + "id": "airflow_otel-scheduler-capacity" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow executor open slots exhausted\n\n### What fired\n`airflow.executor.open_slots` dropped to zero or below at least once during the evaluation window.\n\n### Why it matters\nWith no free executor slots, the scheduler cannot dispatch new task instances. Runs accumulate in the queue, schedule delay grows, and SLA miss rates climb. This is a capacity signal \u2014 tasks are running, but the pool is fully committed.\n\n### Triage\n1. Scheduler & Capacity dashboard: confirm executor slot trend against `airflow.executor.running_tasks`.\n2. Check for stuck / long-running tasks that hold slots. Inspect DAGs with unusually long task durations.\n3. Look at pools (`airflow.pool.*`) for additional bottlenecks.\n4. Cross-check `airflow.scheduler.tasks.starving` \u2014 related but distinct signal.\n\n### Remediation\n- Increase `parallelism` and/or the executor worker count.\n- Kill or re-queue hung tasks.\n- Split workloads across pools so high-priority DAGs are not blocked by bulk jobs.\n\n### Tuning\n- The rule fires if slots hit zero at any sample. If your workload routinely runs at full capacity without issue, consider requiring a sustained condition (e.g. 
`min_slots <= 0` over a longer window).\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.executor.open_slots IS NOT NULL\n// Gauge: use min to detect any point where slots hit zero\n| STATS min_slots = MIN(airflow.executor.open_slots)\n| WHERE min_slots <= 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-heartbeat-failures.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-heartbeat-failures.json index e13d70fcd95..a61c9776b7a 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-heartbeat-failures.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-heartbeat-failures.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any scheduler/worker jobs record heartbeat failures (`airflow.job_heartbeat_failure > 0`). Heartbeat failures indicate workers are dying unexpectedly or the metadata DB is unreachable.", "name": "[Airflow OTel] Scheduler job heartbeat failures", "ruleTypeId": ".es-query", "tags": [ @@ -21,13 +22,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-scheduler-capacity" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow job heartbeat failures\n\n### What fired\nThe counter `airflow.job_heartbeat_failure` increased during the evaluation window.\n\n### Why it matters\nAirflow uses heartbeats to detect live schedulers, workers, and DAG processors. Heartbeat failures signal that jobs are crashing, the metadata database is unreachable, or the host is unhealthy. 
Failed heartbeats lead to task zombies and missed runs.\n\n### Triage\n1. Scheduler & Capacity dashboard: confirm which component's heartbeats are failing.\n2. Check metadata DB health \u2014 heartbeats are DB writes.\n3. Look at host-level resources: CPU, memory, network on scheduler/worker nodes.\n4. Review scheduler logs around the failure timestamps.\n\n### Remediation\n- Stabilise the metadata DB (scale up, reduce contention).\n- Address host OOM / eviction issues on affected nodes.\n- Ensure clock skew is small; large skew triggers spurious heartbeat failures.\n\n### Tuning\n- Fires on any increase, intentionally. Heartbeat failures are always worth triaging.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.job_heartbeat_failure IS NOT NULL\n// Counter: sum heartbeat failures in the window\n| STATS failure_count = SUM(airflow.job_heartbeat_failure)\n| WHERE failure_count > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-schedule-delay-high.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-schedule-delay-high.json index 40e0730fb0a..1e1ff48fd55 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-schedule-delay-high.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-schedule-delay-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when DAG runs are scheduled more than 5 minutes after their `data_interval_end` (`airflow.dagrun.schedule_delay > 300s`). 
Schedule delay is the freshness SLI for Airflow.", "name": "[Airflow OTel] DAG run schedule delay high", "ruleTypeId": ".es-query", "tags": [ @@ -21,13 +22,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-overview" + }, + { + "id": "airflow_otel-scheduler-capacity" + } + ], + "investigation_guide": { + "blob": "## Airflow DAG run schedule delay high\n\n### What fired\n`airflow.dagrun.schedule_delay` peaked above 300 seconds (5 minutes) during the evaluation window.\n\n### Why it matters\nSchedule delay is the time between a DAG run's data interval end and when the scheduler actually starts the run. High delay directly translates to stale data for downstream consumers. It is the primary freshness indicator for scheduled workloads.\n\n### Triage\n1. Overview dashboard: confirm affected DAGs and trend.\n2. Check scheduler loop duration (`airflow.scheduler.scheduler_loop_duration`) and executor slot availability.\n3. Look at DAG processor queue \u2014 if parsing is slow, scheduling is slow too.\n4. Correlate with metadata DB latency.\n\n### Remediation\n- Scale scheduler and executor capacity.\n- Reduce DAG parsing cost (see DAG-processor runbooks).\n- Split DAGs across multiple schedulers if running in HA mode.\n\n### Tuning\n- 300s is a common default. 
Tighten for SLA-critical tiers; relax for batch/nightly workloads.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.dagrun.schedule_delay IS NOT NULL\n// Timer values are in SECONDS (OTel statsd receiver convention)\n// Threshold 300 = 5 minutes delay; adjust for your SLA\n| STATS max_delay_sec = MAX(airflow.dagrun.schedule_delay)\n| WHERE max_delay_sec > 300" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-heartbeat-low.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-heartbeat-low.json index 2bd1cb6ea96..f979e8362d4 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-heartbeat-low.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-heartbeat-low.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the scheduler heartbeat rate falls far below expected (fewer than 5 heartbeat events in 15 minutes despite active telemetry). 
Low heartbeats suggest a stalled or dying scheduler.", "name": "[Airflow OTel] Scheduler heartbeats critically low", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-scheduler-capacity" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow scheduler heartbeats critically low\n\n### What fired\nFewer than 5 `airflow.scheduler_heartbeat` events were observed in the evaluation window while other Airflow telemetry was still flowing.\n\n### Why it matters\nThe scheduler emits a heartbeat every few seconds during normal operation. A steep drop indicates the scheduler is stalled (long-running DAG file parse, DB lock, blocked network call). No heartbeats means no new task instances are being dispatched.\n\n### Triage\n1. Scheduler & Capacity dashboard: verify heartbeat rate against `airflow.scheduler.scheduler_loop_duration`.\n2. Inspect DAG processor health and metadata DB latency.\n3. Check for GIL contention / blocked threads in scheduler logs.\n4. Verify the scheduler process is actually running (OS-level) on the host.\n\n### Remediation\n- Restart the scheduler once root cause is understood.\n- Shed load off the metadata DB.\n- Migrate to HA scheduler configuration if single-point-of-failure is unacceptable.\n\n### Tuning\n- The `hb_count < 5 AND total_docs > 10` condition avoids false positives when Airflow telemetry itself is paused. 
Adjust both numbers to your scrape cadence.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n// Count scheduler heartbeats in the rule's time window\n| STATS hb_count = COUNT(airflow.scheduler_heartbeat), total_docs = COUNT(*)\n// Alert when we have Airflow activity but fewer than 5 heartbeats\n// Adjust threshold (5) for your scheduler heartbeat cadence\n| WHERE total_docs > 10 AND hb_count < 5" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-loop-slow.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-loop-slow.json index 09a65971473..32c04c5f950 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-loop-slow.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-scheduler-loop-slow.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the scheduler main-loop duration exceeds 60 seconds (`airflow.scheduler.scheduler_loop_duration > 60`). A slow loop starves task dispatch and inflates schedule delay.", "name": "[Airflow OTel] Scheduler loop duration excessive", "ruleTypeId": ".es-query", "tags": [ @@ -21,13 +22,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-scheduler-capacity" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow scheduler loop duration excessive\n\n### What fired\n`airflow.scheduler.scheduler_loop_duration` peaked above 60 seconds during the evaluation window.\n\n### Why it matters\nThe scheduler main loop evaluates DAGs and dispatches tasks. 
A slow loop means fewer dispatches per unit time, queues grow, and SLAs slip. It is usually caused by DB latency, many DAGs, or a slow DAG processor.\n\n### Triage\n1. Scheduler & Capacity dashboard: correlate loop duration with schedule delay and executor slot availability.\n2. Check DB latency and connection pool saturation on the metadata store.\n3. Look for high DAG counts or recent DAG inventory growth.\n4. Inspect scheduler logs for slow queries or long-running hooks.\n\n### Remediation\n- Scale the metadata DB or add read replicas.\n- Increase `min_file_process_interval` so the processor re-parses DAGs less often.\n- Split the workload across multiple schedulers (HA mode).\n\n### Tuning\n- 60s is a conservative default. For latency-sensitive tiers tighten to 30s; for heavy deployments raise to 90-120s.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.scheduler.scheduler_loop_duration IS NOT NULL\n// Timer values are in SECONDS (OTel statsd receiver convention)\n// Threshold 60 = 60 seconds; adjust for your environment\n| STATS max_loop_sec = MAX(airflow.scheduler.scheduler_loop_duration)\n| WHERE max_loop_sec > 60" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-starving-tasks.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-starving-tasks.json index d60053a6dca..17d291cba91 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-starving-tasks.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-starving-tasks.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when more than 5 tasks are starving due to pool slot limits 
(`airflow.scheduler.tasks.starving > 5`). Starvation means eligible tasks cannot run because their pool has no free slots.", "name": "[Airflow OTel] Tasks starving due to pool limits", "ruleTypeId": ".es-query", "tags": [ @@ -21,13 +22,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-tasks" + }, + { + "id": "airflow_otel-scheduler-capacity" + } + ], + "investigation_guide": { + "blob": "## Airflow tasks starving due to pool limits\n\n### What fired\n`airflow.scheduler.tasks.starving` peaked above 5 during the evaluation window.\n\n### Why it matters\nStarving tasks have satisfied all DAG dependencies and are ready to run, but their assigned pool has no open slots. Sustained starvation causes SLA misses on tasks that never actually executed.\n\n### Triage\n1. Tasks dashboard: find which pools are saturated.\n2. Review pool configuration (size, priority_weight distribution).\n3. Look for long-running tasks that dominate a pool.\n\n### Remediation\n- Increase pool size or re-assign tasks across pools.\n- Add priority weighting so high-SLA tasks are dispatched first.\n- Split critical workloads into dedicated pools.\n\n### Tuning\n- Threshold of 5 is reasonable for mid-sized deployments. 
Raise or lower to match DAG inventory.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.scheduler.tasks.starving IS NOT NULL\n// Gauge: tasks that cannot run due to pool slot limits\n| STATS max_starving = MAX(airflow.scheduler.tasks.starving)\n// Adjust threshold (5) based on acceptable starvation\n| WHERE max_starving > 5" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-task-failures-high.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-task-failures-high.json index 13d5ea60f3d..a3d819bec0f 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-task-failures-high.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-task-failures-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when task instance failures exceed 10 in the evaluation window (`airflow.ti_failures > 10`). Elevated task failures usually reflect application bugs or unstable dependencies.", "name": "[Airflow OTel] Task failures above threshold", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-tasks" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow task failures above threshold\n\n### What fired\nSummed `airflow.ti_failures` over the evaluation window exceeded 10.\n\n### Why it matters\nA spike in task failures degrades DAG completion rates and burns through retry budgets, amplifying load on downstream systems. Failures are usually concentrated on a handful of DAGs or operators.\n\n### Triage\n1. 
Tasks dashboard: identify which DAGs/tasks are failing most.\n2. Sample a failing task log for the root cause (application error, dependency timeout, OOM).\n3. Correlate with infrastructure events: deployments, DB issues, third-party outages.\n\n### Remediation\n- Fix the failing task or its upstream dependency.\n- If a dependency is flaky, add retries with exponential backoff \u2014 but alert if retry counts are the dominant driver of load.\n\n### Tuning\n- 10 failures / 15 min is mid-range. Lower for SLA-critical DAGs; raise for large noisy pipelines.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.ti_failures IS NOT NULL\n// Sum task instance failures in the time window\n| STATS failure_count = SUM(airflow.ti_failures)\n// Adjust threshold (10) for your expected failure volume\n| WHERE failure_count > 10" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-tasks-killed-externally.json b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-tasks-killed-externally.json index cfcb28d4a42..22ddebd126a 100644 --- a/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-tasks-killed-externally.json +++ b/packages/airflow_otel/kibana/alerting_rule_template/airflow_otel-tasks-killed-externally.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the scheduler observes any tasks killed externally (`airflow.scheduler.tasks.killed_externally > 0`). 
External kills usually come from OOM killers, pod evictions, or SIGKILL from autoscalers.", "name": "[Airflow OTel] Tasks killed externally", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "airflow_otel-tasks" + }, + { + "id": "airflow_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Airflow tasks killed externally\n\n### What fired\nThe counter `airflow.scheduler.tasks.killed_externally` increased during the evaluation window.\n\n### Why it matters\nAirflow marks a task as externally killed when its state transitions unexpectedly (not by the task itself or the scheduler). The usual cause is infrastructure: the Linux OOM killer, Kubernetes pod evictions, spot/preemptible VM reclamation, or container runtime kills.\n\n### Triage\n1. Tasks dashboard: identify which tasks and when they were killed.\n2. Check host/pod memory and CPU metrics at the kill time.\n3. Review cluster autoscaler / spot reclamation events.\n4. Look at task logs for sudden truncation.\n\n### Remediation\n- Raise worker resource requests / limits.\n- Move sensitive tasks off preemptible infrastructure.\n- Add checkpointing to long-running tasks so they can resume after a kill.\n\n### Tuning\n- Fires on any non-zero value. 
Raise the threshold only if occasional kills are part of steady-state operation.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-airflow.otel-default*\n| WHERE data_stream.dataset == \"airflow.otel\"\n| WHERE airflow.scheduler.tasks.killed_externally IS NOT NULL\n// Counter: tasks killed by OOM, node eviction, etc.\n| STATS killed_count = SUM(airflow.scheduler.tasks.killed_externally)\n| WHERE killed_count > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/airflow_otel/manifest.yml b/packages/airflow_otel/manifest.yml index d6da54a48e1..3d256812170 100644 --- a/packages/airflow_otel/manifest.yml +++ b/packages/airflow_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: airflow_otel title: Airflow OpenTelemetry -version: "0.1.0" +version: "0.2.0" description: Airflow OpenTelemetry Integration. type: content source: @@ -12,7 +12,7 @@ categories: - process_manager conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: basic discovery: diff --git a/packages/apache_otel/changelog.yml b/packages/apache_otel/changelog.yml index 33c26436a18..dfebf4e6458 100644 --- a/packages/apache_otel/changelog.yml +++ b/packages/apache_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.4.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.3.0" changes: - description: Replace dashboard with ES|QL-based overview dashboard, add alert rule templates, SLO template, and updated documentation diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-availability-server-restart.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-availability-server-restart.json new file mode 100644 index 00000000000..6ec1059ca14 --- /dev/null +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-availability-server-restart.json @@ -0,0 +1,43 @@ +{ + "id": "apache_otel-availability-server-restart", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when Apache's uptime counter resets, indicating the server has restarted within the evaluation window. Unplanned restarts should be correlated with crashes, OOM, or operator-driven maintenance.", + "name": "[Apache OTEL] Server Restart Detected", + "ruleTypeId": ".es-query", + "tags": [ + "Apache", + "OTEL", + "Availability" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 1 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Apache server restart detected\n\n### What fired\n`apache.uptime` dropped to a low value within the evaluation window while the observed delta indicates the counter rolled \u2014 Apache was restarted.\n\n### Why it matters\nA restart means every in-flight request was terminated and all worker state was reset. Unplanned restarts usually reflect crashes (segfault, out-of-memory, module bugs) or forcible kills from monitoring/orchestration. Even planned restarts should be noted so they can be correlated with other signals.\n\n### Triage\n1. Overview dashboard: confirm whether one server or the fleet restarted.\n2. 
Inspect system logs (`journalctl`, `/var/log/messages`) for OOM killer activity and service restart events.\n3. Look at `apache.load.1`/`apache.load.5` leading up to the restart \u2014 elevated load often precedes a crash.\n4. Check whether the restart correlates with a recent deploy or config push.\n\n### Remediation\n- If restart was operator-driven, document and close.\n- If crash-driven, capture core dumps, inspect Apache error logs (`error_log`), and file bug if MPM/module-related.\n- For resource exhaustion, tune `MaxRequestWorkers` / `MaxConnectionsPerChild` and scale the host.\n\n### Tuning\n- `min_uptime < 300 AND uptime_delta > 300` balances detection speed vs false positives. Tighten the `min_uptime` if instant detection is required.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "// Alert triggers when Apache server uptime resets, indicating a restart.\n// A restart is detected when the current uptime is significantly lower than the previous maximum,\n// which suggests the server was restarted.\n// You can adjust the threshold value by modifying the uptime_seconds in the WHERE clause.\nFROM metrics-apachereceiver.otel-*\n| STATS max_uptime = MAX(metrics.apache.uptime), min_uptime = MIN(metrics.apache.uptime) BY resource.attributes.host.name\n| WHERE max_uptime IS NOT NULL AND min_uptime IS NOT NULL\n| EVAL uptime_delta = max_uptime - min_uptime\n| WHERE min_uptime < 300 AND uptime_delta > 300\n| KEEP resource.attributes.host.name, min_uptime, max_uptime, uptime_delta\n| SORT min_uptime ASC" + }, + "groupBy": "row", + "timeField": "@timestamp" + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-capacity-low-idle-workers.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-capacity-low-idle-workers.json new file mode 100644 index 
00000000000..b764def299d --- /dev/null +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-capacity-low-idle-workers.json @@ -0,0 +1,43 @@ +{ + "id": "apache_otel-capacity-low-idle-workers", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when average idle worker count falls below 5. A low idle pool means the server has no headroom to absorb traffic bursts and is close to worker-pool exhaustion.", + "name": "[Apache OTEL] Low Idle Workers", + "ruleTypeId": ".es-query", + "tags": [ + "Apache", + "OTEL", + "Capacity" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Apache low idle workers\n\n### What fired\n`apache.workers` with `attributes.state == \"idle\"` averaged below 5 during the evaluation window.\n\n### Why it matters\nApache handles new connections by dispatching them to idle workers. When the idle count dips, any traffic burst leads to queuing or outright connection refusal. This is a leading indicator of worker-pool exhaustion.\n\n### Triage\n1. Overview dashboard: compare idle vs busy workers and observe the trend.\n2. Check `apache.requests` rate \u2014 is traffic spiking, or is each request holding a worker longer than usual?\n3. Look at `apache.request.time` rate divided by `apache.requests` rate for average latency changes.\n4. Inspect upstream/backend health if Apache is acting as a reverse proxy.\n\n### Remediation\n- Increase `MaxRequestWorkers` / `ServerLimit` if host capacity allows.\n- Fix the slow backend that is pinning workers to requests.\n- Scale horizontally by adding Apache instances behind the load balancer.\n\n### Tuning\n- The threshold of 5 idle workers is small-scale; tune it proportionally to your `MaxRequestWorkers` setting (e.g. 
< 10% of max).\n" + } + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "// Alert triggers when the number of idle workers falls below the threshold.\n// Low idle workers indicate the server is approaching capacity and may not handle new requests efficiently.\n// The alert is grouped by host name.\n// You can adjust the threshold value by modifying the idle_workers in the WHERE clause (default: < 5).\nFROM metrics-apachereceiver.otel-*\n| WHERE attributes.state == \"idle\"\n| STATS idle_workers = AVG(metrics.apache.workers) BY resource.attributes.host.name\n| WHERE idle_workers IS NOT NULL AND idle_workers < 5\n| EVAL idle_workers = ROUND(idle_workers, 2)\n| KEEP resource.attributes.host.name, idle_workers\n| SORT idle_workers ASC" + }, + "groupBy": "row", + "timeField": "@timestamp" + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-high-server-load.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-high-server-load.json index c7b255b13b5..83024b5041c 100644 --- a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-high-server-load.json +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-high-server-load.json @@ -1,41 +1,52 @@ { - "id": "apache_otel-high-server-load", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Apache OTel] High server load", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "apache" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-apachereceiver.otel-*\n| WHERE `apache.load.1` IS NOT NULL\n| STATS load_1m = MAX(`apache.load.1`)\n BY 
resource.attributes.apache.server.name\n| WHERE load_1m > 5.0\n| SORT load_1m DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.apache.server.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "apache_otel-high-server-load", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the 1-minute system load average on an Apache host exceeds 5 (`apache.load.1 > 5`). Load spikes above CPU core count imply request handling is CPU-bound or queued.", + "name": "[Apache OTel] High server load", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "apache" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" } + ], + "investigation_guide": { + "blob": "## Apache high server load\n\n### What fired\n`apache.load.1` peaked above 5 during the evaluation window.\n\n### Why it matters\nThe 1-minute load average approximates how many processes are runnable or in uninterruptible sleep. Values consistently higher than the number of CPU cores mean requests are queuing for CPU. Combined with rising latency, this is a classic saturation pattern.\n\n### Triage\n1. Overview dashboard: confirm the server(s) involved and correlate with `apache.load.5` / `apache.load.15` for trend.\n2. Check `apache.cpu.load` and `apache.cpu.time` (by mode) \u2014 is it user CPU (application) or system CPU (syscalls/IO)?\n3. Inspect worker utilization: a pool near exhaustion aggravates load.\n4. 
Host-level: disk I/O and network metrics often explain elevated load when CPU alone looks healthy.\n\n### Remediation\n- Scale Apache hosts or add cores.\n- Optimise CPU-heavy handlers (cgi-bin, mod_php, mod_ssl work).\n- If load is from I/O wait, look at disk/network rather than Apache tuning.\n\n### Tuning\n- Threshold `> 5` is a generic safety threshold. Tune to roughly the number of CPU cores on the host.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-apachereceiver.otel-*\n| WHERE `apache.load.1` IS NOT NULL\n| STATS load_1m = MAX(`apache.load.1`)\n BY resource.attributes.apache.server.name\n| WHERE load_1m > 5.0\n| SORT load_1m DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.apache.server.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-no-requests-received.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-no-requests-received.json index 7db5b373e77..d981c98c837 100644 --- a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-no-requests-received.json +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-no-requests-received.json @@ -1,41 +1,52 @@ { - "id": "apache_otel-no-requests-received", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Apache OTel] No requests received", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "apache" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-apachereceiver.otel-*\n| WHERE apache.requests IS NOT NULL\n| STATS request_rate = 
SUM(RATE(apache.requests))\n BY resource.attributes.apache.server.name\n| WHERE request_rate == 0.0 OR request_rate IS NULL\n| SORT resource.attributes.apache.server.name ASC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.apache.server.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "apache_otel-no-requests-received", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the request rate drops to zero (`RATE(apache.requests) == 0`). Unexpected zero traffic usually points to upstream routing / DNS / load-balancer problems.", + "name": "[Apache OTel] No requests received", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "apache" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" } + ], + "investigation_guide": { + "blob": "## Apache no requests received\n\n### What fired\n`RATE(apache.requests)` was 0 (or null) across the evaluation window.\n\n### Why it matters\nA server that is running but serving zero requests is effectively unavailable. The cause is rarely Apache itself \u2014 usually an upstream routing, DNS, or load balancer problem is starving the server of traffic.\n\n### Triage\n1. Confirm whether *this* server is unique or the whole fleet is affected (Overview dashboard).\n2. Check the load balancer health-check status and target registration.\n3. Verify DNS resolution for the virtual host.\n4. Inspect external synthetic probes / RUM for a broader outage signal.\n5. 
Look at `apache.uptime` to rule out a recent restart during which no traffic has arrived yet.\n\n### Remediation\n- Fix the upstream routing/DNS/load-balancer issue.\n- If Apache itself is misconfigured and rejecting traffic, inspect `error_log` for vhost / cert failures.\n\n### Tuning\n- Rule fires on any zero-rate sample. If you have maintenance windows, suppress it via tags or a separate maintenance rule.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-apachereceiver.otel-*\n| WHERE apache.requests IS NOT NULL\n| STATS request_rate = SUM(RATE(apache.requests))\n BY resource.attributes.apache.server.name\n| WHERE request_rate == 0.0 OR request_rate IS NULL\n| SORT resource.attributes.apache.server.name ASC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.apache.server.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-performance-high-server-load.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-performance-high-server-load.json new file mode 100644 index 00000000000..36272edd3a5 --- /dev/null +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-performance-high-server-load.json @@ -0,0 +1,43 @@ +{ + "id": "apache_otel-performance-high-server-load", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when the 1-minute load average exceeds 0.8 (interpreted as a normalized-per-core value) \u2014 a sign that the host is saturated and request performance is degrading.", + "name": "[Apache OTEL] High Server Load", + "ruleTypeId": ".es-query", + "tags": [ + "Apache", + "OTEL", + "Performance" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + 
"dashboards": [ + { + "id": "apache_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Apache high server load (performance)\n\n### What fired\n`metrics.apache.load.1` averaged above 0.8 during the evaluation window.\n\n### Why it matters\nThis variant is interpreted as normalized load. Values above 0.8 indicate the host is close to saturation; above 1.0 means runnable processes are queuing for CPU. Either way, request processing time is likely degrading.\n\n### Triage\n1. Overview dashboard: confirm trend and compare against `apache.load.5`/`.15`.\n2. Check worker utilization \u2014 busy/idle ratio.\n3. Review `apache.cpu.time` (user vs system) and `apache.request.time` rate.\n4. Cross-check with host-level metrics to rule out non-Apache processes driving the load.\n\n### Remediation\n- Scale horizontally, optimise handlers, or reduce non-Apache workload on the host.\n\n### Tuning\n- `> 0.8` is a generic normalized threshold. If your deployment expects a higher steady-state load, raise this value.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "// Alert triggers when the Apache server load (1-minute average) exceeds the threshold.\n// High server load indicates the server is under heavy demand and may experience performance degradation.\n// The alert is grouped by host name.\n// You can adjust the threshold value by modifying the server_load in the WHERE clause (default: > 0.8 = 80%).\nFROM metrics-apachereceiver.otel-*\n| STATS server_load = AVG(metrics.apache.load.1) BY resource.attributes.host.name\n| WHERE server_load IS NOT NULL AND server_load > 0.8\n| EVAL server_load = ROUND(server_load, 3)\n| KEEP resource.attributes.host.name, server_load\n| SORT server_load DESC" + }, + "groupBy": "row", + "timeField": "@timestamp" + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git 
a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-saturation-high-busy-workers.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-saturation-high-busy-workers.json new file mode 100644 index 00000000000..c056040a226 --- /dev/null +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-saturation-high-busy-workers.json @@ -0,0 +1,43 @@ +{ + "id": "apache_otel-saturation-high-busy-workers", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when the busy-to-total worker ratio exceeds 0.85. The worker pool is close to exhaustion \u2014 new connections will be queued or refused shortly.", + "name": "[Apache OTEL] High Busy Workers Ratio", + "ruleTypeId": ".es-query", + "tags": [ + "Apache", + "OTEL", + "Saturation" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Apache high busy workers ratio\n\n### What fired\n`busy_workers / (busy_workers + idle_workers)` averaged above 0.85 during the evaluation window.\n\n### Why it matters\nApache handles concurrent connections with a fixed worker pool (`MaxRequestWorkers`). Once utilisation exceeds ~85% the remaining idle pool is too small to absorb short bursts, and once the pool reaches 100% new connections are either queued or rejected depending on MPM configuration.\n\n### Triage\n1. Overview dashboard: confirm per-host utilisation and trend.\n2. Check `apache.request.time` rate / `apache.requests` rate for latency increase \u2014 slow backends extend worker hold time and drive this metric.\n3. Inspect `apache.connections.async` (event MPM) \u2014 keep-alive accumulation is a common driver.\n4. 
Rule out traffic spikes (compare `apache.requests` rate to baseline).\n\n### Remediation\n- Increase `MaxRequestWorkers` (raising `ServerLimit` if needed).\n- Reduce per-request worker hold time by fixing slow backends or enabling mod_proxy / buffering.\n- Scale Apache horizontally.\n\n### Tuning\n- Threshold `> 0.85` is documented as the warning band. Use 0.80 for conservative tiers, 0.90 for aggressive.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "// Alert triggers when the ratio of busy workers to total workers exceeds the threshold.\n// High busy worker ratio indicates the server is near capacity and may start rejecting connections.\n// The alert is grouped by host name.\n// You can adjust the threshold value by modifying the busy_ratio in the WHERE clause (default: > 0.85 = 85%).\nFROM metrics-apachereceiver.otel-*\n| STATS busy_workers = AVG(metrics.apache.workers) WHERE attributes.state == \"busy\",\n idle_workers = AVG(metrics.apache.workers) WHERE attributes.state == \"idle\"\n BY resource.attributes.host.name\n| WHERE busy_workers IS NOT NULL AND idle_workers IS NOT NULL\n| EVAL total_workers = busy_workers + idle_workers\n| WHERE total_workers > 0\n| EVAL busy_ratio = busy_workers / total_workers\n| WHERE busy_ratio > 0.85\n| EVAL busy_ratio = ROUND(busy_ratio, 3)\n| KEEP resource.attributes.host.name, busy_workers, idle_workers, total_workers, busy_ratio\n| SORT busy_ratio DESC" + }, + "groupBy": "row", + "timeField": "@timestamp" + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-throughput-high-request-rate.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-throughput-high-request-rate.json new file mode 100644 index 00000000000..5035398fb3d --- /dev/null +++ 
b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-throughput-high-request-rate.json @@ -0,0 +1,43 @@ +{ + "id": "apache_otel-throughput-high-request-rate", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when the request rate exceeds 1000 requests/minute on a host. High rates can be legitimate traffic, but should be corroborated against capacity and security signals.", + "name": "[Apache OTEL] High Request Rate", + "ruleTypeId": ".es-query", + "tags": [ + "Apache", + "OTEL", + "Throughput" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Apache high request rate\n\n### What fired\nComputed request rate (delta of `apache.requests` divided by elapsed seconds) exceeded 1000 requests/minute.\n\n### Why it matters\nA sudden traffic increase can be legitimate (for example, accompanying a marketing event) or can indicate a DoS-style overload. Either way, the server's remaining capacity headroom must be verified against worker pool utilisation and load averages.\n\n### Triage\n1. Overview dashboard: correlate request rate with busy-worker ratio and load averages.\n2. Inspect `apache.traffic` rate \u2014 abnormal request-to-byte ratio may indicate an attack.\n3. Look for spikes in error-adjacent signals (async connection accumulation, `no-requests-received` preceding the spike, etc.).\n4. Use upstream CDN/WAF logs if available to identify source IPs / patterns.\n\n### Remediation\n- If legitimate, scale horizontally or raise `MaxRequestWorkers`.\n- If attack, engage WAF / rate-limit at the edge.\n\n### Tuning\n- 1000 req/min is a reasonable small-to-mid tier threshold.
Adjust to your baseline \u2014 compare against the 95th-percentile traffic over the past 7 days.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "// Alert triggers when the request rate exceeds the threshold.\n// High request rate may indicate unexpected traffic spikes or potential DDoS attacks.\n// Uses delta calculation for counter metric (requests is cumulative).\n// The alert is grouped by host name.\n// You can adjust the threshold value by modifying the request_rate in the WHERE clause (default: > 1000 requests/min).\nFROM metrics-apachereceiver.otel-*\n| STATS max_requests = MAX(metrics.apache.requests), min_requests = MIN(metrics.apache.requests),\n first_ts = MIN(@timestamp), last_ts = MAX(@timestamp)\n BY resource.attributes.host.name\n| EVAL request_delta = max_requests - min_requests\n| EVAL request_delta = CASE(request_delta >= 0, request_delta, 0)\n| EVAL elapsed_sec = TO_DOUBLE(TO_LONG(last_ts) - TO_LONG(first_ts)) / 1000\n| WHERE elapsed_sec >= 60\n| EVAL request_rate = (request_delta / elapsed_sec) * 60\n| WHERE request_rate > 1000\n| EVAL request_rate = ROUND(request_rate, 2)\n| KEEP resource.attributes.host.name, request_delta, request_rate, elapsed_sec\n| SORT request_rate DESC" + }, + "groupBy": "row", + "timeField": "@timestamp" + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-worker-pool-exhaustion.json b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-worker-pool-exhaustion.json index 07e616c5a3e..26b81d722b1 100644 --- a/packages/apache_otel/kibana/alerting_rule_template/apache_otel-worker-pool-exhaustion.json +++ b/packages/apache_otel/kibana/alerting_rule_template/apache_otel-worker-pool-exhaustion.json @@ -1,41 +1,52 @@ { - "id": "apache_otel-worker-pool-exhaustion", - "type": "alerting_rule_template", - "managed": 
true, - "attributes": { - "name": "[Apache OTel] Worker pool exhaustion", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "apache" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-apachereceiver.otel-*\n| WHERE apache.workers IS NOT NULL\n| STATS total = SUM(apache.workers),\n busy = SUM(apache.workers) WHERE attributes.state == \"busy\"\n BY resource.attributes.apache.server.name\n| WHERE total > 0\n| EVAL utilization_pct = ROUND(TO_DOUBLE(busy) / TO_DOUBLE(total) * 100.0, 2)\n| WHERE utilization_pct > 80.0\n| SORT utilization_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.apache.server.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "apache_otel-worker-pool-exhaustion", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the busy-to-total worker ratio exceeds 80%. This is the canonical worker-pool saturation signal and a direct predictor of request queuing or refusal.", + "name": "[Apache OTel] Worker pool exhaustion", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "apache" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_otel-overview" } + ], + "investigation_guide": { + "blob": "## Apache worker pool exhaustion\n\n### What fired\n`SUM(apache.workers WHERE state=busy) / SUM(apache.workers)` exceeded 0.80 during the evaluation window.\n\n### Why it matters\nThis is the primary saturation indicator for Apache. 
Above 80% worker utilisation, bursts will fill the accept queue; at 100% new connections are rejected outright (or queued up to `ListenBacklog` and timed out).\n\n### Triage\n1. Overview dashboard: confirm per-server worker state and trend.\n2. Check `apache.request.time / apache.requests` rate for rising per-request latency \u2014 long backends inflate worker hold time.\n3. Inspect `apache.connections.async` on event MPM, especially `closing`/`writing` states.\n4. Look at upstream (proxied) service health if Apache is a reverse proxy.\n\n### Remediation\n- Raise `MaxRequestWorkers` / `ServerLimit`.\n- Fix slow downstreams or add caching.\n- Horizontally scale the Apache fleet.\n\n### Tuning\n- `> 80%` matches the documented warning band. Tighten for SLA-critical tiers.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-apachereceiver.otel-*\n| WHERE apache.workers IS NOT NULL\n| STATS total = SUM(apache.workers),\n busy = SUM(apache.workers) WHERE attributes.state == \"busy\"\n BY resource.attributes.apache.server.name\n| WHERE total > 0\n| EVAL utilization_pct = ROUND(TO_DOUBLE(busy) / TO_DOUBLE(total) * 100.0, 2)\n| WHERE utilization_pct > 80.0\n| SORT utilization_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.apache.server.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_otel/manifest.yml b/packages/apache_otel/manifest.yml index 28fd2bccb0b..fe6d080198b 100644 --- a/packages/apache_otel/manifest.yml +++ b/packages/apache_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.4 name: apache_otel title: "Apache OpenTelemetry Assets" -version: 0.3.0 +version: 0.4.0 source: license: "Elastic-2.0" description: "Apache HTTP Server Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: 
- opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/apache_tomcat_otel/changelog.yml b/packages/apache_tomcat_otel/changelog.yml index 49f564e8a7f..6f3a2283024 100644 --- a/packages/apache_tomcat_otel/changelog.yml +++ b/packages/apache_tomcat_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the apache_tomcat_otel content pack. diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-connection-pool-saturation.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-connection-pool-saturation.json index 4a042ee8ef5..ef6713b392a 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-connection-pool-saturation.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-connection-pool-saturation.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-connection-pool-saturation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] Connection pool saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "1m" + "id": "apache_tomcat_otel-connection-pool-saturation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when connector open connections exceed 80% of `maxConnections`. 
Beyond this, keep-alive connections can starve new connections and requests begin to queue.", + "name": "[Tomcat OTel] Connection pool saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-thread-pool-connections" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_ThreadPool_connectionCount IS NOT NULL\n AND Catalina_ThreadPool_maxConnections IS NOT NULL\n| STATS\n connections = MAX(Catalina_ThreadPool_connectionCount),\n max_connections = MAX(Catalina_ThreadPool_maxConnections)\n BY host.name, attributes.name\n| WHERE max_connections > 0\n| EVAL connection_pct = ROUND(connections / max_connections * 100.0, 2)\n// Connection saturation threshold \u2014 when keep-alive connections accumulate\n// they can starve new connections; adjust based on connector capacity planning\n| WHERE connection_pct > 80.0\n| SORT connection_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat connection pool saturation\n\n### What fired\n`Catalina_ThreadPool_connectionCount / Catalina_ThreadPool_maxConnections` exceeded 0.80 over the evaluation window.\n\n### Why it matters\nEach connector accepts up to `maxConnections`. As that number fills \u2014 particularly with keep-alive connections that hold idle \u2014 new connections must wait in the accept queue (`acceptCount`). When that queue overflows, the OS refuses connections at the TCP level.\n\n### Triage\n1. 
Thread Pool & Connections dashboard: compare `connectionCount`, `keepAliveCount`, and `acceptCount` per connector.\n2. Check `currentThreadsBusy` to confirm whether work is actively being done on those connections.\n3. Look at upstream request latency \u2014 slow backends cause connections to linger.\n4. Verify `keepAliveTimeout` / `maxKeepAliveRequests` \u2014 aggressive keep-alive can drive saturation.\n\n### Remediation\n- Raise `maxConnections` if host resources allow.\n- Tune keep-alive behaviour (shorten timeouts, limit max requests per connection).\n- Scale Tomcat horizontally behind a balancer.\n\n### Tuning\n- `> 80%` is the documented warning band. Tighten or relax per connector capacity plan.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_ThreadPool_connectionCount IS NOT NULL\n AND Catalina_ThreadPool_maxConnections IS NOT NULL\n| STATS\n connections = MAX(Catalina_ThreadPool_connectionCount),\n max_connections = MAX(Catalina_ThreadPool_maxConnections)\n BY host.name, attributes.name\n| WHERE max_connections > 0\n| EVAL connection_pct = ROUND(connections / max_connections * 100.0, 2)\n// Connection saturation threshold \u2014 when keep-alive connections accumulate\n// they can starve new connections; adjust based on connector capacity planning\n| WHERE connection_pct > 80.0\n| SORT connection_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-file-descriptor-exhaustion.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-file-descriptor-exhaustion.json index 04057e7c164..4fa4f876118 100644 --- 
a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-file-descriptor-exhaustion.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-file-descriptor-exhaustion.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-file-descriptor-exhaustion", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] File descriptor exhaustion", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "5m" + "id": "apache_tomcat_otel-file-descriptor-exhaustion", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when Tomcat's JVM process uses more than 80% of its file descriptor limit. FD exhaustion prevents new socket connections and file operations \u2014 effectively halting request processing.", + "name": "[Tomcat OTel] File descriptor exhaustion", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-jvm-os-resources" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_OperatingSystem_OpenFileDescriptorCount IS NOT NULL\n AND java_lang_OperatingSystem_MaxFileDescriptorCount IS NOT NULL\n| STATS\n open_fds = MAX(java_lang_OperatingSystem_OpenFileDescriptorCount),\n max_fds = MAX(java_lang_OperatingSystem_MaxFileDescriptorCount)\n BY host.name\n| WHERE max_fds > 0\n| EVAL fd_pct = ROUND(open_fds / max_fds * 100.0, 2)\n// File descriptor exhaustion prevents new socket connections and file operations\n// Adjust threshold based on your FD limits (ulimit -n)\n| WHERE fd_pct > 80.0\n| SORT fd_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - 
"timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat file descriptor exhaustion\n\n### What fired\n`OpenFileDescriptorCount / MaxFileDescriptorCount` exceeded 0.80 over the evaluation window.\n\n### Why it matters\nSockets, log files, and class loading all consume file descriptors. When the JVM hits the OS-imposed limit, `Too many open files` errors cause request failures and the JVM may need to be restarted to recover. This is one of the most disruptive operational failures for Tomcat.\n\n### Triage\n1. JVM & OS Resources dashboard: confirm trend and current vs max FDs.\n2. Identify leak drivers: unclosed streams, lingering sockets (check `netstat` / `ss` for CLOSE_WAIT pile-up), verbose class loading.\n3. Look at request rate and connection count \u2014 scaling traffic naturally scales FDs.\n\n### Remediation\n- Raise `ulimit -n` / systemd `LimitNOFILE` and restart the JVM.\n- Identify and fix FD leaks (unclosed `InputStream`, `Socket`, etc.).\n- Reduce keep-alive timeouts to accelerate socket reclamation.\n\n### Tuning\n- 80% warning is conservative; some deployments alert at 70% to leave more time for operator response.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_OperatingSystem_OpenFileDescriptorCount IS NOT NULL\n AND java_lang_OperatingSystem_MaxFileDescriptorCount IS NOT NULL\n| STATS\n open_fds = MAX(java_lang_OperatingSystem_OpenFileDescriptorCount),\n max_fds = MAX(java_lang_OperatingSystem_MaxFileDescriptorCount)\n BY host.name\n| WHERE max_fds > 0\n| EVAL fd_pct = ROUND(open_fds / max_fds * 100.0, 2)\n// File descriptor exhaustion prevents new socket connections and file operations\n// Adjust threshold based on your FD limits (ulimit -n)\n| WHERE fd_pct > 80.0\n| SORT fd_pct DESC" + }, + "size": 0, + 
"threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-jvm-process-cpu.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-jvm-process-cpu.json index faaa17be409..bd4b8ba1adb 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-jvm-process-cpu.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-jvm-process-cpu.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-high-jvm-process-cpu", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] High JVM process CPU usage", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "1m" + "id": "apache_tomcat_otel-high-jvm-process-cpu", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the JVM process CPU load averages above 90% (`java_lang_OperatingSystem_ProcessCpuLoad > 0.9`). 
Sustained high CPU degrades request latency and often correlates with GC pressure.", + "name": "[Tomcat OTel] High JVM process CPU usage", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-jvm-os-resources" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_OperatingSystem_ProcessCpuLoad IS NOT NULL\n| STATS avg_cpu = AVG(java_lang_OperatingSystem_ProcessCpuLoad)\n BY host.name\n| EVAL cpu_pct = ROUND(avg_cpu * 100.0, 2)\n// JVM process CPU threshold \u2014 sustained high CPU often correlates with\n// GC pressure, thread contention, or compute-heavy request processing\n// Adjust based on expected workload and available CPU cores\n| WHERE cpu_pct > 90.0\n| SORT cpu_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat high JVM process CPU usage\n\n### What fired\n`java_lang_OperatingSystem_ProcessCpuLoad` averaged above 0.90 during the evaluation window.\n\n### Why it matters\nPersistent high CPU use usually points to GC churn, synchronized-block contention, or compute-heavy request handling. Under CPU pressure, response latency degrades and the thread pool can saturate quickly.\n\n### Triage\n1. JVM & OS Resources dashboard: confirm host and trend.\n2. Inspect GC metrics (`java_lang_G1_Young_Generation_CollectionTime`, `java_lang_G1_Old_Generation_CollectionTime`) for GC dominating the CPU.\n3. Check thread counts and stack depth via thread dumps if CPU-bound without GC.\n4. 
Look at request mix \u2014 CPU-heavy endpoints like PDF generation, compression, TLS handshake bursts.\n\n### Remediation\n- Reduce GC tax (size heap correctly, tune G1 regions).\n- Profile hot methods (async-profiler, JFR) and optimise.\n- Scale out instances or up to more cores.\n\n### Tuning\n- `> 90%` is conservative. Many ops teams alert at 85% for earlier warning.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_OperatingSystem_ProcessCpuLoad IS NOT NULL\n| STATS avg_cpu = AVG(java_lang_OperatingSystem_ProcessCpuLoad)\n BY host.name\n| EVAL cpu_pct = ROUND(avg_cpu * 100.0, 2)\n// JVM process CPU threshold \u2014 sustained high CPU often correlates with\n// GC pressure, thread contention, or compute-heavy request processing\n// Adjust based on expected workload and available CPU cores\n| WHERE cpu_pct > 90.0\n| SORT cpu_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-request-error-rate.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-request-error-rate.json index 583ede35f7f..a58b0b9a992 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-request-error-rate.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-high-request-error-rate.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-high-request-error-rate", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] High request error rate", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "5m" + "id": 
"apache_tomcat_otel-high-request-error-rate", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the connector-level HTTP error rate exceeds 5% (`errorCount / requestCount > 0.05`). Sustained elevated error rate indicates application failures or misconfigurations.", + "name": "[Tomcat OTel] High request error rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-request-processing" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_GlobalRequestProcessor_errorCount IS NOT NULL\n AND Catalina_GlobalRequestProcessor_requestCount IS NOT NULL\n| STATS\n max_errors = MAX(Catalina_GlobalRequestProcessor_errorCount),\n min_errors = MIN(Catalina_GlobalRequestProcessor_errorCount),\n max_requests = MAX(Catalina_GlobalRequestProcessor_requestCount),\n min_requests = MIN(Catalina_GlobalRequestProcessor_requestCount)\n BY host.name, attributes.name\n| EVAL\n delta_errors = max_errors - min_errors,\n delta_requests = max_requests - min_requests\n// Minimum request volume to avoid noisy alerts on low-traffic connectors\n// Adjust this threshold based on expected traffic volume\n| WHERE delta_requests > 10\n| EVAL error_rate_pct = ROUND(delta_errors / delta_requests * 100.0, 2)\n// Alert threshold \u2014 adjust based on acceptable error rate for your services\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## 
Tomcat high request error rate\n\n### What fired\n`(delta errorCount) / (delta requestCount) > 5%` on a connector during the evaluation window.\n\n### Why it matters\nTomcat counts HTTP 4xx and 5xx responses as errors. Sustained error rate above a few percent typically reflects an application-layer problem \u2014 a failed dependency, a recently deployed bug, or misconfigured security filters.\n\n### Triage\n1. Request Processing dashboard: identify affected connectors / web applications.\n2. Correlate with deploy timelines and dependency status.\n3. Sample application logs for the dominant error category.\n4. Check if errors are concentrated on specific servlets (`Catalina_Servlet_errorCount` per servlet).\n\n### Remediation\n- Roll back or patch the offending application.\n- Fix downstream dependency; add circuit breakers to degrade gracefully.\n\n### Tuning\n- `> 5%` with `delta_requests > 10` is a sensible default. Lower for APIs with tight SLAs; raise for noisy UIs.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_GlobalRequestProcessor_errorCount IS NOT NULL\n AND Catalina_GlobalRequestProcessor_requestCount IS NOT NULL\n| STATS\n max_errors = MAX(Catalina_GlobalRequestProcessor_errorCount),\n min_errors = MIN(Catalina_GlobalRequestProcessor_errorCount),\n max_requests = MAX(Catalina_GlobalRequestProcessor_requestCount),\n min_requests = MIN(Catalina_GlobalRequestProcessor_requestCount)\n BY host.name, attributes.name\n| EVAL\n delta_errors = max_errors - min_errors,\n delta_requests = max_requests - min_requests\n// Minimum request volume to avoid noisy alerts on low-traffic connectors\n// Adjust this threshold based on expected traffic volume\n| WHERE delta_requests > 10\n| EVAL error_rate_pct = ROUND(delta_errors / delta_requests * 100.0, 2)\n// Alert threshold \u2014 adjust based on acceptable error rate for your services\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct 
DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-jvm-heap-memory-pressure.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-jvm-heap-memory-pressure.json index aaa419936a6..3cdd75c879f 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-jvm-heap-memory-pressure.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-jvm-heap-memory-pressure.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-jvm-heap-memory-pressure", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] JVM heap memory pressure", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "1m" + "id": "apache_tomcat_otel-jvm-heap-memory-pressure", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when JVM heap utilization exceeds 85% (`HeapMemoryUsage_used / HeapMemoryUsage_max > 0.85`). 
Sustained high heap often precedes full GCs and OutOfMemoryError.", + "name": "[Tomcat OTel] JVM heap memory pressure", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-jvm-memory-gc" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_Memory_HeapMemoryUsage_used IS NOT NULL\n AND java_lang_Memory_HeapMemoryUsage_max IS NOT NULL\n| STATS\n heap_used = MAX(java_lang_Memory_HeapMemoryUsage_used),\n heap_max = MAX(java_lang_Memory_HeapMemoryUsage_max)\n BY host.name\n// HeapMemoryUsage_max can be -1 if undefined; filter to valid values\n| WHERE heap_max > 0\n| EVAL heap_pct = ROUND(heap_used / heap_max * 100.0, 2)\n// Heap utilisation threshold \u2014 peaks above 85% indicate memory pressure\n// Sustained high utilisation often precedes OutOfMemoryError\n| WHERE heap_pct > 85.0\n| SORT heap_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat JVM heap memory pressure\n\n### What fired\nHeap utilisation peaked above 85% over the evaluation window.\n\n### Why it matters\nA healthy JVM shows a sawtooth heap pattern. Sustained high peaks indicate the old generation is filling up, old-gen GCs become frequent and long, and eventually the JVM will throw `OutOfMemoryError` and die. Session growth, classloader leaks on redeploy, and undersized heap are typical causes.\n\n### Triage\n1. JVM Memory & GC dashboard: inspect heap trend and old-gen usage.\n2. 
Check `java_lang_G1_Old_Gen_Usage_used` after old GCs \u2014 a rising post-GC baseline indicates a leak.\n3. Correlate with `Catalina_Manager_activeSessions` (session accumulation is a classic leak vector).\n4. If redeploy cadence is high, suspect classloader leaks / metaspace growth.\n\n### Remediation\n- Raise `-Xmx` if the workload legitimately grew.\n- Take and analyse a heap dump to find the dominant retained objects.\n- Fix leaks (ThreadLocals, unclosed resources, classloader leaks after hot-redeploy).\n\n### Tuning\n- 85% matches the documented warning band. Lower to 80% for earlier warning; raise to 90% for memory-tight steady-state workloads.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_Memory_HeapMemoryUsage_used IS NOT NULL\n AND java_lang_Memory_HeapMemoryUsage_max IS NOT NULL\n| STATS\n heap_used = MAX(java_lang_Memory_HeapMemoryUsage_used),\n heap_max = MAX(java_lang_Memory_HeapMemoryUsage_max)\n BY host.name\n// HeapMemoryUsage_max can be -1 if undefined; filter to valid values\n| WHERE heap_max > 0\n| EVAL heap_pct = ROUND(heap_used / heap_max * 100.0, 2)\n// Heap utilisation threshold \u2014 peaks above 85% indicate memory pressure\n// Sustained high utilisation often precedes OutOfMemoryError\n| WHERE heap_pct > 85.0\n| SORT heap_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-old-generation-gc-activity.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-old-generation-gc-activity.json index 543d6d69578..ba299f04ffd 100644 --- 
a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-old-generation-gc-activity.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-old-generation-gc-activity.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-old-generation-gc-activity", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] Old generation GC activity", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "5m" + "id": "apache_tomcat_otel-old-generation-gc-activity", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any old-generation (full) GC cycles occur (`java_lang_G1_Old_Generation_CollectionCount` increases). Full GCs stop-the-world and should be rare in a healthy JVM.", + "name": "[Tomcat OTel] Old generation GC activity", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-jvm-memory-gc" }, - "alertDelay": { - "active": 2 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_G1_Old_Generation_CollectionCount IS NOT NULL\n| STATS\n max_count = MAX(java_lang_G1_Old_Generation_CollectionCount),\n min_count = MIN(java_lang_G1_Old_Generation_CollectionCount)\n BY host.name\n| EVAL full_gc_count = max_count - min_count\n// Any old generation (full) GC pauses the entire JVM and directly impacts latency\n// Frequent full GC (> 1/min) is critical \u2014 adjust threshold to match your GC tuning\n| WHERE full_gc_count > 0\n| SORT full_gc_count DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 
100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat old generation GC activity\n\n### What fired\n`java_lang_G1_Old_Generation_CollectionCount` increased during the evaluation window.\n\n### Why it matters\nOld-generation collections (mixed / full GCs under G1) pause the entire JVM. Even a single full GC can cause request-latency spikes lasting hundreds of milliseconds to seconds. Frequent full GCs indicate heap pressure that young-gen collection cannot resolve.\n\n### Triage\n1. JVM Memory & GC dashboard: look at collection count and collection time trend.\n2. Inspect `java_lang_G1_Old_Gen_Usage_used` post-GC baseline \u2014 rising means memory leak.\n3. Correlate with heap utilization and request latency spikes.\n4. If redeploys coincide, suspect classloader leaks.\n\n### Remediation\n- Fix memory leaks revealed by heap dumps.\n- Size heap correctly \u2014 too small drives frequent full GCs, too large extends pause duration.\n- Tune G1 region size / IHOP if old-gen pressure is structural.\n\n### Tuning\n- Fires on any full GC. 
For heavy batch workloads where occasional old-gen GCs are acceptable, set a rate threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE java_lang_G1_Old_Generation_CollectionCount IS NOT NULL\n| STATS\n max_count = MAX(java_lang_G1_Old_Generation_CollectionCount),\n min_count = MIN(java_lang_G1_Old_Generation_CollectionCount)\n BY host.name\n| EVAL full_gc_count = max_count - min_count\n// Any old generation (full) GC pauses the entire JVM and directly impacts latency\n// Frequent full GC (> 1/min) is critical \u2014 adjust threshold to match your GC tuning\n| WHERE full_gc_count > 0\n| SORT full_gc_count DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-request-latency-spike.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-request-latency-spike.json index 58b9ac85e4f..26acde5ead4 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-request-latency-spike.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-request-latency-spike.json @@ -1,38 +1,51 @@ { - "id": "apache_tomcat_otel-request-latency-spike", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] Request latency spike", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "5m" + "id": "apache_tomcat_otel-request-latency-spike", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the average per-request processing time exceeds 500 ms on a connector. 
Elevated average latency is a user-visible regression and usually precedes thread-pool saturation.", + "name": "[Tomcat OTel] Request latency spike", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-request-processing" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_GlobalRequestProcessor_processingTime IS NOT NULL\n AND Catalina_GlobalRequestProcessor_requestCount IS NOT NULL\n| STATS\n max_time = MAX(Catalina_GlobalRequestProcessor_processingTime),\n min_time = MIN(Catalina_GlobalRequestProcessor_processingTime),\n max_requests = MAX(Catalina_GlobalRequestProcessor_requestCount),\n min_requests = MIN(Catalina_GlobalRequestProcessor_requestCount)\n BY host.name, attributes.name\n| EVAL\n delta_time_ms = max_time - min_time,\n delta_requests = max_requests - min_requests\n// Minimum request volume to compute a meaningful average\n| WHERE delta_requests > 10\n| EVAL avg_latency_ms = ROUND(delta_time_ms / delta_requests, 2)\n// Latency threshold in milliseconds \u2014 adjust based on your SLA\n// Consider that this is average latency; tail latency will be higher\n| WHERE avg_latency_ms > 500.0\n| SORT avg_latency_ms DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true - + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat request latency spike\n\n### What fired\n`delta processingTime / delta requestCount > 500 ms` on a connector over the evaluation window.\n\n### Why it matters\nAverage request latency is the most direct SLI for a Tomcat 
connector. Sustained increases hurt user experience and cascade into thread pool saturation as workers hold connections longer. Tail latency will be worse than average, so the true impact often exceeds what this rule shows.\n\n### Triage\n1. Request Processing dashboard: confirm connector/application affected.\n2. Correlate with thread pool saturation (`Catalina_ThreadPool_currentThreadsBusy`).\n3. Check GC pauses (`java_lang_G1_Old_Generation_CollectionTime`) \u2014 GC spikes cause latency spikes.\n4. Inspect downstream dependency latency (DB, REST services, caches).\n\n### Remediation\n- Fix the slow downstream / slow servlet.\n- Add caching or circuit breakers.\n- Size the thread pool and heap for the observed latency envelope.\n\n### Tuning\n- `> 500 ms` is a coarse default; most services should alert at a much lower threshold tuned to their SLA.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_GlobalRequestProcessor_processingTime IS NOT NULL\n AND Catalina_GlobalRequestProcessor_requestCount IS NOT NULL\n| STATS\n max_time = MAX(Catalina_GlobalRequestProcessor_processingTime),\n min_time = MIN(Catalina_GlobalRequestProcessor_processingTime),\n max_requests = MAX(Catalina_GlobalRequestProcessor_requestCount),\n min_requests = MIN(Catalina_GlobalRequestProcessor_requestCount)\n BY host.name, attributes.name\n| EVAL\n delta_time_ms = max_time - min_time,\n delta_requests = max_requests - min_requests\n// Minimum request volume to compute a meaningful average\n| WHERE delta_requests > 10\n| EVAL avg_latency_ms = ROUND(delta_time_ms / delta_requests, 2)\n// Latency threshold in milliseconds \u2014 adjust based on your SLA\n// Consider that this is average latency; tail latency will be higher\n| WHERE avg_latency_ms > 500.0\n| SORT avg_latency_ms DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + 
"timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-session-rejection.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-session-rejection.json index be39099b97d..ab2746733e4 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-session-rejection.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-session-rejection.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-session-rejection", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] Session rejection", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "tomcat" - ], - "schedule": { - "interval": "5m" + "id": "apache_tomcat_otel-session-rejection", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any HTTP sessions are rejected (`Catalina_Manager_rejectedSessions` increases). 
Rejection means `maxActiveSessions` was reached \u2014 real users are being turned away.", + "name": "[Tomcat OTel] Session rejection", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-sessions-servlets" }, - "alertDelay": { - "active": 2 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_Manager_rejectedSessions IS NOT NULL\n| STATS\n max_rejected = MAX(Catalina_Manager_rejectedSessions),\n min_rejected = MIN(Catalina_Manager_rejectedSessions)\n BY host.name, attributes.context, attributes.host\n| EVAL new_rejections = max_rejected - min_rejected\n// Any session rejection means maxActiveSessions has been reached\n// This directly impacts users who cannot establish new sessions\n| WHERE new_rejections > 0\n| SORT new_rejections DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat session rejection\n\n### What fired\n`Catalina_Manager_rejectedSessions` increased on at least one web application during the evaluation window.\n\n### Why it matters\nSessions are rejected when the manager reaches `maxActiveSessions`. Every rejection is a user that cannot establish a session \u2014 likely a login failure or lost shopping cart. Silent session rejection often goes unnoticed until support tickets arrive.\n\n### Triage\n1. Sessions & Applications dashboard: find the affected context.\n2. Compare `activeSessions` vs `maxActiveSessions` and the rate of `sessionCreateRate` / `sessionExpireRate`.\n3. 
Check for unusually long session TTLs or a recent traffic spike.\n\n### Remediation\n- Raise `maxActiveSessions` (verify heap can tolerate it).\n- Shorten session TTL or promote session expiration.\n- Investigate why sessions accumulate (leaked references, absent logout flows).\n\n### Tuning\n- Fires on any rejection. Lower sensitivity is rarely appropriate since every rejection is a user-visible failure.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_Manager_rejectedSessions IS NOT NULL\n| STATS\n max_rejected = MAX(Catalina_Manager_rejectedSessions),\n min_rejected = MIN(Catalina_Manager_rejectedSessions)\n BY host.name, attributes.context, attributes.host\n| EVAL new_rejections = max_rejected - min_rejected\n// Any session rejection means maxActiveSessions has been reached\n// This directly impacts users who cannot establish new sessions\n| WHERE new_rejections > 0\n| SORT new_rejections DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-thread-pool-saturation.json b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-thread-pool-saturation.json index 301538d5652..ce7070da4fe 100644 --- a/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-thread-pool-saturation.json +++ b/packages/apache_tomcat_otel/kibana/alerting_rule_template/apache_tomcat_otel-thread-pool-saturation.json @@ -1,37 +1,51 @@ { - "id": "apache_tomcat_otel-thread-pool-saturation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Tomcat OTel] Thread pool saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - 
"tomcat" - ], - "schedule": { - "interval": "1m" + "id": "apache_tomcat_otel-thread-pool-saturation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when thread pool utilisation exceeds 80% (`currentThreadsBusy / maxThreads > 0.80`). This is the primary saturation signal for Tomcat; requests begin queuing and refusing here.", + "name": "[Tomcat OTel] Thread pool saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "tomcat" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "apache_tomcat_otel-thread-pool-connections" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_ThreadPool_currentThreadsBusy IS NOT NULL\n AND Catalina_ThreadPool_maxThreads IS NOT NULL\n| STATS\n busy = MAX(Catalina_ThreadPool_currentThreadsBusy),\n max_threads = MAX(Catalina_ThreadPool_maxThreads)\n BY host.name, attributes.name\n| WHERE max_threads > 0\n| EVAL utilization_pct = ROUND(busy / max_threads * 100.0, 2)\n// Thread pool saturation threshold \u2014 adjust based on your capacity requirements\n// Above 80% the risk of request queuing and rejection increases rapidly\n| WHERE utilization_pct > 80.0\n| SORT utilization_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 100, - "excludeHitsFromPreviousRun": true + { + "id": "apache_tomcat_otel-overview" } + ], + "investigation_guide": { + "blob": "## Tomcat thread pool saturation\n\n### What fired\n`currentThreadsBusy / maxThreads` on a connector exceeded 80% during the evaluation window.\n\n### Why it matters\nThread pool utilisation is the single most important saturation indicator in Tomcat. 
Above 80%, bursts begin to queue; at 100% the accept queue (`acceptCount`) absorbs further connections until it too is full, after which new connections are refused at the TCP level.\n\n### Triage\n1. Thread Pool & Connections dashboard: confirm per-connector utilisation trend.\n2. Check request latency (`processingTime / requestCount`) \u2014 slow requests pin threads longer.\n3. Inspect downstream dependency health (DB, HTTP clients) to find the source of the slowness.\n4. Look at connection count and keep-alive count \u2014 unhealthy keep-alive patterns inflate usage.\n\n### Remediation\n- Fix the slow downstream so threads are released faster.\n- Raise `maxThreads` (verify heap and CPU can support it).\n- Scale Tomcat horizontally behind a load balancer.\n\n### Tuning\n- `> 80%` matches documented guidance. Tighten to 70% for latency-sensitive tiers; 85% is the absolute ceiling before the rule becomes late to warn.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-tomcat.otel-*\n| WHERE Catalina_ThreadPool_currentThreadsBusy IS NOT NULL\n AND Catalina_ThreadPool_maxThreads IS NOT NULL\n| STATS\n busy = MAX(Catalina_ThreadPool_currentThreadsBusy),\n max_threads = MAX(Catalina_ThreadPool_maxThreads)\n BY host.name, attributes.name\n| WHERE max_threads > 0\n| EVAL utilization_pct = ROUND(busy / max_threads * 100.0, 2)\n// Thread pool saturation threshold \u2014 adjust based on your capacity requirements\n// Above 80% the risk of request queuing and rejection increases rapidly\n| WHERE utilization_pct > 80.0\n| SORT utilization_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 100, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/apache_tomcat_otel/manifest.yml b/packages/apache_tomcat_otel/manifest.yml index 
a0654fc6b62..1bd62d4374d 100644 --- a/packages/apache_tomcat_otel/manifest.yml +++ b/packages/apache_tomcat_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: apache_tomcat_otel title: "Apache Tomcat OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Apache Tomcat Assets from OpenTelemetry Collector" @@ -13,7 +13,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/cassandra_otel/changelog.yml b/packages/cassandra_otel/changelog.yml index 88a0d17e0d0..0c80107d72d 100644 --- a/packages/cassandra_otel/changelog.yml +++ b/packages/cassandra_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-compaction-pending.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-compaction-pending.json index d7e34400464..7c37f630196 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-compaction-pending.json +++ b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-compaction-pending.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-compaction-pending", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] Compaction falling behind (pending tasks)", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS 
metrics-cassandra.otel-default\n// Gauge: pending compaction tasks; > 0 means compaction is falling behind\n| WHERE cassandra.compaction.tasks.pending IS NOT NULL\n| STATS pending = MAX(MAX_OVER_TIME(cassandra.compaction.tasks.pending)) BY server.address\n// Any persistent backlog degrades read performance\n| WHERE pending > 0\n| SORT pending DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-compaction-pending", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when a node has pending compaction tasks (`cassandra.compaction.tasks.pending > 0`). A persistent backlog degrades read performance as SSTables pile up.", + "name": "[Cassandra OTel] Compaction falling behind (pending tasks)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra compaction pending\n\n### What fired\n`cassandra.compaction.tasks.pending` exceeded 0 on at least one node over the evaluation window.\n\n### Why it matters\nCassandra merges SSTables through compaction. When compaction falls behind, each read must consult more SSTables, drastically increasing read latency and disk I/O. Sustained pending compactions usually indicate write load outpaces compaction throughput or disk I/O is constrained.\n\n### Triage\n1. Overview dashboard: confirm the affected nodes and trend.\n2. Correlate with read latency p99 and range-slice latency \u2014 you should see degradation on the same nodes.\n3. 
Inspect disk I/O utilisation and free space on data directories.\n4. Look for recent schema or workload changes driving write amplification.\n\n### Remediation\n- Throttle writes until compaction catches up (cluster-wide).\n- Increase `concurrent_compactors` / `compaction_throughput_mb_per_sec` if hardware permits.\n- Scale out to distribute write load across more nodes.\n\n### Tuning\n- Fires on any pending task. For large clusters with steady small backlogs, use `pending > N` or a percentage of total SSTables.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// Gauge: pending compaction tasks; > 0 means compaction is falling behind\n| WHERE cassandra.compaction.tasks.pending IS NOT NULL\n| STATS pending = MAX(MAX_OVER_TIME(cassandra.compaction.tasks.pending)) BY server.address\n// Any persistent backlog degrades read performance\n| WHERE pending > 0\n| SORT pending DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-error-rate.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-error-rate.json index 7c4de6204c2..2a598e08196 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-error-rate.json +++ b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-error-rate.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-high-error-rate", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] High error rate by node", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - 
"active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-cassandra.otel-default\n// Combine request count and error count per node via FORK\n| FORK (\n WHERE cassandra.client.request.count IS NOT NULL\n | STATS requests = SUM(INCREASE(cassandra.client.request.count)) BY server.address\n | EVAL errors = 0.0\n )\n (\n WHERE cassandra.client.request.error.count IS NOT NULL\n | STATS errors = SUM(INCREASE(cassandra.client.request.error.count)) BY server.address\n | EVAL requests = 0.0\n )\n| STATS total_requests = SUM(requests), total_errors = SUM(errors) BY server.address\n// Minimum sample size to avoid noisy low-traffic nodes\n| WHERE total_requests > 10\n| EVAL error_rate_pct = ROUND(total_errors / total_requests * 100.0, 2)\n// Threshold: 5% error rate; tune for your tolerance\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-high-error-rate", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the per-node client error rate exceeds 5% (`error.count / request.count > 0.05`). 
Errors include timeouts, unavailable exceptions, and operation failures.", + "name": "[Cassandra OTel] High error rate by node", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra high error rate by node\n\n### What fired\nOn at least one node the computed error rate (`cassandra.client.request.error.count` / `cassandra.client.request.count`) exceeded 5% over the evaluation window.\n\n### Why it matters\nCassandra tracks Timeouts, Unavailable, and Failure exceptions per operation. Elevated error rate means client applications are receiving explicit failures and retry pressure is rising, which amplifies load and usually correlates with node instability, partition hotspots, or downed replicas.\n\n### Triage\n1. Overview dashboard: identify which node(s) show the highest error rate.\n2. Check `cassandra.storage.total_hints.in_progress.count` \u2014 hints are the canonical signal of replica unreachability.\n3. Inspect read/write p99 latencies on the affected node.\n4. Correlate with recent deploys, schema changes, or hardware events.\n\n### Remediation\n- Restore unavailable replicas (check gossip state, network, disks).\n- Rebalance hot partitions via schema / data modelling changes.\n- Temporarily reduce client consistency levels to keep reads available if a replica is permanently lost.\n\n### Tuning\n- 5% with a minimum of 10 requests is a generic default. 
Tighten to 1-2% for SLA-critical tiers; use a higher minimum sample size for sparsely-used clusters.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// Combine request count and error count per node via FORK\n| FORK (\n WHERE cassandra.client.request.count IS NOT NULL\n | STATS requests = SUM(INCREASE(cassandra.client.request.count)) BY server.address\n | EVAL errors = 0.0\n )\n (\n WHERE cassandra.client.request.error.count IS NOT NULL\n | STATS errors = SUM(INCREASE(cassandra.client.request.error.count)) BY server.address\n | EVAL requests = 0.0\n )\n| STATS total_requests = SUM(requests), total_errors = SUM(errors) BY server.address\n// Minimum sample size to avoid noisy low-traffic nodes\n| WHERE total_requests > 10\n| EVAL error_rate_pct = ROUND(total_errors / total_requests * 100.0, 2)\n// Threshold: 5% error rate; tune for your tolerance\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-range-slice-latency-p99.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-range-slice-latency-p99.json index 68724c756f5..cbc0d1edc90 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-range-slice-latency-p99.json +++ b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-range-slice-latency-p99.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-high-range-slice-latency-p99", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] High range slice latency (p99)", - "ruleTypeId": ".es-query", - "tags": [ - "observability", 
- "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-cassandra.otel-default\n// RangeSlice (range scans, secondary index) latency; inherently higher than single-partition reads\n| WHERE `cassandra.client.request.range_slice.latency.99p` IS NOT NULL\n| STATS p99_us = MAX(MAX_OVER_TIME(`cassandra.client.request.range_slice.latency.99p`)) BY server.address\n// Threshold: 500000 us = 500 ms; range scans are more expensive\n| WHERE p99_us > 500000\n| SORT p99_us DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-high-range-slice-latency-p99", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the p99 range-slice latency exceeds 500 ms (`cassandra.client.request.range_slice.latency.99p > 500000 us`). Range scans are inherently expensive but sustained high p99 indicates trouble.", + "name": "[Cassandra OTel] High range slice latency (p99)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra high range slice latency (p99)\n\n### What fired\np99 of `cassandra.client.request.range_slice.latency.99p` exceeded 500 ms (500,000 \u00b5s) on at least one node.\n\n### Why it matters\nRange slices (used by secondary-index queries and full-table scans) are the most expensive read operation in Cassandra. 
Sustained high p99 indicates disk pressure, compaction backlog, or cross-DC queries; applications that depend on range scans will time out.\n\n### Triage\n1. Overview dashboard: confirm affected nodes and trend.\n2. Correlate with compaction backlog (`cassandra.compaction.tasks.pending`) and read latency.\n3. Look at disk I/O and page-cache pressure.\n4. Review query patterns \u2014 range scans on non-partition keys are an antipattern and may need data modelling changes.\n\n### Remediation\n- Resolve compaction backlog.\n- Move range-scan workloads to an analytics replica (Spark, Presto).\n- Redesign the data model to support the query with a partition-scoped read.\n\n### Tuning\n- 500 ms is a reasonable default; adjust to the latency tolerance of the dependent application.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// RangeSlice (range scans, secondary index) latency; inherently higher than single-partition reads\n| WHERE `cassandra.client.request.range_slice.latency.99p` IS NOT NULL\n| STATS p99_us = MAX(MAX_OVER_TIME(`cassandra.client.request.range_slice.latency.99p`)) BY server.address\n// Threshold: 500000 us = 500 ms; range scans are more expensive\n| WHERE p99_us > 500000\n| SORT p99_us DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-read-latency-p99.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-read-latency-p99.json index 9f83694e09e..50390129d13 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-read-latency-p99.json +++ 
b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-read-latency-p99.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-high-read-latency-p99", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] High read latency (p99)", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-cassandra.otel-default\n// p99 read latency in microseconds; backtick-quote field names with numeric segments\n| WHERE `cassandra.client.request.read.latency.99p` IS NOT NULL\n| STATS p99_us = MAX(MAX_OVER_TIME(`cassandra.client.request.read.latency.99p`)) BY server.address\n// Threshold: 100000 us = 100 ms; tune for your SLA\n| WHERE p99_us > 100000\n| SORT p99_us DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-high-read-latency-p99", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the p99 read latency exceeds 100 ms (`cassandra.client.request.read.latency.99p > 100000 us`). 
This is the primary read-side SLI for Cassandra.", + "name": "[Cassandra OTel] High read latency (p99)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra high read latency (p99)\n\n### What fired\n`cassandra.client.request.read.latency.99p` exceeded 100 ms on at least one node during the evaluation window.\n\n### Why it matters\nRead latency p99 is the user-facing latency for point-lookup reads. Sustained high p99 reflects disk pressure, compaction backlog, GC pauses, or network issues between coordinator and replicas. Client SLAs break long before p99 becomes extreme.\n\n### Triage\n1. Overview dashboard: identify affected nodes and correlate with compaction pending, hints-in-progress, and error rate.\n2. Check disk I/O utilisation and free page cache.\n3. GC pauses \u2014 large young-gen or old-gen GCs pause the JVM and directly impact p99.\n4. Inspect per-query patterns \u2014 hot partitions can dominate a node.\n\n### Remediation\n- Resolve compaction backlog.\n- Fix hot partitions (data modelling).\n- Add capacity or tune JVM/GC for lower pauses.\n\n### Tuning\n- 100 ms is a reasonable default for point lookups. 
Tune to the consuming application's SLA.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// p99 read latency in microseconds; backtick-quote field names with numeric segments\n| WHERE `cassandra.client.request.read.latency.99p` IS NOT NULL\n| STATS p99_us = MAX(MAX_OVER_TIME(`cassandra.client.request.read.latency.99p`)) BY server.address\n// Threshold: 100000 us = 100 ms; tune for your SLA\n| WHERE p99_us > 100000\n| SORT p99_us DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-write-latency-p99.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-write-latency-p99.json index b856fe2a3bf..64931eb0562 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-write-latency-p99.json +++ b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-high-write-latency-p99.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-high-write-latency-p99", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] High write latency (p99)", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-cassandra.otel-default\n// p99 write latency in microseconds\n| WHERE `cassandra.client.request.write.latency.99p` IS NOT NULL\n| STATS p99_us = MAX(MAX_OVER_TIME(`cassandra.client.request.write.latency.99p`)) BY server.address\n// Threshold: 50000 us = 50 ms; 
writes typically lower than reads\n| WHERE p99_us > 50000\n| SORT p99_us DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-high-write-latency-p99", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the p99 write latency exceeds 50 ms (`cassandra.client.request.write.latency.99p > 50000 us`). Sustained high write p99 usually points to disk / commit-log pressure.", + "name": "[Cassandra OTel] High write latency (p99)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra high write latency (p99)\n\n### What fired\n`cassandra.client.request.write.latency.99p` exceeded 50 ms on at least one node during the evaluation window.\n\n### Why it matters\nWrites in Cassandra go to the commit log and memtable; under healthy conditions they are very fast. High p99 writes indicate commit-log disk pressure, flush backlog, or GC pauses. Write amplification from large unbounded collections is another common culprit.\n\n### Triage\n1. Overview dashboard: confirm affected nodes and write volume.\n2. Check disk I/O on commit-log volume.\n3. Inspect memtable flush activity and pending compactions.\n4. Look at GC pauses \u2014 writes are latency-sensitive to stop-the-world events.\n\n### Remediation\n- Use dedicated commit-log disks / faster storage.\n- Throttle bulk-write workloads.\n- Review schema for unbounded collections or wide rows.\n\n### Tuning\n- 50 ms is a generic threshold. 
Writes are typically <10 ms on healthy clusters; tighten if that matches your baseline.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// p99 write latency in microseconds\n| WHERE `cassandra.client.request.write.latency.99p` IS NOT NULL\n| STATS p99_us = MAX(MAX_OVER_TIME(`cassandra.client.request.write.latency.99p`)) BY server.address\n// Threshold: 50000 us = 50 ms; writes typically lower than reads\n| WHERE p99_us > 50000\n| SORT p99_us DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-hints-in-progress.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-hints-in-progress.json index cdaf71836db..b5083087eff 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-hints-in-progress.json +++ b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-hints-in-progress.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-hints-in-progress", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] Hints in progress (replicas unreachable)", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-cassandra.otel-default\n// Gauge: hints currently being sent; > 0 means replicas are unreachable\n| WHERE cassandra.storage.total_hints.in_progress.count IS NOT NULL\n| STATS hints_in_progress = 
MAX(MAX_OVER_TIME(cassandra.storage.total_hints.in_progress.count)) BY server.address\n| WHERE hints_in_progress > 0\n| SORT hints_in_progress DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-hints-in-progress", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any hints are in progress (`cassandra.storage.total_hints.in_progress.count > 0`). In-progress hints indicate replicas were recently unreachable.", + "name": "[Cassandra OTel] Hints in progress (replicas unreachable)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra hints in progress\n\n### What fired\n`cassandra.storage.total_hints.in_progress.count` exceeded 0 on at least one node.\n\n### Why it matters\nWhen a replica is unreachable, coordinators write hints for it that are replayed once the replica recovers. Hints in progress signal that one or more replicas were recently down or network-partitioned. If hints accumulate faster than they can be replayed, data loss becomes possible after `max_hint_window_in_ms`.\n\n### Triage\n1. Overview dashboard: identify nodes generating hints and their destinations.\n2. Check gossip / liveness state across the cluster.\n3. Correlate with error rate spikes and network health.\n4. 
Confirm the unreachable node has come back online and hint replay is progressing.\n\n### Remediation\n- Restore the unreachable node(s) as soon as possible.\n- If a replica is permanently lost, decommission and let hinted handoff expire; then run `nodetool repair` to restore consistency.\n\n### Tuning\n- Fires on any active hint. In-progress hints are always worth noting \u2014 do not loosen.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// Gauge: hints currently being sent; > 0 means replicas are unreachable\n| WHERE cassandra.storage.total_hints.in_progress.count IS NOT NULL\n| STATS hints_in_progress = MAX(MAX_OVER_TIME(cassandra.storage.total_hints.in_progress.count)) BY server.address\n| WHERE hints_in_progress > 0\n| SORT hints_in_progress DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-request-errors.json b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-request-errors.json index b3fdb02c758..b36884d7d30 100644 --- a/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-request-errors.json +++ b/packages/cassandra_otel/kibana/alerting_rule_template/cassandra_otel-request-errors.json @@ -1,39 +1,52 @@ { - "id": "cassandra_otel-request-errors", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Cassandra OTel] Request errors (Timeout, Unavailable, Failure)", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "cassandra" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": 
"esqlQuery", - "esqlQuery": { - "esql": "TS metrics-cassandra.otel-default\n// Counter metric: use INCREASE for absolute error count in window\n| WHERE cassandra.client.request.error.count IS NOT NULL\n| STATS error_count = SUM(INCREASE(cassandra.client.request.error.count)) BY server.address, attributes.operation, attributes.status\n| WHERE error_count > 0\n| SORT error_count DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "server.address", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "cassandra_otel-request-errors", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any increase in client request errors is observed (`cassandra.client.request.error.count > 0`), broken down by operation and status (Timeout, Unavailable, Failure).", + "name": "[Cassandra OTel] Request errors (Timeout, Unavailable, Failure)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "cassandra" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "cassandra_otel-overview" } + ], + "investigation_guide": { + "blob": "## Cassandra request errors\n\n### What fired\n`cassandra.client.request.error.count` increased during the evaluation window, dimensioned by `attributes.operation` and `attributes.status`.\n\n### Why it matters\nCassandra distinguishes three error types:\n\n- **Timeout**: the coordinator did not hear back from enough replicas within the timeout.\n- **Unavailable**: not enough replicas were alive at query time to satisfy the consistency level.\n- **Failure**: an explicit non-timeout error (e.g. 
corruption, read failure) occurred on a replica.\n\nEach points to a different class of issue and the breakdown by `operation` (read / write / range-slice) narrows the triage further.\n\n### Triage\n1. Overview dashboard: look at the error type distribution first.\n2. For **Unavailable** \u2192 check gossip state / replica liveness.\n3. For **Timeout** \u2192 check latency p99 for the same operation, compaction pending, and GC pauses.\n4. For **Failure** \u2192 inspect node logs for explicit tracebacks (possible corruption / disk errors).\n\n### Remediation\n- Restore liveness or address latency root cause based on the dominant error type.\n- Consider lowering consistency level temporarily if Unavailable errors dominate and replicas cannot be restored quickly.\n\n### Tuning\n- Fires on any error. For a noisy environment where occasional errors are expected, require `error_count > N` in the query.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-cassandra.otel-default\n// Counter metric: use INCREASE for absolute error count in window\n| WHERE cassandra.client.request.error.count IS NOT NULL\n| STATS error_count = SUM(INCREASE(cassandra.client.request.error.count)) BY server.address, attributes.operation, attributes.status\n| WHERE error_count > 0\n| SORT error_count DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "server.address", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/cassandra_otel/manifest.yml b/packages/cassandra_otel/manifest.yml index 5803e203dde..db746550868 100644 --- a/packages/cassandra_otel/manifest.yml +++ b/packages/cassandra_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: cassandra_otel title: "Cassandra OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: 
"Apache Cassandra Assets from OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/cockroachdb_otel/changelog.yml b/packages/cockroachdb_otel/changelog.yml index 8b68fb27127..4271a43014f 100644 --- a/packages/cockroachdb_otel/changelog.yml +++ b/packages/cockroachdb_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-cpu-overload.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-cpu-overload.json index 2bab499b67b..d1f80189326 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-cpu-overload.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-cpu-overload.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a CockroachDB node's process CPU utilization exceeds 80% (normalized 0-1). Sustained values above this threshold indicate the workload is outpacing provisioned CPU and will starve Raft, compactions and SQL execution.", "name": "[CockroachDB OTel] CPU utilization high", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-overview" + }, + { + "id": "cockroachdb_otel-sql-transactions" + } + ], + "investigation_guide": { + "blob": "## CockroachDB CPU utilization high\n\n### What fired\n`sys_cpu_combined_percent_normalized` for a node exceeded `0.8` (80%) over the evaluation window. 
This is the CockroachDB process CPU only (not the host), normalized by the number of cores visible to the process.\n\n### Why it matters\nSustained high CPU on a CockroachDB node causes queue build-up across SQL execution, Raft, and the storage engine. Knock-on effects commonly include elevated `sys_runnable_goroutines_per_cpu`, LSM compaction backlogs (`storage_l0_sublevels`, `storage_l0_num_files`), write stalls, and unavailable/under-replicated ranges if leaseholders stop heartbeating.\n\n### Triage\n1. Confirm the scope \u2014 single node or cluster-wide? Check the CockroachDB OTel Overview dashboard, filtered by `attributes.node_id`.\n2. Compare host CPU (`sys_cpu_host_combined_percent_normalized`) with process CPU. If host CPU is also pegged, another process on the box is competing.\n3. Inspect SQL workload on the hot node: `sql_statements_active`, `sql_select_count`/`sql_insert_count`/... rates and `sql_distsql_contended_queries_count`.\n4. Check for hot ranges / lease imbalance: `replicas_leaseholders` and `rebalancing_queriespersecond` per store.\n5. Look at `sys_runnable_goroutines_per_cpu`; values >30 corroborate true CPU saturation.\n\n### Remediation\n- If one node is hot, consider lease rebalancing or identifying hot keys in the SQL workload.\n- If cluster-wide, scale out nodes or investigate a recently deployed workload / migration.\n- Check for expensive queries via `sql_full_scan_count` increases.\n\n### Tuning\n- Default threshold is 80% (`> 0.8`). 
For steady-state heavy workloads, raise to 0.85-0.9 to reduce noise.\n- `alertDelay.active` is 2 consecutive runs over a 10-minute window \u2014 increase for flappy environments.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND sys_cpu_combined_percent_normalized IS NOT NULL\n AND sys_cpu_combined_percent_normalized > 0.8\n// Node-level: CockroachDB process CPU; > 0.8 indicates overload\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NULL\n| STATS cpu_pct = ROUND(MAX(sys_cpu_combined_percent_normalized) * 100.0, 1) BY attributes.node_id, host.name\n| SORT cpu_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-deadlocks-detected.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-deadlocks-detected.json index 4a2655d7ce9..b58babee35b 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-deadlocks-detected.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-deadlocks-detected.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `txnwaitqueue_deadlocks_total` increases on a node, indicating transactions are deadlocking in the wait queue. 
Deadlocks typically point to application-level ordering issues or long-running transactions holding contended keys.", "name": "[CockroachDB OTel] Deadlocks detected", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-sql-transactions" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB deadlocks detected\n\n### What fired\nThe counter `txnwaitqueue_deadlocks_total` increased on one or more nodes over the evaluation window.\n\n### Why it matters\nDeadlocks indicate two or more transactions are waiting on each other's locks and CockroachDB had to abort at least one to break the cycle. Application latency and error rates will be affected, and repeated deadlocks often point to a systemic access-ordering problem.\n\n### Triage\n1. Identify affected nodes from the alert context and inspect the SQL & Transactions dashboard.\n2. Correlate with contention signals: `sql_distsql_contended_queries_count`, `kv_concurrency_avg_lock_wait_duration_nanos`, `kv_concurrency_max_lock_wait_duration_nanos`.\n3. Look at `sql_txn_abort_count` and `txn_restarts_serializable` for the same window; deadlock aborts will show up here.\n4. In application logs/traces, find transactions that repeatedly touch the same keys in different orders.\n\n### Remediation\n- Enforce a canonical order for accessing keys across transactions.\n- Keep transactions short; batch reads before writes where possible.\n- Consider `SELECT ... FOR UPDATE` to acquire locks in a predictable order.\n\n### Tuning\n- This rule fires on *any* increase in the counter. If low-rate deadlocks are acceptable, switch the query to threshold on a rate (e.g. 
`deadlocks > N` per 10m).\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND txnwaitqueue_deadlocks_total IS NOT NULL\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NULL\n// INCREASE for counter: any increase indicates deadlocks detected\n| STATS deadlocks = SUM(INCREASE(txnwaitqueue_deadlocks_total)) BY attributes.node_id, host.name\n| WHERE deadlocks > 0\n| SORT deadlocks DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-disk-io-bottleneck.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-disk-io-bottleneck.json index 36ef3acc4dd..bd64781e65b 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-disk-io-bottleneck.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-disk-io-bottleneck.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a node has more than 10 disk I/O operations queued (`sys_host_disk_iopsinprogress > 10`). Persistent double-digit queues indicate the disk cannot keep up with the workload and will cascade into write stalls and LSM back-pressure.", "name": "[CockroachDB OTel] Disk I/O operations queued high", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-storage" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB disk I/O queue high\n\n### What fired\n`sys_host_disk_iopsinprogress` for a node exceeded 10 over the evaluation window. 
CockroachDB documents single-digit queues as normal; persistent double digits indicate an I/O bottleneck.\n\n### Why it matters\nA saturated disk is the most common root cause of CockroachDB performance incidents. When the disk cannot keep up, the storage engine backs up, leading to write stalls (`storage_write_stalls`), LSM growth (`storage_l0_sublevels`, `storage_l0_num_files`), elevated read amplification and, eventually, range unavailability.\n\n### Triage\n1. Use the Storage dashboard to correlate `sys_host_disk_iopsinprogress` with `storage_write_stalls`, `admission_io_overload`, and L0 metrics.\n2. Check throughput: `sys_host_disk_read_bytes` and `sys_host_disk_write_bytes` rates. Is read- or write-heavy behavior driving the queue?\n3. Compare affected node(s) with peers \u2014 is traffic imbalanced (`rebalancing_queriespersecond`, `replicas_leaseholders`)?\n4. Inspect host-level monitoring (iostat, cloud provider volume metrics) for disk utilization, await time, and provisioned IOPS headroom.\n\n### Remediation\n- Provision higher IOPS / faster storage (e.g. upgrade to NVMe or higher-tier cloud volumes).\n- Rebalance traffic if a single node is hot.\n- Reduce write amplification by tuning workload (smaller batches, fewer indexes).\n\n### Tuning\n- Threshold (`> 10`) and 10-minute window are reasonable defaults. 
Tighten the window for latency-sensitive tiers.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND sys_host_disk_iopsinprogress IS NOT NULL\n AND sys_host_disk_iopsinprogress > 10\n// Persistent double digits indicate I/O bottleneck\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NULL\n| STATS iops_queued = MAX(sys_host_disk_iopsinprogress) BY attributes.node_id, host.name\n| SORT iops_queued DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-goroutines-overload.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-goroutines-overload.json index 462e054e315..ded2eefcfea 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-goroutines-overload.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-goroutines-overload.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the average runnable goroutines per CPU on a node exceeds 30 (`sys_runnable_goroutines_per_cpu > 30`). 
This is CockroachDB's canonical indicator that the Go scheduler cannot keep up and the process is CPU-bound.", "name": "[CockroachDB OTel] Runnable goroutines per CPU high", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-overview" + }, + { + "id": "cockroachdb_otel-sql-transactions" + } + ], + "investigation_guide": { + "blob": "## CockroachDB runnable goroutines per CPU high\n\n### What fired\n`sys_runnable_goroutines_per_cpu` exceeded 30 on at least one node over the evaluation window.\n\n### Why it matters\nThis metric is CockroachDB's preferred indicator of CPU saturation inside the process \u2014 it reflects scheduling pressure regardless of what the host CPU gauge shows. Sustained values above 30 produce elevated tail latency across all workloads.\n\n### Triage\n1. Cross-check with `sys_cpu_combined_percent_normalized` and `sys_cpu_host_combined_percent_normalized` \u2014 both should be high if the node is truly CPU-bound.\n2. Look at the workload: `sql_statements_active`, `sql_distsql_contended_queries_count`, `sql_full_scan_count` rate.\n3. Inspect leaseholder/replica balance (`replicas_leaseholders`, `rebalancing_queriespersecond`) \u2014 one node holding too many leases produces this pattern.\n4. Check for GC or compaction storms (`rocksdb_compactions` rate) that can saturate CPU indirectly.\n\n### Remediation\n- Rebalance leases / add nodes.\n- Eliminate expensive queries (full scans, large joins) surfaced by the application.\n- Upgrade instance size if the workload is simply beyond provisioned cores.\n\n### Tuning\n- `> 30` is the documented overload threshold. 
Lower only if you need earlier warning.\n- Default 10-minute window filters brief GC/scheduling spikes.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND sys_runnable_goroutines_per_cpu IS NOT NULL\n AND sys_runnable_goroutines_per_cpu > 30\n// Values > 30 indicate CPU overload\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NULL\n| STATS goroutines_per_cpu = MAX(sys_runnable_goroutines_per_cpu) BY attributes.node_id, host.name\n| SORT goroutines_per_cpu DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-io-admission-overload.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-io-admission-overload.json index 7db2ae535b6..d36b4c0c9db 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-io-admission-overload.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-io-admission-overload.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when CockroachDB's IO admission-control subsystem reports overload (`admission_io_overload > 1.0`) on a store. 
The admission controller is shedding or queueing work to protect the storage engine.", "name": "[CockroachDB OTel] IO admission control overload", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-storage" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB IO admission control overload\n\n### What fired\n`admission_io_overload` for one or more stores exceeded 1.0 over the evaluation window.\n\n### Why it matters\nAdmission control is CockroachDB's internal back-pressure mechanism. A value above 1.0 means the subsystem is actively shedding or delaying work because the storage engine would otherwise fall behind. This precedes user-visible symptoms: write stalls, elevated latency, and LSM health degradation.\n\n### Triage\n1. Confirm affected store(s) from the alert context (`attributes.node_id`, `attributes.store`).\n2. Inspect the Storage dashboard for correlated signals: `storage_write_stalls`, `storage_l0_sublevels`, `storage_l0_num_files`, `rocksdb_read_amplification`.\n3. Look at the disk itself: `sys_host_disk_iopsinprogress`, and cloud-provider IOPS/throughput metrics.\n4. 
Check CPU; under-provisioned CPU can starve compactions and trigger admission overload even when the disk is fine.\n\n### Remediation\n- If the disk is saturated, provision faster/bigger storage.\n- If CPU is starved, scale up cores or reduce workload intensity.\n- Reduce write amplification (batch size, indexing strategy) if compactions are the bottleneck.\n\n### Tuning\n- The `> 1.0` threshold matches CockroachDB's documented \"overloaded\" semantics.\n- Consider widening the window to 15m+ if brief spikes are tolerable.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND admission_io_overload IS NOT NULL\n AND admission_io_overload > 1.0\n// Values > 1.0 indicate storage overload; adjust threshold if needed\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS overload = MAX(admission_io_overload) BY attributes.node_id, attributes.store, host.name\n| SORT overload DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-l0-files-high.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-l0-files-high.json index 7927b533a6e..6fc865a13ad 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-l0-files-high.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-l0-files-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the number of files at L0 in the Pebble LSM tree exceeds 1000 on a store (`storage_l0_num_files > 1000`). 
This is a canonical sign that compactions are falling behind and read amplification is about to degrade.", "name": "[CockroachDB OTel] L0 file count high", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-storage" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB L0 file count high\n\n### What fired\n`storage_l0_num_files` on one or more stores exceeded 1000 over the evaluation window.\n\n### Why it matters\nL0 is the top of the Pebble LSM tree. Files land here from in-memory flushes and are compacted down into L1+. When L0 grows past ~1000 files, read amplification rises sharply and compaction debt becomes self-reinforcing. Left unchecked, the store will begin stalling writes.\n\n### Triage\n1. Open the Storage dashboard filtered by `attributes.node_id` / `attributes.store`.\n2. Correlate with `storage_l0_sublevels` (sub-levels growing?), `rocksdb_read_amplification` (rising?), `rocksdb_compactions` rate (stalling or thrashing?).\n3. Check admission control (`admission_io_overload`) and disk queue (`sys_host_disk_iopsinprogress`).\n4. Inspect CPU \u2014 compactions are CPU-heavy and will be squeezed out of a hot node.\n\n### Remediation\n- Reduce write throughput to the store until compactions catch up.\n- Scale out to redistribute writes across more stores.\n- Investigate disk speed; faster disks finish compactions faster.\n\n### Tuning\n- Default threshold (`> 1000`, 15-minute window) is conservative. 
For latency-sensitive clusters, tighten to `> 500`.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND storage_l0_num_files IS NOT NULL\n AND storage_l0_num_files > 1000\n// > 1000 indicates compaction falling behind\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS l0_files = MAX(storage_l0_num_files) BY attributes.node_id, attributes.store, host.name\n| SORT l0_files DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-liveness-heartbeat-failures.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-liveness-heartbeat-failures.json index 10e76cd108d..85394bb52a9 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-liveness-heartbeat-failures.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-liveness-heartbeat-failures.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a node registers any liveness heartbeat failures (`liveness_heartbeatfailures` increases). 
Liveness is how CockroachDB decides a node is alive; any failure is an early signal of instability or a network/gossip problem.", "name": "[CockroachDB OTel] Liveness heartbeat failures", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-overview" + }, + { + "id": "cockroachdb_otel-replication" + } + ], + "investigation_guide": { + "blob": "## CockroachDB liveness heartbeat failures\n\n### What fired\nThe counter `liveness_heartbeatfailures` increased on at least one node over the evaluation window.\n\n### Why it matters\nNode liveness is the foundation of CockroachDB's availability model. A node that misses heartbeats loses its leases and its ranges are temporarily unavailable until another replica takes over. Repeated failures usually indicate network partitioning, severe CPU/disk saturation, or clock skew.\n\n### Triage\n1. Check whether the node is still in the cluster (`liveness_livenodes` gauge across the cluster).\n2. Inspect correlated signals on the affected node: `rpc_connection_unhealthy`, `rpc_connection_failures`, `clock_offset_meannanos`.\n3. Look for resource starvation: `sys_cpu_combined_percent_normalized`, `sys_runnable_goroutines_per_cpu`, `sys_host_disk_iopsinprogress`.\n4. 
Verify NTP/chrony health on the host \u2014 clock skew above `cluster.max_offset` will push a node out of the liveness quorum.\n\n### Remediation\n- Fix the underlying resource pressure or network path.\n- Confirm NTP is running and offsets are sub-millisecond.\n- If the node is isolated, check firewall/security group rules between nodes.\n\n### Tuning\n- This rule fires on *any* increase, intentionally \u2014 a 1-minute evaluation with a 5-minute window catches brief incidents without being too noisy.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND liveness_heartbeatfailures IS NOT NULL\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NULL\n// INCREASE for counter: any increase indicates heartbeat failures\n| STATS increase = SUM(INCREASE(liveness_heartbeatfailures)) BY attributes.node_id, host.name\n| WHERE increase > 0\n| SORT increase DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-lsm-read-amplification.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-lsm-read-amplification.json index 7a04569f269..892f62a02d0 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-lsm-read-amplification.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-lsm-read-amplification.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `rocksdb_read_amplification` on a store exceeds 50, indicating each logical read is costing many physical reads. 
This is a strong signal of an unhealthy LSM tree.", "name": "[CockroachDB OTel] LSM read amplification high", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-storage" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB LSM read amplification high\n\n### What fired\n`rocksdb_read_amplification` on one or more stores exceeded 50 over the evaluation window. In a healthy cluster this metric should sit in the single digits.\n\n### Why it matters\nRead amplification is the average number of physical disk reads required per logical read. Elevated values mean the LSM tree is bloated (usually L0/L6 imbalance) and all reads are paying the cost. Sustained values above 50 are documented as \"unhealthy LSM\".\n\n### Triage\n1. On the Storage dashboard, correlate with `storage_l0_sublevels`, `storage_l0_num_files`, `rocksdb_compactions` rate, and `admission_io_overload`.\n2. Check disk health: `sys_host_disk_iopsinprogress`, provider IOPS metrics.\n3. Verify compactions aren't CPU-starved (`sys_cpu_combined_percent_normalized`, `sys_runnable_goroutines_per_cpu`).\n\n### Remediation\n- Throttle writes until compactions recover.\n- Provision more IOPS or CPU if compactions cannot keep up with steady-state load.\n- As a last resort, consider a manual compaction (advanced \u2014 coordinate with support).\n\n### Tuning\n- `> 50` for 15 minutes is the documented unhealthy-LSM signal. 
Lower for early warning or raise if your workload tolerates higher amplification.\n" + } }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND rocksdb_read_amplification IS NOT NULL\n AND rocksdb_read_amplification > 50\n// > 50 for sustained period indicates unhealthy LSM\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS amplification = MAX(rocksdb_read_amplification) BY attributes.node_id, attributes.store, host.name\n| SORT amplification DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15,
In a healthy cluster this gauge should always be 0.\n\n### Why it matters\nCockroachDB maintains bidirectional RPC connections between every pair of nodes. An unhealthy connection means the two nodes cannot reliably talk to each other, which can cause lease churn, replication delays, and \u2014 if persistent \u2014 under-replication or range unavailability.\n\n### Triage\n1. Identify the reporting node from the alert context and, from the Overview dashboard, check which peer(s) are unhealthy.\n2. Inspect `rpc_connection_failures` and `rpc_connection_avg_round_trip_latency` on both ends of the suspect link.\n3. Correlate with `liveness_heartbeatfailures` and `liveness_livenodes`.\n4. Check `clock_offset_meannanos` \u2014 large clock skew can poison the connection.\n5. Inspect host-level network metrics and your cloud's VPC/security-group configuration for the affected pair.\n\n### Remediation\n- Fix the network path (MTU, firewall, route) between the two nodes.\n- Restart NTP/chrony and verify clock skew is sub-millisecond.\n- If one node is unhealthy due to resource pressure, treat it as a separate incident.\n\n### Tuning\n- 1-minute evaluation with a 5-minute window is tight on purpose \u2014 RPC unhealth is a leading indicator of outages.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND rpc_connection_unhealthy IS NOT NULL\n AND rpc_connection_unhealthy > 0\n// > 0 indicates possible network partition\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NULL\n| STATS unhealthy = MAX(rpc_connection_unhealthy) BY attributes.node_id, host.name\n| SORT unhealthy DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-capacity-high.json 
b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-capacity-high.json index 0d1c3a6191b..bb1400fde26 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-capacity-high.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-capacity-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when used storage exceeds 60% of the total capacity on a store (`capacity_used / capacity > 0.6`). CockroachDB recommends keeping used below 60% to leave headroom for snapshots, compactions, and rebalancing.", "name": "[CockroachDB OTel] Storage capacity usage high", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-storage" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB storage capacity usage high\n\n### What fired\n`capacity_used / capacity` on at least one store exceeded 60% over the evaluation window.\n\n### Why it matters\nCockroachDB documents 60% as the safe headroom threshold. Above this, the cluster has less room for rebalancing, snapshots for up-replication, and compactions. Above ~90% the node will refuse writes.\n\n### Triage\n1. On the Storage dashboard, see which store(s) are highest and whether usage is trending upwards.\n2. Is the usage cluster-wide (steady data growth) or concentrated on a subset of stores (rebalancing lag)?\n3. 
Check `ranges` and `replicas` distribution across stores \u2014 imbalance will reveal itself here.\n\n### Remediation\n- Add nodes/capacity; let the rebalancer spread data.\n- Identify and archive or TTL large tables.\n- If the imbalance is temporary (decommission, node add), wait for rebalancing to complete; otherwise investigate `rebalancing_range_rebalances` progress.\n\n### Tuning\n- The `0.6` (60%) threshold is documented; raise to 0.7 for clusters with aggressive rebalancing SLOs, never higher than 0.8.\n- 15-minute evaluation window keeps noise low for a slow-moving metric.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND capacity IS NOT NULL\n AND capacity > 0\n AND capacity_used IS NOT NULL\n// Used should not exceed 60% of total; adjust ratio if needed\n| EVAL usage_ratio = capacity_used / capacity\n| WHERE usage_ratio > 0.6\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS usage_pct = ROUND(MAX(usage_ratio) * 100.0, 1), used_gb = ROUND(MAX(capacity_used) / 1073741824.0, 1), total_gb = ROUND(MAX(capacity) / 1073741824.0, 1)\n BY attributes.node_id, attributes.store, host.name\n| SORT usage_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-write-stalls.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-write-stalls.json index 47249852fd9..58e317f0c9c 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-write-stalls.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-storage-write-stalls.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a store reports 
any intentional write stalls due to disk backpressure (`storage_write_stalls > 0`). Write stalls are a direct user impact: SQL writes will be latent or failing while they persist.", "name": "[CockroachDB OTel] Storage write stalls detected", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-storage" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB storage write stalls detected\n\n### What fired\nThe gauge `storage_write_stalls` on at least one store exceeded 0 over the evaluation window.\n\n### Why it matters\nWrite stalls are intentional back-pressure by the storage engine to prevent the LSM from collapsing. While they are preferable to an unavailable store, they cause direct user-visible latency spikes for any workload writing through that store.\n\n### Triage\n1. On the Storage dashboard, correlate stalls with `admission_io_overload`, `storage_l0_sublevels`, `storage_l0_num_files`, and `rocksdb_read_amplification`.\n2. Check disk queue (`sys_host_disk_iopsinprogress`) and host I/O.\n3. Check CPU \u2014 compaction starvation can cause stalls.\n\n### Remediation\n- Increase provisioned IOPS / upgrade storage.\n- Shed or throttle the write workload temporarily.\n- Add nodes to spread the write load.\n\n### Tuning\n- This rule fires on any stall. 
For noisy environments, require `> N` over a longer window.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND storage_write_stalls IS NOT NULL\n AND storage_write_stalls > 0\n// Store-level metric: intentional write stalls due to disk backpressure\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS stalls = MAX(storage_write_stalls) BY attributes.node_id, attributes.store, host.name\n| SORT stalls DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-unavailable-ranges.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-unavailable-ranges.json index d92c69fc39b..4b34090033a 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-unavailable-ranges.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-unavailable-ranges.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any ranges become unavailable (`ranges_unavailable > 0`), meaning there are insufficient live replicas to form a quorum. Unavailable ranges cannot be read or written \u2014 this is a P1 condition.", "name": "[CockroachDB OTel] Unavailable ranges detected", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-replication" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB unavailable ranges\n\n### What fired\n`ranges_unavailable` on at least one store exceeded 0 over the evaluation window.\n\n### Why it matters\nAn unavailable range has lost quorum. 
Reads and writes against keys in that range will block or fail until quorum is restored. This is the single most critical CockroachDB availability signal and should be treated as a P1 incident.\n\n### Triage\n1. Check `liveness_livenodes` against the expected cluster size \u2014 node loss is the most common cause.\n2. On the Replication & Ranges dashboard, inspect `ranges_underreplicated` and per-store replica distribution.\n3. Verify network health between nodes: `rpc_connection_unhealthy`, `rpc_connection_failures`, `clock_offset_meannanos`.\n4. Check for cascading resource issues that could be causing multiple nodes to heartbeat-fail simultaneously.\n\n### Remediation\n- Restore quorum by bringing failed nodes back online, or (if the node is lost) decommission it to trigger up-replication from surviving replicas.\n- Fix network partitions between DCs/AZs.\n- Page on-call immediately if unavailable ranges persist more than a few minutes.\n\n### Tuning\n- This rule fires on any unavailable range, intentionally. 
The 1-minute evaluation and 5-minute window balance responsiveness against transient rebalancing noise.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND ranges_unavailable IS NOT NULL\n AND ranges_unavailable > 0\n// Store-level metric: both node_id and store must be present\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS unavailable = MAX(ranges_unavailable) BY attributes.node_id, attributes.store, host.name\n| SORT unavailable DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-underreplicated-ranges.json b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-underreplicated-ranges.json index a6a026e1ec0..141a1faa034 100644 --- a/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-underreplicated-ranges.json +++ b/packages/cockroachdb_otel/kibana/alerting_rule_template/cockroachdb_otel-underreplicated-ranges.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any ranges have fewer replicas than the replication target (`ranges_underreplicated > 0`). 
The cluster is still serving traffic but has lost redundancy \u2014 further failures could cause unavailability.", "name": "[CockroachDB OTel] Under-replicated ranges detected", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "cockroachdb_otel-replication" + }, + { + "id": "cockroachdb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## CockroachDB under-replicated ranges\n\n### What fired\n`ranges_underreplicated` on at least one store exceeded 0 over the evaluation window.\n\n### Why it matters\nAn under-replicated range is still serving traffic but has fewer replicas than the replication factor specifies. The cluster has lost its safety margin: one more replica failure in the affected range(s) can cause unavailability.\n\n### Triage\n1. Check `liveness_livenodes`. Are any nodes down or decommissioning?\n2. Inspect `ranges_underreplicated` by `attributes.node_id` / `attributes.store` to see whether the issue is concentrated or spread.\n3. Look at `rebalancing_range_rebalances` \u2014 if up-replication is in flight, the metric will drain on its own.\n4. Check available capacity on surviving stores (`capacity_used / capacity`); full stores cannot host new replicas.\n\n### Remediation\n- If a node is down, bring it back or replace it.\n- If capacity is full, add nodes; CockroachDB will up-replicate automatically once capacity exists.\n- Check zone configs haven't accidentally been set to a replication factor exceeding the number of failure domains.\n\n### Tuning\n- The 10-minute window filters brief rebalancing dips. 
Extend to 15m+ if your cluster frequently rebalances.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-cockroachdb.otel-default*\n| WHERE service.name == \"cockroachdb\"\n AND ranges_underreplicated IS NOT NULL\n AND ranges_underreplicated > 0\n// Store-level metric\n| WHERE attributes.node_id IS NOT NULL AND attributes.store IS NOT NULL\n| STATS underreplicated = MAX(ranges_underreplicated) BY attributes.node_id, attributes.store, host.name\n| SORT underreplicated DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/cockroachdb_otel/manifest.yml b/packages/cockroachdb_otel/manifest.yml index 147b56f5bb7..dab77ed6a8d 100644 --- a/packages/cockroachdb_otel/manifest.yml +++ b/packages/cockroachdb_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: cockroachdb_otel title: "CockroachDB OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "CockroachDB Assets from OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/couchdb_otel/changelog.yml b/packages/couchdb_otel/changelog.yml index 4f894a761b0..6badf214a56 100644 --- a/packages/couchdb_otel/changelog.yml +++ b/packages/couchdb_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-conflict-storm.json b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-conflict-storm.json index 81ec0d157e1..97b5b3fa967 100644 --- a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-conflict-storm.json +++ b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-conflict-storm.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when CouchDB returns an elevated rate of 409 (Conflict) responses, indicating heavy write contention on the same documents. Sustained conflict storms suggest application-level retry loops or concurrent writers competing for the same document IDs.", "name": "[CouchDB OTel] Conflict storm (elevated 409 responses)", "ruleTypeId": ".es-query", "tags": [ @@ -16,10 +17,17 @@ "active": 2 }, "flapping": { - "enabled": true, "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "investigation_guide": { + "blob": "## Conflict Storm (Elevated 409 Responses)\n\n### Triage Steps\n1. Identify which CouchDB node is producing the 409s — open the CouchDB overview dashboard and filter by the alerting node.\n2. Correlate with HTTP request rate (PUT/POST in particular) — conflicts grow with concurrent write volume on the same documents.\n3. Check application logs for the documents being updated — are multiple clients writing to the same `_id`?\n4. 
Look for retry storms: clients that retry on 409 without backoff can amplify the problem.\n\n### Remediation\n- Implement exponential backoff and jitter in client retry logic.\n- Use CouchDB's `_bulk_docs` endpoint and handle the per-document conflict entries in its response (note: the `all_or_nothing` option was removed in CouchDB 2.0).\n- Redesign the data model to avoid hot documents — split frequently-updated state across multiple documents.\n- For counter-style updates, consider using a separate document per increment and aggregating on read.\n\n### Threshold Tuning\nThe default of 10 conflicts/sec is a moderate threshold. Some workloads (e.g. CRDT-style replication) may legitimately produce more conflicts; raise the threshold accordingly. For low-write workloads, lower the threshold to detect smaller anomalies." + }, + "dashboards": [ + {"id": "couchdb_otel-overview"} + ] + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-file-descriptor-exhaustion.json b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-file-descriptor-exhaustion.json index ad0b18d65bb..ad8e0fef4f5 100644 --- a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-file-descriptor-exhaustion.json +++ b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-file-descriptor-exhaustion.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the number of open file descriptors on a CouchDB node approaches the OS limit.
Each open database file and network connection consumes a file descriptor; exhausting the limit causes new connections and database opens to fail.", "name": "[CouchDB OTel] File descriptor exhaustion risk", "ruleTypeId": ".es-query", "tags": [ @@ -16,10 +17,17 @@ "active": 2 }, "flapping": { - "enabled": true, "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "investigation_guide": { + "blob": "## File Descriptor Exhaustion Risk\n\n### Triage Steps\n1. Check the open file descriptor trend on the CouchDB overview dashboard for the alerting node.\n2. Cross-reference with `couchdb.database.open` — runaway database counts proportionally consume FDs.\n3. Verify the OS-level `ulimit -n` setting on the CouchDB host: `cat /proc/{pid}/limits` for the `beam.smp` process.\n4. Inspect active CouchDB connections — long-lived `_changes` feeds, replication tasks, and HTTP keep-alives all consume FDs.\n\n### Remediation\n- Raise the OS file descriptor limit (`ulimit -n`) and the systemd `LimitNOFILE` for the CouchDB service unit.\n- Close idle replication tasks and unused `_changes` listeners.\n- Identify and consolidate per-user or per-tenant databases if the open-database count is also high.\n- Restart CouchDB during a maintenance window to release leaked descriptors if no application-side fix is possible.\n\n### Threshold Tuning\nThe default threshold of 1000 open FDs assumes a typical `ulimit -n` of 1024–65536. Set the threshold to ~70-80% of your configured limit to give time to react before exhaustion." 
+ }, + "dashboards": [ + {"id": "couchdb_otel-overview"} + ] + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-5xx-rate.json b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-5xx-rate.json index f0220a16e55..41155609016 100644 --- a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-5xx-rate.json +++ b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-5xx-rate.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the proportion of 5xx server error responses (500, 501, 503) exceeds a configurable percentage of total HTTP responses. Sustained 5xx rates indicate the CouchDB server is failing or is overloaded and unable to serve requests.", "name": "[CouchDB OTel] High 5xx server error rate", "ruleTypeId": ".es-query", "tags": [ @@ -16,10 +17,17 @@ "active": 2 }, "flapping": { - "enabled": true, "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "investigation_guide": { + "blob": "## High 5xx Server Error Rate\n\n### Triage Steps\n1. Open the CouchDB overview dashboard and identify which status codes dominate (500 internal error vs 503 unavailable).\n2. Check `couchdb.average_request_time` — rising latency often precedes 5xx as the server becomes overloaded.\n3. Review CouchDB logs (`/var/log/couchdb/couchdb.log`) for stack traces and error messages from the time window.\n4. Verify host resources: CPU, memory, disk I/O, and disk space. CouchDB returns 500 when it cannot write to disk.\n5. Check Erlang VM health (BEAM process count, scheduler utilisation) — process exhaustion causes cascading 5xx.\n\n### Remediation\n- For 503: scale out additional CouchDB nodes or reduce client request rate.\n- For 500: address the underlying cause from logs (e.g. 
corrupt database file, out of disk, Erlang process limit).\n- Restart the affected node if the issue is transient and persists after immediate causes are ruled out.\n- For sustained issues, raise `max_dbs_open` and Erlang process limits in `local.ini`.\n\n### Threshold Tuning\nThe default of 5% 5xx rate is a balanced threshold. For mission-critical workloads, consider lowering to 1%. The query also requires a minimum of 0.01 responses/sec to suppress noise on idle nodes — raise this if you want to focus only on production-traffic nodes." + }, + "dashboards": [ + {"id": "couchdb_otel-overview"} + ] + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-average-request-time.json b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-average-request-time.json index b4c9fabe309..7cbd7d0023c 100644 --- a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-average-request-time.json +++ b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-average-request-time.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the average HTTP request time on a CouchDB node exceeds a configurable latency threshold. Rising request times indicate disk I/O saturation, lock contention, view rebuilds, or general resource pressure.", "name": "[CouchDB OTel] High average request time (latency)", "ruleTypeId": ".es-query", "tags": [ @@ -16,10 +17,17 @@ "active": 2 }, "flapping": { - "enabled": true, "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "investigation_guide": { + "blob": "## High Average Request Time (Latency)\n\n### Triage Steps\n1. Open the CouchDB overview dashboard and identify the node and time range where latency spiked.\n2. Correlate with HTTP request rate — was there a traffic surge, or did latency climb at constant load?\n3. 
Check `couchdb.database.operations` (reads vs writes) and `couchdb.httpd.bulk_requests` for workload composition shifts.\n4. Review host metrics: disk I/O wait, CPU saturation, and memory pressure.\n5. Check for view rebuilds — design document changes trigger costly background indexing that can stall queries.\n\n### Remediation\n- If view rebuilds are the cause, schedule design document changes during off-peak hours and pre-warm views with `?update=lazy` (the deprecated `?stale=update_after` on CouchDB 1.x).\n- For disk I/O saturation, move data files to faster storage or reduce write rate via batching.\n- Tune `[httpd] max_connections` and `[chttpd] backlog` to prevent request queueing under load.\n- Consider compacting databases (`POST /{db}/_compact`) if file sizes are bloated.\n\n### Threshold Tuning\nThe default 1000ms threshold is a generic starting point. Set it to align with your CouchDB SLO — typical OLTP workloads target <100-500ms. The rule uses peak (MAX) latency in the window, so brief spikes are detected; switch to averaging if you want to filter out transient outliers." + }, + "dashboards": [ + {"id": "couchdb_otel-overview"} + ] + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-error-rate.json b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-error-rate.json index bc1ff592e58..2578f51afe0 100644 --- a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-error-rate.json +++ b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-high-error-rate.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the combined rate of 4xx (client) and 5xx (server) HTTP error responses exceeds a configurable percentage of total responses.
Useful for tracking overall error budget consumption across both client- and server-side problems.", "name": "[CouchDB OTel] High error rate (4xx and 5xx)", "ruleTypeId": ".es-query", "tags": [ @@ -16,10 +17,17 @@ "active": 2 }, "flapping": { - "enabled": true, "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "investigation_guide": { + "blob": "## High Overall Error Rate (4xx and 5xx)\n\n### Triage Steps\n1. Open the CouchDB overview dashboard and break down responses by status code to see whether 4xx, 5xx, or both are driving the alert.\n2. For 4xx-dominated alerts, focus on client-side issues:\n - 401/403: authentication or authorization problems — check recent credential or `_security` changes.\n - 404: missing documents or databases — could indicate a deploy that referenced renamed/deleted resources.\n - 409: write conflicts — see the dedicated conflict storm rule.\n3. For 5xx-dominated alerts, see the dedicated 5xx-rate alert investigation guide.\n4. Identify whether errors come from a specific node or are cluster-wide.\n\n### Remediation\n- Address the dominant status code via the relevant playbook (4xx is typically a client/application issue; 5xx is server-side).\n- For mixed errors, coordinate with the application team to triage in parallel.\n- Investigate any recent deployments, configuration changes, or replication topology changes that correlate with the onset.\n\n### Threshold Tuning\nThe default 10% combined error rate is appropriate for general observability. Tighten the threshold for production-critical workloads, or split into separate 4xx and 5xx alerts if you want different sensitivity per error class." 
+ }, + "dashboards": [ + {"id": "couchdb_otel-overview"} + ] + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-open-databases-high.json b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-open-databases-high.json index dbb075b0098..4ff8642b609 100644 --- a/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-open-databases-high.json +++ b/packages/couchdb_otel/kibana/alerting_rule_template/couchdb_otel-open-databases-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the number of open databases on a CouchDB node grows beyond a configurable threshold. Unbounded growth typically indicates database proliferation (e.g. per-user databases not being closed) or a leak in the application data model.", "name": "[CouchDB OTel] Open databases count high", "ruleTypeId": ".es-query", "tags": [ @@ -16,10 +17,17 @@ "active": 2 }, "flapping": { - "enabled": true, "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "investigation_guide": { + "blob": "## Open Databases Count High\n\n### Triage Steps\n1. Check the open databases trend on the CouchDB overview dashboard — is the count growing linearly or step-wise?\n2. Cross-reference with `couchdb.file_descriptor.open` — open databases consume file descriptors and may trigger the FD exhaustion alert.\n3. Query `/_all_dbs` to enumerate databases and look for naming patterns indicating per-user, per-tenant, or per-session databases.\n4. 
Check the `[couchdb] max_dbs_open` setting in `local.ini` against the current count.\n\n### Remediation\n- If per-user databases are growing unbounded, implement application-side cleanup of inactive accounts.\n- Consolidate small databases into a single database with a discriminator field where the data model allows.\n- Raise `[couchdb] max_dbs_open` if the growth is legitimate, ensuring the OS file descriptor limit is also raised correspondingly.\n- Use `PUT /{db}/_revs_limit` to reduce per-database overhead if many databases are kept for historical access only.\n\n### Threshold Tuning\nThe default of 100 open databases is conservative. Multi-tenant deployments often legitimately maintain thousands; set the threshold to ~70-80% of `max_dbs_open` so you have time to react before CouchDB starts refusing to open additional databases." + }, + "dashboards": [ + {"id": "couchdb_otel-overview"} + ] + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/couchdb_otel/manifest.yml b/packages/couchdb_otel/manifest.yml index cd37d81b2b6..88b33ea6416 100644 --- a/packages/couchdb_otel/manifest.yml +++ b/packages/couchdb_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: couchdb_otel title: "CouchDB OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "CouchDB Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/envoyproxy_otel/changelog.yml b/packages/envoyproxy_otel/changelog.yml index 93da7c32bc2..36cdb6912ef 100644 --- a/packages/envoyproxy_otel/changelog.yml +++ b/packages/envoyproxy_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: First release of the Envoyproxy OpenTelemetry content package diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-certificate-expiring-soon.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-certificate-expiring-soon.json index 4535e6eb44d..6b7926fe3a7 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-certificate-expiring-soon.json +++ b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-certificate-expiring-soon.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `envoy.server.days_until_first_cert_expiring` falls below 7 days. Expired TLS certificates cause handshake failures on downstream and upstream connections.", "name": "[Envoy OTel] Certificate expiring soon", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 3, "statusChangeThreshold": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy certificate expiring soon\n\n### What fired\n`envoy.server.days_until_first_cert_expiring` dropped below 7 days in the evaluation window.\n\n### Why it matters\nEnvoy serves TLS on the downstream and may also present client certs upstream. An expired cert breaks every secured connection, causing immediate availability impact.\n\n### Triage\n1. Identify which certificate(s) are expiring on the affected Envoy instance via the admin interface `/certs` endpoint.\n2. Check the certificate issuance pipeline (cert-manager, Vault, ACME client) for stuck renewals.\n3. 
Verify downstream clients can still connect if renewal is already in flight.\n\n### Remediation\n- Force-renew the certificate via the issuance pipeline.\n- Push a new Envoy secret via SDS / hot-restart if not auto-refreshed.\n- Adopt automated rotation if this is a recurring event.\n\n### Tuning\n- 7 days is a late threshold. For production, pair this rule with a 30-day warning rule.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.server.days_until_first_cert_expiring IS NOT NULL\n// Days until first TLS cert expires — alert when below threshold\n| STATS min_days = MIN(envoy.server.days_until_first_cert_expiring)\n// Adjust threshold (7 days) based on your cert renewal lead time\n| WHERE min_days < 7" + "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.server.days_until_first_cert_expiring IS NOT NULL\n// Days until first TLS cert expires \u2014 alert when below threshold\n| STATS min_days = MIN(envoy.server.days_until_first_cert_expiring)\n// Adjust threshold (7 days) based on your cert renewal lead time\n| WHERE min_days < 7" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 1, diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-downstream-error-rate.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-downstream-error-rate.json index 945be9c8928..89ea3f09a9a 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-downstream-error-rate.json +++ b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-downstream-error-rate.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the Envoy downstream error rate 
across HTTP stat_prefixes exceeds 5% over 15 minutes. Errors at the edge indicate client-visible failures.", "name": "[Envoy OTel] High downstream error rate", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-downstream" + }, + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy high downstream error rate\n\n### What fired\n`(downstream_rq_completed - downstream_rq_2xx) / downstream_rq_completed > 5%` across HTTP stat_prefixes.\n\n### Why it matters\nDownstream errors include 4xx and 5xx seen by clients. Sustained elevated rates degrade user experience and are almost always caused by upstream application issues or Envoy config problems.\n\n### Triage\n1. Split errors by stat_prefix (ingress_http vs admin) to localise the incident.\n2. On the Downstream dashboard, correlate with upstream cluster health and 5xx ratios.\n3. 
Inspect `envoy.cluster_manager.warming_clusters` and config state.\n\n### Remediation\n- Roll back the latest application or config change.\n- Add retries / circuit breaking on the misbehaving upstream.\n\n### Tuning\n- 5% over 15 minutes; tighten to 1% for SLA-critical tiers.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n// Aggregate across HTTP stat_prefixes (ingress_http, admin) — add more COALESCE terms as needed\n| EVAL total_rq = COALESCE(envoy.http.ingress_http.downstream_rq_completed, 0::long)\n + COALESCE(envoy.http.admin.downstream_rq_completed, 0::long)\n| EVAL success_rq = COALESCE(envoy.http.ingress_http.downstream_rq_2xx, 0::long)\n + COALESCE(envoy.http.admin.downstream_rq_2xx, 0::long)\n| WHERE total_rq > 50\n| EVAL error_rate = TO_DOUBLE(total_rq - success_rq) / TO_DOUBLE(total_rq) * 100.0\n| STATS max_error_rate = MAX(error_rate)\n// Healthy: < 1%; Unhealthy: > 5%. Adjust threshold for your SLO.\n| WHERE max_error_rate > 5.0" + "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n// Aggregate across HTTP stat_prefixes (ingress_http, admin) \u2014 add more COALESCE terms as needed\n| EVAL total_rq = COALESCE(envoy.http.ingress_http.downstream_rq_completed, 0::long)\n + COALESCE(envoy.http.admin.downstream_rq_completed, 0::long)\n| EVAL success_rq = COALESCE(envoy.http.ingress_http.downstream_rq_2xx, 0::long)\n + COALESCE(envoy.http.admin.downstream_rq_2xx, 0::long)\n| WHERE total_rq > 50\n| EVAL error_rate = TO_DOUBLE(total_rq - success_rq) / TO_DOUBLE(total_rq) * 100.0\n| STATS max_error_rate = MAX(error_rate)\n// Healthy: < 1%; Unhealthy: > 5%. 
Adjust threshold for your SLO.\n| WHERE max_error_rate > 5.0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-memory-pressure.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-memory-pressure.json index 3de819f8ae6..d39f4ad21a4 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-memory-pressure.json +++ b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-high-memory-pressure.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `envoy.server.memory_allocated / memory_heap_size > 0.9`. Sustained high ratios indicate memory pressure or a leak and risk OOM kills.", "name": "[Envoy OTel] High memory pressure", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy high memory pressure\n\n### What fired\n`memory_allocated / memory_heap_size` exceeded 0.9 over the evaluation window.\n\n### Why it matters\nEnvoy is C++ and uses tcmalloc-style allocators. High allocated/heap ratios usually mean too many active connections, huge header sets, or a genuine leak. OOM kills the process entirely.\n\n### Triage\n1. Check concurrent connection count and typical response size.\n2. Look at `envoy.server.live` / restart counters to see if the process has been flapping.\n3. 
Compare with recent traffic growth or config pushes.\n\n### Remediation\n- Increase pod memory limits.\n- Tune `overload_manager` to trigger backpressure before OOM.\n- Collect a heap profile if a leak is suspected.\n\n### Tuning\n- 0.9 threshold balances noise and severity; lower to 0.85 for tighter environments.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.server.memory_allocated IS NOT NULL\n AND envoy.server.memory_heap_size IS NOT NULL\n AND envoy.server.memory_heap_size > 0\n// Ratio of allocated to heap — sustained high values indicate pressure or leak\n| EVAL mem_ratio = envoy.server.memory_allocated / envoy.server.memory_heap_size\n| STATS max_ratio = MAX(mem_ratio)\n// Adjust threshold (0.9 = 90%) based on your memory headroom expectations\n| WHERE max_ratio > 0.9" + "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.server.memory_allocated IS NOT NULL\n AND envoy.server.memory_heap_size IS NOT NULL\n AND envoy.server.memory_heap_size > 0\n// Ratio of allocated to heap \u2014 sustained high values indicate pressure or leak\n| EVAL mem_ratio = envoy.server.memory_allocated / envoy.server.memory_heap_size\n| STATS max_ratio = MAX(mem_ratio)\n// Adjust threshold (0.9 = 90%) based on your memory headroom expectations\n| WHERE max_ratio > 0.9" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-listeners-warming.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-listeners-warming.json index 5f9ce69ebe0..c290b412f11 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-listeners-warming.json +++ 
b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-listeners-warming.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `envoy.listener_manager.total_listeners_warming > 0`. Listeners stuck in the warming state cannot accept traffic.", "name": "[Envoy OTel] Listeners warming", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy listeners warming\n\n### What fired\n`envoy.listener_manager.total_listeners_warming` remained above 0 during the window.\n\n### Why it matters\nListeners warm when Envoy is waiting for secrets or initial discovery responses. Listeners stuck warming are effectively unreachable.\n\n### Triage\n1. Check the admin `/config_dump` or `/listeners` to see which listener is stuck.\n2. Inspect xDS / SDS connectivity to the control plane.\n3. Look for secrets (TLS certs) that have not been delivered.\n\n### Remediation\n- Fix the upstream discovery or secret source.\n- Restart the pod as last resort.\n\n### Tuning\n- Warming should be milliseconds. 
Any sustained value warrants investigation.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.listener_manager.total_listeners_warming IS NOT NULL\n// Listeners warming should drop to 0 after config load\n| STATS max_warming = MAX(envoy.listener_manager.total_listeners_warming)\n| WHERE max_warming > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-not-live.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-not-live.json index 4461d874a23..96f423d41d9 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-not-live.json +++ b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-not-live.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `envoy.server.live` drops below 1 \u2014 the admin `/ready` endpoint reports the proxy as not live.", "name": "[Envoy OTel] Server not live", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 5, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy server not live\n\n### What fired\n`envoy.server.live` was below 1 during the window.\n\n### Why it matters\nAn Envoy that is not live is either in initialization, draining, or has entered the overload/failstate. Clients will get connection failures.\n\n### Triage\n1. Check the admin `/server_info` and `/ready` endpoints.\n2. Cross-reference with `envoy.server.state` to see the exact lifecycle stage.\n3. 
Review Envoy logs for overload manager trips or init failures.\n\n### Remediation\n- Restore control plane / config source so init can complete.\n- Cancel drains if they were unintentional.\n\n### Tuning\n- Fires on any non-live sample; consider an alertDelay if restarts are frequent in your environment.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.server.live IS NOT NULL\n// Server live is 1 when healthy, 0 when not\n| STATS max_live = MAX(envoy.server.live)\n| WHERE max_live < 1" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-state-not-live.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-state-not-live.json index 0eaff368452..94bccea8485 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-state-not-live.json +++ b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-server-state-not-live.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `envoy.server.state != 0` (i.e. not LIVE). State 1/2/3 are DRAINING, PRE_INITIALIZING, INITIALIZING.", "name": "[Envoy OTel] Server state not LIVE", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 5, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy server state not LIVE\n\n### What fired\n`envoy.server.state` was non-zero during the window (0 = LIVE).\n\n### Why it matters\nAny non-LIVE state means the proxy cannot serve traffic at full capacity. 
Draining is expected during rollouts; other states are usually errors.\n\n### Triage\n1. Determine the exact state: DRAINING (1) is benign during rolling deploys; PRE_INITIALIZING (2) / INITIALIZING (3) indicate startup stalls.\n2. Check xDS/SDS connectivity if state is 2 or 3.\n3. Look at cluster rollout events for state 1.\n\n### Remediation\n- Let rolling deploys finish; if state 2/3 persists, restart with corrected bootstrap config.\n\n### Tuning\n- Use alongside the `server-not-live` rule; this one adds state context.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.server.state IS NOT NULL\n// state: 0=LIVE, 1=DRAINING, 2=PRE_INITIALIZING, 3=INITIALIZING\n| STATS max_state = MAX(envoy.server.state)\n| WHERE max_state != 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-warming-clusters-stuck.json b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-warming-clusters-stuck.json index 04e92ddf723..3a19bcc81a0 100644 --- a/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-warming-clusters-stuck.json +++ b/packages/envoyproxy_otel/kibana/alerting_rule_template/envoyproxy_otel-warming-clusters-stuck.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `envoy.cluster_manager.warming_clusters > 0` persists. 
Clusters stuck warming cannot route requests.", "name": "[Envoy OTel] Warming clusters stuck", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "envoyproxy_otel-envoy-upstream-proxy" + }, + { + "id": "envoyproxy_otel-envoy-overview" + } + ], + "investigation_guide": { + "blob": "## Envoy warming clusters stuck\n\n### What fired\n`envoy.cluster_manager.warming_clusters` remained above 0 during the window.\n\n### Why it matters\nClusters warm when Envoy is awaiting DNS/EDS responses for their endpoints. Stuck warming means endpoints cannot route, effectively taking the upstream offline.\n\n### Triage\n1. Inspect admin `/clusters` for the stuck cluster name.\n2. Check DNS resolution / EDS connectivity from the Envoy pod.\n3. Look for recent CDS/EDS push failures.\n\n### Remediation\n- Fix DNS / control-plane connectivity.\n- Push a correct cluster config.\n\n### Tuning\n- Fires on any sustained value > 0.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-statsdreceiver-default\n| WHERE data_stream.dataset == \"statsdreceiver\"\n| WHERE envoy.cluster_manager.warming_clusters IS NOT NULL\n// Warming clusters should drop to 0 after config load; sustained > 0 indicates config issues\n| STATS max_warming = MAX(envoy.cluster_manager.warming_clusters)\n| WHERE max_warming > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/envoyproxy_otel/manifest.yml b/packages/envoyproxy_otel/manifest.yml index b47bfd535d9..23cdf8893ac 100644 --- a/packages/envoyproxy_otel/manifest.yml +++ b/packages/envoyproxy_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: envoyproxy_otel title: "Envoyproxy OpenTelemetry assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Envoyproxy Assets for OpenTelemetry 
Collector" @@ -11,7 +11,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/etcd_otel/changelog.yml b/packages/etcd_otel/changelog.yml index 45b3bd92d19..0201004bc4a 100644 --- a/packages/etcd_otel/changelog.yml +++ b/packages/etcd_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-db-size-quota.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-db-size-quota.json index 3d42438fa6a..fe4f6d28c03 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-db-size-quota.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-db-size-quota.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `etcd_mvcc_db_total_size_in_bytes / etcd_server_quota_backend_bytes > 0.9`. Hitting quota puts etcd into alarm mode and rejects writes.", "name": "[etcd OTel] Database size approaching quota", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-storage" + }, + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd database size approaching quota\n\n### What fired\nDB size exceeded 90% of the configured backend quota.\n\n### Why it matters\nWhen etcd hits its quota it raises NOSPACE and stops accepting writes until the operator clears the alarm. Kubernetes and other clients will stall.\n\n### Triage\n1. Confirm the ratio on the Storage dashboard.\n2. 
Check defragmentation status and compaction schedule.\n3. Inspect key counts per prefix (if known) to spot runaway data.\n\n### Remediation\n- Run `etcdctl defrag` on each member.\n- Issue `etcdctl alarm disarm` once space is reclaimed.\n- Raise `--quota-backend-bytes` if workload has grown legitimately.\n\n### Tuning\n- 0.9 is a late threshold \u2014 consider a 0.7 warning rule as well.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-etcd.otel-default*\n// DB size vs quota: when ratio reaches 1, etcd enters alarm mode and rejects writes\n| WHERE etcd_mvcc_db_total_size_in_bytes IS NOT NULL\n AND etcd_server_quota_backend_bytes IS NOT NULL\n AND etcd_server_quota_backend_bytes > 0\n| EVAL usage_ratio = etcd_mvcc_db_total_size_in_bytes * 1.0 / etcd_server_quota_backend_bytes\n// Alert when usage exceeds 90%; adjust threshold for earlier warning\n| WHERE usage_ratio > 0.9\n| STATS usage_ratio = MAX(usage_ratio),\n db_bytes = MAX(etcd_mvcc_db_total_size_in_bytes),\n quota_bytes = MAX(etcd_server_quota_backend_bytes)\n BY resource.attributes.service.instance.id\n| SORT usage_ratio DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-file-descriptor-exhaustion.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-file-descriptor-exhaustion.json index 1e16539ae40..741a6035d35 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-file-descriptor-exhaustion.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-file-descriptor-exhaustion.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `process_open_fds / process_max_fds > 0.9`. 
FD exhaustion rejects new client and peer connections.", "name": "[etcd OTel] File descriptor exhaustion", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd file descriptor exhaustion\n\n### What fired\nOpen FDs exceeded 90% of the process limit over the window.\n\n### Why it matters\netcd uses an FD per client and peer connection plus storage files. Running out halts new connections and may crash the process.\n\n### Triage\n1. Check client connection count and watch streams.\n2. Inspect `ulimit -n` / systemd `LimitNOFILE` for the process.\n3. Look for leaked connections from misbehaving clients.\n\n### Remediation\n- Raise the FD limit and restart the member.\n- Fix the client leaking connections.\n\n### Tuning\n- 0.9 is conservative; warn at 0.8.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-etcd.otel-default*\n// Gauge: open FDs vs max; exhaustion causes connection failures\n| WHERE process_open_fds IS NOT NULL\n AND process_max_fds IS NOT NULL\n AND process_max_fds > 0\n| EVAL fd_ratio = process_open_fds * 1.0 / process_max_fds\n// Alert when usage exceeds 90%; adjust for earlier warning\n| WHERE fd_ratio > 0.9\n| STATS fd_ratio = MAX(fd_ratio),\n open_fds = MAX(process_open_fds),\n max_fds = MAX(process_max_fds)\n BY resource.attributes.service.instance.id\n| SORT fd_ratio DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-grpc-errors.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-grpc-errors.json index 91686c70f9f..26c8997dbcc 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-grpc-errors.json +++ 
b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-grpc-errors.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any gRPC server handlers complete with a non-OK status code. Persistent non-OK codes signal client, auth, or overload issues.", "name": "[etcd OTel] gRPC errors", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-grpc" + }, + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd gRPC errors\n\n### What fired\n`grpc_server_handled_total` with `grpc_code != OK` increased during the window.\n\n### Why it matters\nNon-OK gRPC codes (DeadlineExceeded, Unavailable, PermissionDenied, ResourceExhausted) indicate clients cannot complete their calls. Varies from noise (benign retries) to outage (gRPC server overload).\n\n### Triage\n1. Break down errors by gRPC code to diagnose.\n2. Correlate with `etcd_server_proposals_failed_total`, slow applies, and leader changes.\n3. Check network latency between clients and etcd.\n\n### Remediation\n- Address overload by scaling etcd peers or tuning client patterns.\n- Fix auth problems if PermissionDenied dominates.\n\n### Tuning\n- Fires on any increase. 
For noisy environments filter out expected Unavailable counts or raise threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-etcd.otel-default*\n// Counter with attributes.grpc_code: non-OK codes indicate client/auth/overload issues\n| WHERE grpc_server_handled_total IS NOT NULL\n AND attributes.grpc_code != \"OK\"\n| STATS errors = SUM(INCREASE(grpc_server_handled_total))\n BY resource.attributes.service.instance.id\n| WHERE errors > 0\n| SORT errors DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-heartbeat-failures.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-heartbeat-failures.json index c6a32b025b2..fa4d5f36d33 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-heartbeat-failures.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-heartbeat-failures.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the leader fails to send heartbeats to followers (`etcd_server_heartbeat_send_failures_total` increases). A precursor to leader elections.", "name": "[etcd OTel] Heartbeat send failures", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-raft-consensus" + }, + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd heartbeat send failures\n\n### What fired\n`etcd_server_heartbeat_send_failures_total` increased during the window.\n\n### Why it matters\nHeartbeats keep followers in sync with the leader. Failures indicate slow disk, overloaded leader, or network issues between peers, and often precede leader elections.\n\n### Triage\n1. 
Check disk I/O latency on the leader (fsync is on the critical path).\n2. Look at `rpc_connection_avg_round_trip_latency` or equivalent between peers.\n3. Check CPU on the leader.\n\n### Remediation\n- Move etcd to dedicated SSDs.\n- Spread peers across low-latency links.\n- Tune `--heartbeat-interval` only as last resort.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-etcd.otel-default*\n// Counter: leader failing to send heartbeats to followers; precursor to leader elections\n| WHERE etcd_server_heartbeat_send_failures_total IS NOT NULL\n| STATS increase = SUM(INCREASE(etcd_server_heartbeat_send_failures_total))\n BY resource.attributes.service.instance.id\n| WHERE increase > 0\n| SORT increase DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-leader-changes.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-leader-changes.json index f3aeca2c6b5..0e71b6c7cb8 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-leader-changes.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-leader-changes.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when more than 1 leader change occurs in 15 minutes (`etcd_server_leader_changes_seen_total`). 
Frequent elections point to disk, network, or resource issues.", "name": "[etcd OTel] Frequent leader elections", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-raft-consensus" + }, + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd frequent leader elections\n\n### What fired\nCounter `etcd_server_leader_changes_seen_total` incremented more than once in 15 minutes.\n\n### Why it matters\nLeader elections pause all writes and rebuild Raft state. Frequent elections degrade throughput and indicate instability in the cluster.\n\n### Triage\n1. Correlate with heartbeat failures and slow applies.\n2. Inspect disk fsync latency on all peers.\n3. Check for network partitions or peer restarts.\n\n### Remediation\n- Stabilise disk I/O.\n- Keep peers in the same failure domain for low latency.\n\n### Tuning\n- `> 1 change per 15m` is the documented threshold. 
Lower for latency-sensitive tiers.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-etcd.otel-default*\n// Counter: leader changes observed; frequent elections suggest disk I/O, network, or resource issues\n| WHERE etcd_server_leader_changes_seen_total IS NOT NULL\n| STATS changes = SUM(INCREASE(etcd_server_leader_changes_seen_total))\n BY resource.attributes.service.instance.id\n// Alert when more than 1 leader change in window; adjust threshold for sensitivity\n| WHERE changes > 1\n| SORT changes DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-no-leader.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-no-leader.json index 99c44488431..c522d5dd22d 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-no-leader.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-no-leader.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `etcd_server_has_leader == 0`. The member cannot see a leader \u2014 quorum is lost or this member is partitioned.", "name": "[etcd OTel] No leader", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-raft-consensus" + }, + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd no leader\n\n### What fired\nA member reported `has_leader == 0` during the window.\n\n### Why it matters\nWithout a leader, no writes are possible from this member. If the whole cluster lacks a leader, Kubernetes and other dependents are down.\n\n### Triage\n1. Check if only one member is affected (partition) vs whole cluster (quorum loss).\n2. Inspect peer connectivity.\n3. 
Look at recent restarts, config changes, and disk health.\n\n### Remediation\n- Restore peer connectivity / bring missing peers online.\n- As last resort, run a disaster-recovery restore from snapshot.\n\n### Tuning\n- Always P1. Do not raise the threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-etcd.otel-default*\n// Member has no leader: network partition, quorum lost, or all members down\n| WHERE etcd_server_has_leader IS NOT NULL\n AND etcd_server_has_leader == 0\n// Group by instance for per-member alerting\n| STATS count = COUNT(*) BY resource.attributes.service.instance.id\n| SORT count DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-failed.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-failed.json index f6c7ecfbd96..7185f49b543 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-failed.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-failed.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `etcd_server_proposals_failed_total` increases. 
Failed Raft proposals indicate lost quorum or severe network issues.", "name": "[etcd OTel] Proposal failures", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-raft-consensus" + }, + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd proposal failures\n\n### What fired\nCounter `etcd_server_proposals_failed_total` increased during the window.\n\n### Why it matters\nProposals fail when the cluster cannot replicate the change \u2014 usually because quorum is unreachable or the leader stepped down mid-proposal. Client writes for failed proposals return errors.\n\n### Triage\n1. Check `has_leader` and leader change counters.\n2. Inspect peer connectivity and disk latency.\n\n### Remediation\n- Restore quorum.\n- Stabilise disk and network.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-etcd.otel-default*\n// Counter: failed Raft proposals indicate lost quorum or severe network issues\n| WHERE etcd_server_proposals_failed_total IS NOT NULL\n// INCREASE: any increase in the window indicates failures\n| STATS increase = SUM(INCREASE(etcd_server_proposals_failed_total))\n BY resource.attributes.service.instance.id\n| WHERE increase > 0\n| SORT increase DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-pending.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-pending.json index 3ea36b4b373..32fd6c5512b 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-pending.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-proposals-pending.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, 
"attributes": { + "description": "Alerts when more than 10 Raft proposals are pending commit (`etcd_server_proposals_pending > 10`). Backlog indicates leader overload or slow disk.", "name": "[etcd OTel] Pending proposals backlog", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-raft-consensus" + }, + { + "id": "etcd_otel-storage" + } + ], + "investigation_guide": { + "blob": "## etcd pending proposals backlog\n\n### What fired\n`etcd_server_proposals_pending > 10` on a member during the window.\n\n### Why it matters\nPending proposals accumulate when Raft cannot commit them quickly. Usually means slow fsync on the leader, overloaded peer, or excessive write throughput.\n\n### Triage\n1. Check leader CPU and disk latency.\n2. Look at incoming client write rate.\n3. Inspect slow-apply counter for correlated disk pressure.\n\n### Remediation\n- Move etcd to faster storage.\n- Throttle write-heavy clients.\n\n### Tuning\n- `> 10` is a moderate threshold. 
Tune to your write rate.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-etcd.otel-default*\n// Gauge: proposals waiting to be committed; backlog indicates leader overload or slow disk\n| WHERE etcd_server_proposals_pending IS NOT NULL\n AND etcd_server_proposals_pending > 10\n// Adjust threshold (10) based on expected write throughput\n| STATS pending = MAX(etcd_server_proposals_pending)\n BY resource.attributes.service.instance.id\n| SORT pending DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-slow-applies.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-slow-applies.json index 6dba24e7f33..d813c68ad60 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-slow-applies.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-slow-applies.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `etcd_server_slow_apply_total` increases. Raft applies exceeding the expected duration indicate disk or compaction contention.", "name": "[etcd OTel] Slow apply operations", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-storage" + }, + { + "id": "etcd_otel-raft-consensus" + } + ], + "investigation_guide": { + "blob": "## etcd slow apply operations\n\n### What fired\nCounter `etcd_server_slow_apply_total` incremented during the window.\n\n### Why it matters\netcd considers an apply 'slow' when it takes > 100 ms. Sustained slow applies indicate slow disk or compaction thrashing that will cascade into leader churn and client timeouts.\n\n### Triage\n1. Check disk fsync latency and IOPS.\n2. Inspect compaction backlog and DB size trend.\n3. 
Look for CPU contention.\n\n### Remediation\n- Upgrade storage.\n- Run defrag to reduce DB size.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-etcd.otel-default*\n// Counter: Raft applies that exceeded expected duration (~100ms); indicates slow disk or compaction contention\n| WHERE etcd_server_slow_apply_total IS NOT NULL\n| STATS increase = SUM(INCREASE(etcd_server_slow_apply_total))\n BY resource.attributes.service.instance.id\n| WHERE increase > 0\n| SORT increase DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-target-down.json b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-target-down.json index afb7f1b425b..e2ffe8ec876 100644 --- a/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-target-down.json +++ b/packages/etcd_otel/kibana/alerting_rule_template/etcd_otel-target-down.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the scrape target `up` metric is 0 \u2014 the collector cannot reach the etcd metrics endpoint.", "name": "[etcd OTel] Scrape target down", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "etcd_otel-overview" + } + ], + "investigation_guide": { + "blob": "## etcd scrape target down\n\n### What fired\n`up == 0` during the entire window.\n\n### Why it matters\nA down scrape target means either etcd has stopped, the metrics endpoint is unreachable, or the collector is misconfigured. Either way, observability of that member is lost.\n\n### Triage\n1. Confirm whether etcd itself is running or the metrics endpoint is blocked.\n2. Check collector logs for scrape failures.\n3. 
Verify network / firewall to the metrics port.\n\n### Remediation\n- Restore the metrics endpoint or the etcd process.\n\n### Tuning\n- Fires when any sample in the window reports `up == 0` (the query counts zero-valued samples; any count > 0 triggers).\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-etcd.otel-default*\n// Scrape health: up=0 when Prometheus/OTel collector cannot reach the etcd metrics endpoint\n| WHERE `up` IS NOT NULL\n AND `up` == 0\n| STATS count = COUNT(*) BY resource.attributes.service.instance.id\n| SORT count DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/etcd_otel/manifest.yml b/packages/etcd_otel/manifest.yml index 4645b6e443e..f1d42ab6c3a 100644 --- a/packages/etcd_otel/manifest.yml +++ b/packages/etcd_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: etcd_otel title: "etcd OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "etcd Assets from OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/haproxy_otel/changelog.yml b/packages/haproxy_otel/changelog.yml index a0d9b9aff64..853014cdba8 100644 --- a/packages/haproxy_otel/changelog.yml +++ b/packages/haproxy_otel/changelog.yml @@ -1,5 +1,10 @@ # newer versions go on top -- version: 0.1.0 +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template.
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 +- version: "0.1.0" changes: - description: Initial draft of the Haproxy OTel content package type: enhancement diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-backend-no-active-servers.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-backend-no-active-servers.json index 0f417ab203b..83f9991aea7 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-backend-no-active-servers.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-backend-no-active-servers.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a HAProxy backend has zero active servers (`haproxy.active < 1` on service_name=BACKEND). The backend cannot serve any traffic.", "name": "[HAProxy OTel] Backend has no active servers", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,22 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-server" + }, + { + "id": "haproxy_otel-overview" + } + ], + "investigation_guide": { + "blob": "## HAProxy backend has no active servers\n\n### What fired\n`haproxy.active` on a BACKEND row dropped to 0 during the window.\n\n### Why it matters\nA backend with no active servers rejects all requests routed to it. This is a complete outage for any frontend using that backend.\n\n### Triage\n1. Drill into Server dashboard to see individual server health checks and downtime.\n2. Check `haproxy.failed_checks` increments and backup server availability.\n3. 
Look at server-side causes (crashes, networking).\n\n### Remediation\n- Restore at least one healthy server in the backend.\n- Route traffic to a backup backend if configured.\n\n### Tuning\n- Always P1 \u2014 do not loosen.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-connection-errors-spike.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-connection-errors-spike.json index f89d60ba12a..a50b671e37f 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-connection-errors-spike.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-connection-errors-spike.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `haproxy.connections.errors` increases by more than 10 in 5 minutes. Connection errors indicate backend-reach problems.", "name": "[HAProxy OTel] Connection errors spike", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,22 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-server" + }, + { + "id": "haproxy_otel-overview" + } + ], + "investigation_guide": { + "blob": "## HAProxy connection errors spike\n\n### What fired\n`haproxy.connections.errors` increased by > 10 in the window.\n\n### Why it matters\nConnection errors mean HAProxy failed the TCP connect to a backend server. Usually indicates crashed servers, network problems, or health-check-flapping targets.\n\n### Triage\n1. Check health-check failures and per-server downtime.\n2. Inspect network between HAProxy and the affected backend.\n3. 
Look at retry/redispatch counters.\n\n### Remediation\n- Fix the unreachable server(s) or network path.\n- Remove dead targets from the backend.\n\n### Tuning\n- Threshold 10 errors in 5m; tune to your traffic baseline.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-health-check-failures.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-health-check-failures.json index bd92af69710..f6d7d091033 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-health-check-failures.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-health-check-failures.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any server-level `haproxy.failed_checks` increase. Failed health checks precede the server being marked DOWN.", "name": "[HAProxy OTel] Server health check failures", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-server" + }, + { + "id": "haproxy_otel-backend" + } + ], + "investigation_guide": { + "blob": "## HAProxy server health check failures\n\n### What fired\n`haproxy.failed_checks` on a server increased during the window.\n\n### Why it matters\nA health-check failure while UP means the check was slow or returned an error. Repeated failures will flip the server to DOWN, removing it from rotation.\n\n### Triage\n1. Check the server's application logs and CPU/memory.\n2. Inspect network latency to the server.\n3. 
Validate the health-check path is cheap and always available.\n\n### Remediation\n- Fix the server or health-check.\n- Temporarily increase health-check tolerance if it is overly sensitive.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-connection-time.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-connection-time.json index fd5e35dc8c4..fb1860cff3f 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-connection-time.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-connection-time.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the moving-average backend connect time exceeds 100 ms (`haproxy.connections.average_time > 100`). High connect times indicate backend or network pressure.", "name": "[HAProxy OTel] High backend connection time", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-server" + } + ], + "investigation_guide": { + "blob": "## HAProxy high backend connection time\n\n### What fired\n`haproxy.connections.average_time > 100 ms` over the window.\n\n### Why it matters\nThe connection time metric is the average over the last 1024 requests. Sustained high values indicate the backend server or the network path is slow to complete TCP connects.\n\n### Triage\n1. Correlate with queue time and response time.\n2. Check backend CPU and TCP accept queues.\n3. 
Inspect network latency.\n\n### Remediation\n- Add backend capacity.\n- Investigate network path issues.\n\n### Tuning\n- 100 ms is a generic default; tune to your environment's baseline.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-queue-time.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-queue-time.json index 80ddaf1de2a..e56915c0208 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-queue-time.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-queue-time.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `haproxy.requests.average_time > 10 ms`. Any non-zero queue time signals backend saturation.", "name": "[HAProxy OTel] High backend queue time", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-overview" + } + ], + "investigation_guide": { + "blob": "## HAProxy high backend queue time\n\n### What fired\n`haproxy.requests.average_time > 10 ms` during the window.\n\n### Why it matters\nQueue time is how long requests wait for a backend slot. Non-zero values mean all configured connections on at least one backend server are saturated.\n\n### Triage\n1. Check session utilisation on the affected backend.\n2. Look at server max-conn and active connection counts.\n3. 
Correlate with upstream server-side latency.\n\n### Remediation\n- Raise per-server max-conn.\n- Add servers to the backend.\n\n### Tuning\n- 10 ms is a sensitive default; raise for bursty workloads.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-response-time.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-response-time.json index 1998897fca0..c56e304a114 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-response-time.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-high-response-time.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `haproxy.responses.average_time > 500 ms`. Backend application latency directly degrades user experience.", "name": "[HAProxy OTel] High backend response time", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-server" + } + ], + "investigation_guide": { + "blob": "## HAProxy high backend response time\n\n### What fired\n`haproxy.responses.average_time > 500 ms` during the window.\n\n### Why it matters\nResponse time measures the interval from connection to first response byte \u2014 the actual application processing latency. Regressions here usually reflect application or dependency issues.\n\n### Triage\n1. Correlate with application metrics / traces on the backend.\n2. Check error and retry counters on the same backend.\n3. 
Inspect DB/cache latency from the backend.\n\n### Remediation\n- Fix the slow application or dependency.\n- Scale backends if the workload has grown.\n\n### Tuning\n- 500 ms is a generic threshold; tune to SLA.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-request-errors-spike.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-request-errors-spike.json index 357600aaa28..d4e8d0fa518 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-request-errors-spike.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-request-errors-spike.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when frontend `haproxy.requests.errors` increases by more than 10 in 5 minutes. Request errors are malformed or timed-out client requests.", "name": "[HAProxy OTel] Request errors spike", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-frontend" + }, + { + "id": "haproxy_otel-overview" + } + ], + "investigation_guide": { + "blob": "## HAProxy request errors spike\n\n### What fired\nCounter `haproxy.requests.errors` increased by > 10 during the window.\n\n### Why it matters\nRequest errors include bad HTTP framing, too-long headers, and client timeouts. A sudden spike usually reflects a faulty client, bot traffic, or an MTU/proxy misconfiguration.\n\n### Triage\n1. Inspect HAProxy access logs for the dominant error pattern.\n2. 
Correlate with source IPs if possible.\n\n### Remediation\n- Block abusive clients at the edge.\n- Fix misbehaving upstream MTU / proxy.\n\n### Tuning\n- 10 errors/5m is a generic threshold; raise for busy edges.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-requests-queued.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-requests-queued.json index 9466baee136..8a76750cfbd 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-requests-queued.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-requests-queued.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `haproxy.requests.queued > 0` sustained. Any ongoing queue depth signals backend saturation.", "name": "[HAProxy OTel] Requests queued at backend", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-overview" + } + ], + "investigation_guide": { + "blob": "## HAProxy requests queued at backend\n\n### What fired\n`haproxy.requests.queued > 0` over the window.\n\n### Why it matters\nQueued requests wait for a free backend server slot. Sustained queueing causes cascading latency and eventual timeouts.\n\n### Triage\n1. Identify which backend is queueing and why.\n2. Inspect per-server max-conn saturation.\n3. 
Look at response time for the backend.\n\n### Remediation\n- Raise per-server capacity.\n- Add servers.\n\n### Tuning\n- Fires on any sustained queue.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-response-errors-spike.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-response-errors-spike.json index fb06f79a28f..48bc917e375 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-response-errors-spike.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-response-errors-spike.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when backend `haproxy.responses.errors` increases by more than 10 in 5 minutes. Response errors include server aborts and delivery failures.", "name": "[HAProxy OTel] Response errors spike", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-server" + } + ], + "investigation_guide": { + "blob": "## HAProxy response errors spike\n\n### What fired\nCounter `haproxy.responses.errors` increased by > 10 during the window.\n\n### Why it matters\nResponse errors are failures while delivering the response from server to client \u2014 server aborts, connection resets mid-response, etc. A spike often indicates unstable backend applications.\n\n### Triage\n1. Correlate with application error logs.\n2. Check server restarts / crashes.\n3. 
Inspect retry/redispatch counters.\n\n### Remediation\n- Stabilise the backend application.\n\n### Tuning\n- 10 errors/5m threshold; tune per traffic.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-retry-redispatch-spike.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-retry-redispatch-spike.json index 0661a97dbd7..4290c355489 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-retry-redispatch-spike.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-retry-redispatch-spike.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the sum of retries + redispatches exceeds 5 in 5 minutes. Retries and redispatches both signal backend instability.", "name": "[HAProxy OTel] Connection retries or redispatches spike", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,19 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-server" + } + ], + "investigation_guide": { + "blob": "## HAProxy connection retries / redispatches spike\n\n### What fired\n`haproxy.connections.retries + haproxy.requests.redispatched` increased by > 5 over the window.\n\n### Why it matters\nRetries and redispatches mean HAProxy had to try again because the first attempt failed. Sustained counts indicate flapping backend servers.\n\n### Triage\n1. Identify which servers require retries.\n2. 
Correlate with connection errors and health-check failures.\n\n### Remediation\n- Remove the unhealthy server from rotation.\n- Fix the backend or its network.\n\n### Tuning\n- 5 events/5m threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-session-saturation.json b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-session-saturation.json index ee19759ad08..8af4223ba4d 100644 --- a/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-session-saturation.json +++ b/packages/haproxy_otel/kibana/alerting_rule_template/haproxy_otel-session-saturation.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when session utilisation (`sessions.count / sessions.limit`) exceeds 80%. Approaching the limit means new connections will be queued or refused.", "name": "[HAProxy OTel] Session saturation approaching limit", "ruleTypeId": ".es-query", "tags": [ @@ -15,6 +16,22 @@ "alertDelay": { "active": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "haproxy_otel-frontend" + }, + { + "id": "haproxy_otel-backend" + }, + { + "id": "haproxy_otel-overview" + } + ], + "investigation_guide": { + "blob": "## HAProxy session saturation approaching limit\n\n### What fired\n`sessions.count / sessions.limit > 0.80` during the window.\n\n### Why it matters\nSessions are bounded by `maxconn` on each entity. Approaching the limit means bursts will be queued (backend) or refused (frontend). Sustained high utilisation predicts imminent capacity problems.\n\n### Triage\n1. Compare against request rate growth.\n2. 
Check backend queue depth and latency.\n\n### Remediation\n- Raise `maxconn` on the affected entity.\n- Add capacity.\n\n### Tuning\n- `> 80%` is the documented warning band.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/haproxy_otel/manifest.yml b/packages/haproxy_otel/manifest.yml index 6ea58549ad1..549c2c73d6b 100644 --- a/packages/haproxy_otel/manifest.yml +++ b/packages/haproxy_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: haproxy_otel title: "Haproxy OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Haproxy Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/ibmmq_otel/changelog.yml b/packages/ibmmq_otel/changelog.yml index dd1e68d24a7..18d97104c0a 100644 --- a/packages/ibmmq_otel/changelog.yml +++ b/packages/ibmmq_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the ibmmq_otel content pack. 
diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-cpu-load-high.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-cpu-load-high.json index d5d30e4cdc8..44b6ec8898c 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-cpu-load-high.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-cpu-load-high.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-cpu-load-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] CPU load high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-cpu-load-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `ibmmq_qmgr_cpu_load_five_minute_average_percentage > 85`. High CPU degrades message throughput and inflates latency.", + "name": "[IBM MQ OTel] CPU load high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-resources" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS cpu_load_5m = MAX(AVG_OVER_TIME(ibmmq_qmgr_cpu_load_five_minute_average_percentage))\n BY attributes.qmgr\n// Alert when the 5-minute CPU load average exceeds 85%\n// < 70% is healthy; 70-85% is warning; > 85% is critical\n// High CPU load degrades message throughput and increases latency\n// Adjust threshold based on your host capacity and co-located workloads\n| WHERE cpu_load_5m IS NOT NULL AND cpu_load_5m > 85" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": 
"ibmmq_otel-overview" } + ], + "investigation_guide": { + "blob": "## IBM MQ queue manager CPU load high\n\n### What fired\nFive-minute CPU load average exceeded 85% on at least one queue manager.\n\n### Why it matters\nCPU-bound queue managers cannot maintain published throughput; log writes slow, put/get latency rises, and channels back up.\n\n### Triage\n1. Identify the qmgr and compare with co-located workloads on the host.\n2. Correlate with log write latency and rollback ratios.\n3. Check for sudden client burst patterns.\n\n### Remediation\n- Move the qmgr to a less contended host.\n- Throttle producers.\n\n### Tuning\n- 85% over 15 minutes; lower to 80% for earlier warning.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS cpu_load_5m = MAX(AVG_OVER_TIME(ibmmq_qmgr_cpu_load_five_minute_average_percentage))\n BY attributes.qmgr\n// Alert when the 5-minute CPU load average exceeds 85%\n// < 70% is healthy; 70-85% is warning; > 85% is critical\n// High CPU load degrades message throughput and increases latency\n// Adjust threshold based on your host capacity and co-located workloads\n| WHERE cpu_load_5m IS NOT NULL AND cpu_load_5m > 85" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-expired-messages.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-expired-messages.json index 81a0a6f88ee..351bc67ab0c 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-expired-messages.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-expired-messages.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-expired-messages", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": 
"[IBM MQ OTel] Messages expiring before consumption", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-expired-messages", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any messages expire before being consumed (`ibmmq_qmgr_expired_message_total` rate > 0). Consumer lag or too-aggressive TTLs.", + "name": "[IBM MQ OTel] Messages expiring before consumption", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-message-traffic" }, - "alertDelay": { - "active": 2 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS expired_rate = SUM(RATE(ibmmq_qmgr_expired_message_total))\n BY attributes.qmgr\n// Alert when messages are expiring before being consumed\n// Any non-zero expiration rate indicates consumers are falling behind\n// or message TTL settings are too aggressive\n// Adjust threshold based on expected message expiration patterns\n| WHERE expired_rate IS NOT NULL AND expired_rate > 0" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-overview" } + ], + "investigation_guide": { + "blob": "## IBM MQ messages expiring before consumption\n\n### What fired\nRate of `ibmmq_qmgr_expired_message_total` > 0 during the window.\n\n### Why it matters\nExpired messages are dropped before any consumer processes them. Either consumers are lagging or the producer-side TTL is set too aggressively for current consumption throughput.\n\n### Triage\n1. Check queue depth and consumer counts on affected queues.\n2. 
Validate the configured TTL for business expectations.\n\n### Remediation\n- Add consumer capacity.\n- Raise TTL if business rules allow.\n\n### Tuning\n- Fires on any non-zero rate.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS expired_rate = SUM(RATE(ibmmq_qmgr_expired_message_total))\n BY attributes.qmgr\n// Alert when messages are expiring before being consumed\n// Any non-zero expiration rate indicates consumers are falling behind\n// or message TTL settings are too aggressive\n// Adjust threshold based on expected message expiration patterns\n| WHERE expired_rate IS NOT NULL AND expired_rate > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-connections.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-connections.json index 7ce02c381d3..d1d7deff848 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-connections.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-connections.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-failed-connections", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] Failed connection attempts", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-failed-connections", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any connect or open operations fail (`failed_mqconn_mqconnx_total` or `failed_mqopen_total`). 
Auth, resource, or network issues.", + "name": "[IBM MQ OTel] Failed connection attempts", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-error-analysis" }, - "alertDelay": { - "active": 2 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS\n failed_conns = SUM(RATE(ibmmq_qmgr_failed_mqconn_mqconnx_total)),\n failed_opens = SUM(RATE(ibmmq_qmgr_failed_mqopen_total))\n BY attributes.qmgr\n| EVAL total_failed_access = COALESCE(failed_conns, 0) + COALESCE(failed_opens, 0)\n// Alert when connection or object-open failures are detected\n// Failures indicate authentication problems, resource exhaustion,\n// or network issues between clients and the queue manager\n// Adjust threshold based on your environment\n| WHERE total_failed_access > 0" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-overview" } + ], + "investigation_guide": { + "blob": "## IBM MQ failed connection attempts\n\n### What fired\nFailed MQCONN/MQCONNX/MQOPEN rates were > 0 during the window.\n\n### Why it matters\nConnect/open failures block applications from reaching queues. Common causes: auth changes, network rules, or resource exhaustion on the qmgr.\n\n### Triage\n1. Check qmgr error logs (AMQERR01.LOG) for RC codes.\n2. Inspect channel auth and TLS configuration.\n3. 
Confirm qmgr resources (log, storage, max connections).\n\n### Remediation\n- Fix credentials / auth / channel config.\n- Raise `MAXCHANNELS` or `MAXINST` limits.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS\n failed_conns = SUM(RATE(ibmmq_qmgr_failed_mqconn_mqconnx_total)),\n failed_opens = SUM(RATE(ibmmq_qmgr_failed_mqopen_total))\n BY attributes.qmgr\n| EVAL total_failed_access = COALESCE(failed_conns, 0) + COALESCE(failed_opens, 0)\n// Alert when connection or object-open failures are detected\n// Failures indicate authentication problems, resource exhaustion,\n// or network issues between clients and the queue manager\n// Adjust threshold based on your environment\n| WHERE total_failed_access > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-message-operations.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-message-operations.json index a9102115314..ce234ac528a 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-message-operations.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-failed-message-operations.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-failed-message-operations", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] Failed message operations", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-failed-message-operations", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when MQPUT, MQPUT1 or MQGET failures occur. 
Put/get failures indicate queue-full, auth, or application errors.", + "name": "[IBM MQ OTel] Failed message operations", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-error-analysis" }, - "alertDelay": { - "active": 2 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS\n failed_puts = SUM(RATE(ibmmq_qmgr_failed_mqput_total)),\n failed_put1s = SUM(RATE(ibmmq_qmgr_failed_mqput1_total)),\n failed_gets = SUM(RATE(ibmmq_qmgr_failed_mqget_total))\n BY attributes.qmgr\n| EVAL total_failed_msg_ops = COALESCE(failed_puts, 0) + COALESCE(failed_put1s, 0) + COALESCE(failed_gets, 0)\n// Alert when failed put or get operations are detected\n// In a healthy system, these should all be at zero rate\n// Failures indicate queue full, auth issues, or application errors\n// Adjust threshold based on expected traffic volume\n| WHERE total_failed_msg_ops > 0" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-message-traffic" } + ], + "investigation_guide": { + "blob": "## IBM MQ failed message operations\n\n### What fired\nRate of `failed_mqput_total + failed_mqput1_total + failed_mqget_total` > 0 during the window.\n\n### Why it matters\nMQ tracks every put/get failure. In a healthy system these should be zero. Elevated failures indicate queue full, authorization issues, or misbehaving applications.\n\n### Triage\n1. Split by queue to find hot spots.\n2. Check queue depth vs MAXDEPTH.\n3. 
Inspect application logs for reason codes.\n\n### Remediation\n- Increase queue depth or consumer capacity.\n- Fix authorization problems.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS\n failed_puts = SUM(RATE(ibmmq_qmgr_failed_mqput_total)),\n failed_put1s = SUM(RATE(ibmmq_qmgr_failed_mqput1_total)),\n failed_gets = SUM(RATE(ibmmq_qmgr_failed_mqget_total))\n BY attributes.qmgr\n| EVAL total_failed_msg_ops = COALESCE(failed_puts, 0) + COALESCE(failed_put1s, 0) + COALESCE(failed_gets, 0)\n// Alert when failed put or get operations are detected\n// In a healthy system, these should all be at zero rate\n// Failures indicate queue full, auth issues, or application errors\n// Adjust threshold based on expected traffic volume\n| WHERE total_failed_msg_ops > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-fdc-files-increasing.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-fdc-files-increasing.json index c86e55389ff..79e6a17945b 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-fdc-files-increasing.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-fdc-files-increasing.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-fdc-files-increasing", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] FDC files increasing", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-fdc-files-increasing", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when new FDC (First Failure Data Capture) 
files are generated. Any increase is significant and indicates an internal queue-manager error.",
+    "name": "[IBM MQ OTel] FDC files increasing",
+    "ruleTypeId": ".es-query",
+    "tags": [
+      "observability",
+      "ibmmq"
+    ],
+    "schedule": {
+      "interval": "5m"
+    },
+    "alertDelay": {
+      "active": 1
+    },
+    "artifacts": {
+      "dashboards": [
+        {
+          "id": "ibmmq_otel-error-analysis"
        },
-        "alertDelay": {
-            "active": 1
-        },
-        "params": {
-            "searchType": "esqlQuery",
-            "esqlQuery": {
-                "esql": "TS metrics-ibmmq.otel-*\n| STATS fdc_change = SUM(DELTA(ibmmq_qmgr_fdc_files))\n  BY attributes.qmgr\n// Alert when new FDC (First Failure Data Capture) files are generated\n// FDC files indicate internal queue manager errors\n// Any increase is significant and requires immediate investigation\n| WHERE fdc_change IS NOT NULL AND fdc_change > 0"
-            },
-            "size": 0,
-            "threshold": [
-                0
-            ],
-            "thresholdComparator": ">",
-            "timeField": "@timestamp",
-            "timeWindowSize": 15,
-            "timeWindowUnit": "m",
-            "groupBy": "row",
-            "termField": "attributes.qmgr",
-            "termSize": 10
+        {
+          "id": "ibmmq_otel-overview"
        }
+      ],
+      "investigation_guide": {
+        "blob": "## IBM MQ FDC files increasing\n\n### What fired\n`ibmmq_qmgr_fdc_files` increased during the window.\n\n### Why it matters\nFDCs are MQ's crash/assertion dumps. Their presence always indicates a non-trivial internal error that warrants a support case.\n\n### Triage\n1. Inspect the FDC directory for the dumped files and reason codes.\n2. Correlate with qmgr stability signals.\n\n### Remediation\n- Open a case with IBM support with the FDC bundle.\n- If a known bug, apply the fix pack.\n\n### Tuning\n- Fires on any increase. 
Do not loosen.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS fdc_change = SUM(DELTA(ibmmq_qmgr_fdc_files))\n BY attributes.qmgr\n// Alert when new FDC (First Failure Data Capture) files are generated\n// FDC files indicate internal queue manager errors\n// Any increase is significant and requires immediate investigation\n| WHERE fdc_change IS NOT NULL AND fdc_change > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-log-write-latency.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-log-write-latency.json index 015cc6b3052..ddc296413fb 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-log-write-latency.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-log-write-latency.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-high-log-write-latency", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] High log write latency", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-high-log-write-latency", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `ibmmq_qmgr_log_write_latency_seconds > 0.005` (5 ms). 
Log writes are on the critical path for persistent messaging.", + "name": "[IBM MQ OTel] High log write latency", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-resources" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS log_latency_s = MAX(AVG_OVER_TIME(ibmmq_qmgr_log_write_latency_seconds))\n BY attributes.qmgr\n// Alert when average log write latency exceeds 5ms (0.005s)\n// Sub-millisecond is healthy; > 1ms is warning; > 10ms is critical\n// Persistent messages require synchronous log writes, so latency\n// directly governs end-to-end message throughput\n// Adjust threshold based on your performance requirements\n| WHERE log_latency_s IS NOT NULL AND log_latency_s > 0.005" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-overview" } + ], + "investigation_guide": { + "blob": "## IBM MQ high log write latency\n\n### What fired\nAverage log write latency exceeded 5 ms during the window.\n\n### Why it matters\nPersistent messaging commits synchronously to the log. Slow log writes throttle throughput and raise put latency.\n\n### Triage\n1. Check disk subsystem IOPS and latency.\n2. Verify log is on dedicated fast storage.\n3. 
Inspect log buffer sizing (LogBufferPages).\n\n### Remediation\n- Move log to faster storage.\n- Tune log buffers.\n\n### Tuning\n- 5 ms is moderate; tighten to 1 ms for latency-sensitive workloads.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS log_latency_s = MAX(AVG_OVER_TIME(ibmmq_qmgr_log_write_latency_seconds))\n BY attributes.qmgr\n// Alert when average log write latency exceeds 5ms (0.005s)\n// Sub-millisecond is healthy; > 1ms is warning; > 10ms is critical\n// Persistent messages require synchronous log writes, so latency\n// directly governs end-to-end message throughput\n// Adjust threshold based on your performance requirements\n| WHERE log_latency_s IS NOT NULL AND log_latency_s > 0.005" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-rollback-ratio.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-rollback-ratio.json index 4ccd1a83abd..cec138343dd 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-rollback-ratio.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-high-rollback-ratio.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-high-rollback-ratio", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] High transaction rollback ratio", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-high-rollback-ratio", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when transaction rollback-rate exceeds 5% of total transactions. 
Poison messages or processing failures.", + "name": "[IBM MQ OTel] High transaction rollback ratio", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-error-analysis" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS\n commit_rate = SUM(RATE(ibmmq_qmgr_commit_total)),\n rollback_rate = SUM(RATE(ibmmq_qmgr_rollback_total))\n BY attributes.qmgr\n// Require a minimum commit rate to avoid false positives on idle queue managers\n| WHERE commit_rate IS NOT NULL AND commit_rate > 0\n AND rollback_rate IS NOT NULL\n| EVAL rollback_pct = ROUND(rollback_rate / (commit_rate + rollback_rate) * 100.0, 2)\n// Alert when rollback rate exceeds 5% of total transactions\n// < 1% is healthy; 1-5% is warning; > 5% is critical\n// High rollback ratios indicate poison messages, processing failures,\n// or resource contention causing transaction failures\n// Adjust threshold and minimum commit rate for your workload\n| WHERE rollback_pct > 5.0" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-message-traffic" } + ], + "investigation_guide": { + "blob": "## IBM MQ high transaction rollback ratio\n\n### What fired\n`rollback / (commit + rollback) > 5%` over the window.\n\n### Why it matters\nHigh rollback rates signal poison messages repeatedly failing processing, auth changes breaking apps, or consumer bugs.\n\n### Triage\n1. Check dead-letter / backout queues for accumulating messages.\n2. Inspect consumer application logs.\n3. 
Look at BOTHRESH / BOQNAME queue attributes.\n\n### Remediation\n- Route poison messages to BOQNAME.\n- Fix consumer logic.\n\n### Tuning\n- 5% threshold with minimum commit rate filter; tighten to 1% for critical apps.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS\n commit_rate = SUM(RATE(ibmmq_qmgr_commit_total)),\n rollback_rate = SUM(RATE(ibmmq_qmgr_rollback_total))\n BY attributes.qmgr\n// Require a minimum commit rate to avoid false positives on idle queue managers\n| WHERE commit_rate IS NOT NULL AND commit_rate > 0\n AND rollback_rate IS NOT NULL\n| EVAL rollback_pct = ROUND(rollback_rate / (commit_rate + rollback_rate) * 100.0, 2)\n// Alert when rollback rate exceeds 5% of total transactions\n// < 1% is healthy; 1-5% is warning; > 5% is critical\n// High rollback ratios indicate poison messages, processing failures,\n// or resource contention causing transaction failures\n// Adjust threshold and minimum commit rate for your workload\n| WHERE rollback_pct > 5.0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-log-filesystem-critically-low.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-log-filesystem-critically-low.json index 1473351ae6e..0c9ee7c60c5 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-log-filesystem-critically-low.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-log-filesystem-critically-low.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-log-filesystem-critically-low", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] Log filesystem critically low", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - 
"ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-log-filesystem-critically-low", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `ibmmq_qmgr_log_file_system_free_space_percentage < 10`. Log filesystem exhaustion halts all transactional operations.", + "name": "[IBM MQ OTel] Log filesystem critically low", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-resources" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS log_fs_free_pct = MAX(LAST_OVER_TIME(ibmmq_qmgr_log_file_system_free_space_percentage))\n BY attributes.qmgr\n// Alert when log filesystem free space drops below 10%\n// Log filesystem exhaustion halts all transactional operations\n// Adjust threshold based on your capacity planning requirements\n| WHERE log_fs_free_pct IS NOT NULL AND log_fs_free_pct < 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-overview" } + ], + "investigation_guide": { + "blob": "## IBM MQ log filesystem critically low\n\n### What fired\nFree space on the log filesystem dropped below 10% during the window.\n\n### Why it matters\nWhen the log fills up, MQ refuses all persistent operations. Recovery requires clearing old logs or extending the volume.\n\n### Triage\n1. Check circular vs linear log mode.\n2. Inspect old log archival progress.\n\n### Remediation\n- Archive or remove old log extents.\n- Grow the log filesystem.\n\n### Tuning\n- 10% is a late threshold. 
Consider a 25% warning rule.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS log_fs_free_pct = MAX(LAST_OVER_TIME(ibmmq_qmgr_log_file_system_free_space_percentage))\n BY attributes.qmgr\n// Alert when log filesystem free space drops below 10%\n// Log filesystem exhaustion halts all transactional operations\n// Adjust threshold based on your capacity planning requirements\n| WHERE log_fs_free_pct IS NOT NULL AND log_fs_free_pct < 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-queue-manager-filesystem-low.json b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-queue-manager-filesystem-low.json index f8432dc43a4..09a83337b08 100644 --- a/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-queue-manager-filesystem-low.json +++ b/packages/ibmmq_otel/kibana/alerting_rule_template/ibmmq_otel-queue-manager-filesystem-low.json @@ -1,36 +1,50 @@ { - "id": "ibmmq_otel-queue-manager-filesystem-low", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[IBM MQ OTel] Queue manager filesystem low", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "ibmmq" - ], - "schedule": { - "interval": "5m" + "id": "ibmmq_otel-queue-manager-filesystem-low", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the queue-manager filesystem free space drops below 20%. 
Holds config and queue data; exhaustion corrupts the qmgr.", + "name": "[IBM MQ OTel] Queue manager filesystem low", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "ibmmq" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "ibmmq_otel-resources" }, - "alertDelay": { - "active": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-ibmmq.otel-*\n| STATS qmgr_fs_free_pct = MAX(LAST_OVER_TIME(ibmmq_qmgr_queue_manager_file_system_free_space_percentage))\n BY attributes.qmgr\n// Alert when queue manager filesystem free space drops below 20%\n// The queue manager filesystem stores configuration and queue data\n// Adjust threshold based on your capacity planning requirements\n| WHERE qmgr_fs_free_pct IS NOT NULL AND qmgr_fs_free_pct < 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.qmgr", - "termSize": 10 + { + "id": "ibmmq_otel-overview" } + ], + "investigation_guide": { + "blob": "## IBM MQ queue manager filesystem low\n\n### What fired\nQueue manager filesystem free space dropped below 20%.\n\n### Why it matters\nThis filesystem stores config and queue data. Running out of space can corrupt queues and require restore from backup.\n\n### Triage\n1. Identify what is consuming space (queue depth, errors, cores).\n2. 
Check queue depths vs MAXDEPTH.\n\n### Remediation\n- Grow the filesystem.\n- Drain backed-up queues.\n\n### Tuning\n- 20% is generous; some deployments alert at 30%.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-ibmmq.otel-*\n| STATS qmgr_fs_free_pct = MAX(LAST_OVER_TIME(ibmmq_qmgr_queue_manager_file_system_free_space_percentage))\n BY attributes.qmgr\n// Alert when queue manager filesystem free space drops below 20%\n// The queue manager filesystem stores configuration and queue data\n// Adjust threshold based on your capacity planning requirements\n| WHERE qmgr_fs_free_pct IS NOT NULL AND qmgr_fs_free_pct < 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.qmgr", + "termSize": 10 } + } } diff --git a/packages/ibmmq_otel/manifest.yml b/packages/ibmmq_otel/manifest.yml index 05661329e6e..ef480f195dc 100644 --- a/packages/ibmmq_otel/manifest.yml +++ b/packages/ibmmq_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: ibmmq_otel title: "IBM MQ OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "IBM MQ Assets from OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - observability conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/iis_otel/changelog.yml b/packages/iis_otel/changelog.yml index cfb5875af7f..28bb0496448 100644 --- a/packages/iis_otel/changelog.yml +++ b/packages/iis_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.4.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.3.0" changes: - description: Add new assets diff --git a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-application-pool-not-running.json b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-application-pool-not-running.json index a8b38eceb08..c3dd8c5d4ca 100644 --- a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-application-pool-not-running.json +++ b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-application-pool-not-running.json @@ -3,9 +3,13 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when an IIS application pool is not in the Started state (`iis.application_pool.state != 3`). Pools in Stopped/Starting/Stopping/Pausing/Paused states cannot serve requests.", "name": "[IIS OTel] Application pool not running", "ruleTypeId": ".es-query", - "tags": ["observability", "iis"], + "tags": [ + "observability", + "iis" + ], "schedule": { "interval": "5m" }, @@ -17,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "iis_otel-sites" + }, + { + "id": "iis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## IIS application pool not running\n\n### What fired\n`iis.application_pool.state` was non-3 (i.e. not Started) on at least one pool during the window.\n\n### Why it matters\nAn application pool that is not Started cannot host websites or applications. Requests routed to that pool fail with HTTP 503 (Service Unavailable). Common causes include application crashes, rapid-fail protection trips, or operator-driven stops.\n\n### Triage\n1. Identify the affected pool(s) and host(s) from the alert context.\n2. Check the IIS event log for rapid-fail protection events and application crashes.\n3. Inspect the worker process (`w3wp.exe`) crash logs in Windows Event Viewer (Application log).\n4. 
Verify whether the pool was stopped intentionally (deploys, maintenance).\n\n### Remediation\n- Restart the application pool via `Restart-WebAppPool` or IIS Manager.\n- Disable rapid-fail protection only after fixing the underlying crashes.\n- Address the application bug causing repeated worker process crashes.\n\n### Tuning\n- Fires on any non-Started state. To exclude planned maintenance, suppress with maintenance windows or filter specific pool names.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-iisreceiver.otel-*\n| WHERE `iis.application_pool.state` IS NOT NULL\n| WHERE `iis.application_pool.state` != 3\n| STATS last_state = MAX(`iis.application_pool.state`) BY iis.application_pool, host.name\n| SORT last_state ASC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-bandwidth-throttled.json b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-bandwidth-throttled.json index 80c9f3a929d..d2d6b2df179 100644 --- a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-bandwidth-throttled.json +++ b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-bandwidth-throttled.json @@ -3,9 +3,13 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `iis.network.blocked` increases \u2014 IIS is enforcing bandwidth throttling and blocking traffic. 
Indicates configured bandwidth limits are being hit.", "name": "[IIS OTel] Bandwidth throttling detected", "ruleTypeId": ".es-query", - "tags": ["observability", "iis"], + "tags": [ + "observability", + "iis" + ], "schedule": { "interval": "5m" }, @@ -17,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "iis_otel-overview" + }, + { + "id": "iis_otel-sites" + } + ], + "investigation_guide": { + "blob": "## IIS bandwidth throttling detected\n\n### What fired\n`INCREASE(iis.network.blocked) > 0` on at least one host during the window.\n\n### Why it matters\nIIS supports per-site bandwidth limits. When traffic exceeds the configured limit, IIS blocks additional bytes and the counter increments. Sustained blocking causes user-visible latency or transfer failures.\n\n### Triage\n1. Identify the affected host and (via the Sites & Pools dashboard) the busiest sites.\n2. Compare against the configured `bandwidthLimit` for each site.\n3. Look for traffic spikes from specific clients.\n\n### Remediation\n- Raise the per-site bandwidth limit if the workload has legitimately grown.\n- Throttle abusive clients at the edge / WAF.\n\n### Tuning\n- Fires on any non-zero increase. 
If brief spikes are tolerable, raise the threshold or add an `alertDelay`.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-iisreceiver.otel-*\n| WHERE iis.network.blocked IS NOT NULL\n| STATS blocked_increase = SUM(INCREASE(iis.network.blocked)) BY host.name\n| WHERE blocked_increase > 0\n| SORT blocked_increase DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-age-high.json b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-age-high.json index 327b3a879c0..b5a34b4216d 100644 --- a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-age-high.json +++ b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-age-high.json @@ -3,9 +3,13 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the max request queue age (`iis.request.queue.age.max`) exceeds 5000 ms. Aged requests indicate the application pool cannot dequeue fast enough.", "name": "[IIS OTel] Request queue age elevated", "ruleTypeId": ".es-query", - "tags": ["observability", "iis"], + "tags": [ + "observability", + "iis" + ], "schedule": { "interval": "5m" }, @@ -17,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "iis_otel-sites" + }, + { + "id": "iis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## IIS request queue age elevated\n\n### What fired\n`iis.request.queue.age.max > 5000 ms` on a pool during the window.\n\n### Why it matters\nQueue age measures how long the oldest queued request has been waiting. 
Sustained high age means the pool's worker threads cannot drain the queue \u2014 usually because of slow back-end calls, thread starvation, or insufficient `maxConcurrentRequestsPerCPU`.\n\n### Triage\n1. Identify the affected pool from the Sites & Pools dashboard.\n2. Correlate with queue depth and request rejections on the same pool.\n3. Inspect downstream dependency latency (DB, cache, REST APIs).\n4. Check for thread or CLR contention via `w3wp.exe` performance counters.\n\n### Remediation\n- Scale out worker processes (`maxProcesses`) if CPU allows.\n- Tune `maxConcurrentRequestsPerCPU` or `processModel.requestQueueLimit`.\n- Optimise slow downstream calls.\n\n### Tuning\n- 5000 ms threshold; tighten for latency-sensitive workloads.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-iisreceiver.otel-*\n| WHERE iis.request.queue.age.max IS NOT NULL\n| STATS max_age_ms = MAX(iis.request.queue.age.max) BY iis.application_pool, host.name\n| WHERE max_age_ms > 5000\n| SORT max_age_ms DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-depth.json b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-depth.json index becaff7ac5d..99eb0cd6ad1 100644 --- a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-depth.json +++ b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-queue-depth.json @@ -3,9 +3,13 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the request queue depth (`iis.request.queue.count`) exceeds 10 on a pool. 
Sustained queueing precedes request rejections and timeouts.", "name": "[IIS OTel] Request queue depth elevated", "ruleTypeId": ".es-query", - "tags": ["observability", "iis"], + "tags": [ + "observability", + "iis" + ], "schedule": { "interval": "5m" }, @@ -17,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "iis_otel-sites" + }, + { + "id": "iis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## IIS request queue depth elevated\n\n### What fired\n`iis.request.queue.count > 10` on a pool during the window.\n\n### Why it matters\nQueue depth grows when worker threads cannot service requests as fast as they arrive. Sustained queueing causes user-visible latency and, once the configured `requestQueueLimit` is hit, IIS will start rejecting new requests with HTTP 503.\n\n### Triage\n1. Identify the affected pool.\n2. Correlate with request queue age and rejection counters on the same pool.\n3. Inspect CPU and memory of the `w3wp.exe` processes.\n4. 
Look at downstream dependency latency.\n\n### Remediation\n- Add worker processes (`maxProcesses`).\n- Raise concurrency settings (`maxConcurrentRequestsPerCPU`).\n- Fix slow downstreams.\n\n### Tuning\n- 10 queue depth threshold; tune to baseline traffic.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-iisreceiver.otel-*\n| WHERE iis.request.queue.count IS NOT NULL\n| STATS max_queue = MAX(iis.request.queue.count) BY iis.application_pool, host.name\n| WHERE max_queue > 10\n| SORT max_queue DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-rejections.json b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-rejections.json index 4c3e2853ee8..aba459a9662 100644 --- a/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-rejections.json +++ b/packages/iis_otel/kibana/alerting_rule_template/iis_otel-request-rejections.json @@ -3,9 +3,13 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `iis.request.rejected` increases. Rejections mean IIS turned away requests because the queue was full or limits were reached.", "name": "[IIS OTel] Request rejections detected", "ruleTypeId": ".es-query", - "tags": ["observability", "iis"], + "tags": [ + "observability", + "iis" + ], "schedule": { "interval": "5m" }, @@ -17,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "iis_otel-overview" + }, + { + "id": "iis_otel-sites" + } + ], + "investigation_guide": { + "blob": "## IIS request rejections detected\n\n### What fired\n`INCREASE(iis.request.rejected) > 0` on a pool during the window.\n\n### Why it matters\nRejected requests are immediate HTTP 503 failures returned to the client without ever being processed. 
Almost always indicates a saturated pool \u2014 either the request queue is full or an explicit limit (`maxConcurrentRequestsPerCPU`, `processModel.requestQueueLimit`) was reached.\n\n### Triage\n1. Identify the affected pool.\n2. Inspect queue depth and queue age trends.\n3. Check `w3wp.exe` health (CPU, threads, GC) on the host.\n4. Look at downstream call latency.\n\n### Remediation\n- Raise the queue limit if hardware allows.\n- Add worker processes.\n- Fix slow downstream calls.\n\n### Tuning\n- Fires on any non-zero increase. Each rejection is a client-visible 503 \u2014 do not loosen the threshold without good reason.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-iisreceiver.otel-*\n| WHERE iis.request.rejected IS NOT NULL\n| STATS rejected_increase = SUM(INCREASE(iis.request.rejected)) BY iis.application_pool, host.name\n| WHERE rejected_increase > 0\n| SORT rejected_increase DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/iis_otel/manifest.yml b/packages/iis_otel/manifest.yml index 96b60200c4c..602ee87af78 100644 --- a/packages/iis_otel/manifest.yml +++ b/packages/iis_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.0 name: iis_otel title: "IIS OpenTelemetry assets" -version: 0.3.0 +version: 0.4.0 source: license: "Elastic-2.0" description: "IIS Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/influxdb_otel/changelog.yml b/packages/influxdb_otel/changelog.yml index bb45b5ec96e..6f70ee5bd01 100644 --- a/packages/influxdb_otel/changelog.yml +++ b/packages/influxdb_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-cache-write-errors.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-cache-write-errors.json index 3a394c518ed..a7cbc335fb4 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-cache-write-errors.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-cache-write-errors.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-cache-write-errors", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Cache write errors", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-cache-write-errors", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any cache write errors occur (`storage_cache_writes_err` increases). 
Indicates memory/cache subsystem issues.", + "name": "[InfluxDB OTel] Cache write errors", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-http-storage" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Cache write errors counter (cumulative)\n| WHERE COALESCE(metrics.storage_cache_writes_err, storage_cache_writes_err) IS NOT NULL\n// Detect increase in errors over the window\n| STATS min_err = MIN(COALESCE(metrics.storage_cache_writes_err, storage_cache_writes_err)),\n max_err = MAX(COALESCE(metrics.storage_cache_writes_err, storage_cache_writes_err))\n BY host.name\n| EVAL error_increase = max_err - min_err\n// Alert when any new cache write errors occurred\n| WHERE error_increase > 0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB cache write errors\n\n### What fired\n`storage_cache_writes_err` increased during the window.\n\n### Why it matters\nCache write errors mean InfluxDB failed to buffer incoming writes in memory. They precede shard write errors and user-visible failures.\n\n### Triage\n1. Check host memory utilisation.\n2. 
Inspect InfluxDB logs for specific error messages.\n\n### Remediation\n- Scale memory.\n- Reduce write volume or shard more aggressively.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Cache write errors counter (cumulative)\n| WHERE COALESCE(metrics.storage_cache_writes_err, storage_cache_writes_err) IS NOT NULL\n// Detect increase in errors over the window\n| STATS min_err = MIN(COALESCE(metrics.storage_cache_writes_err, storage_cache_writes_err)),\n max_err = MAX(COALESCE(metrics.storage_cache_writes_err, storage_cache_writes_err))\n BY host.name\n| EVAL error_increase = max_err - min_err\n// Alert when any new cache write errors occurred\n| WHERE error_increase > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-compaction-queue-backlog.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-compaction-queue-backlog.json index b18f73dc4b1..c9d16a3745b 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-compaction-queue-backlog.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-compaction-queue-backlog.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-compaction-queue-backlog", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Compaction queue backlog", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "storage" - ], - "schedule": { - "interval": "10m" + "id": "influxdb_otel-compaction-queue-backlog", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the compaction 
queue depth exceeds 10 (`storage_compactions_queued >= 10`). Compaction backlog degrades read performance.", + "name": "[InfluxDB OTel] Compaction queue backlog", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "storage" + ], + "schedule": { + "interval": "10m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-overview" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Compactions waiting to run (gauge)\n| WHERE COALESCE(metrics.storage_compactions_queued, storage_compactions_queued) IS NOT NULL\n// Get max queue depth per host and compaction level\n| STATS max_queued = MAX(COALESCE(metrics.storage_compactions_queued, storage_compactions_queued))\n BY host.name, COALESCE(attributes.level, \"unknown\")\n// Alert when compaction queue exceeds threshold (e.g. 10)\n// Adjust for your environment\n| WHERE max_queued >= 10" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-http-storage" } + ], + "investigation_guide": { + "blob": "## InfluxDB compaction queue backlog\n\n### What fired\nPer-level `storage_compactions_queued >= 10` during the window.\n\n### Why it matters\nCompaction reduces shard size and improves read performance. A growing queue means writes outpace compaction, which degrades reads and increases disk usage.\n\n### Triage\n1. Check compaction worker counts and CPU.\n2. Inspect disk IOPS.\n3. 
Look for recent write-volume spikes.\n\n### Remediation\n- Scale compaction workers.\n- Upgrade storage.\n\n### Tuning\n- `>= 10` per level; tune per workload.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Compactions waiting to run (gauge)\n| WHERE COALESCE(metrics.storage_compactions_queued, storage_compactions_queued) IS NOT NULL\n// Get max queue depth per host and compaction level\n| STATS max_queued = MAX(COALESCE(metrics.storage_compactions_queued, storage_compactions_queued))\n BY host.name, COALESCE(attributes.level, \"unknown\")\n// Alert when compaction queue exceeds threshold (e.g. 10)\n// Adjust for your environment\n| WHERE max_queued >= 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-dropped-writes.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-dropped-writes.json index 0301d4343eb..5a250f4c9cd 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-dropped-writes.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-dropped-writes.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-dropped-writes", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Dropped writes", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-dropped-writes", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any points are dropped at the shard or cache level (`storage_shard_write_dropped_sum` / `storage_cache_writes_dropped` increase). 
Direct data loss.", + "name": "[InfluxDB OTel] Dropped writes", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-http-storage" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Shard dropped points (cumulative) and cache dropped writes (cumulative)\n| WHERE COALESCE(metrics.storage_shard_write_dropped_sum, storage_shard_write_dropped_sum) IS NOT NULL\n OR COALESCE(metrics.storage_cache_writes_dropped, storage_cache_writes_dropped) IS NOT NULL\n// Detect increase in dropped points over the window\n| STATS min_shard_drop = MIN(COALESCE(metrics.storage_shard_write_dropped_sum, storage_shard_write_dropped_sum)),\n max_shard_drop = MAX(COALESCE(metrics.storage_shard_write_dropped_sum, storage_shard_write_dropped_sum)),\n min_cache_drop = MIN(COALESCE(metrics.storage_cache_writes_dropped, storage_cache_writes_dropped)),\n max_cache_drop = MAX(COALESCE(metrics.storage_cache_writes_dropped, storage_cache_writes_dropped))\n BY host.name\n| EVAL shard_drop_increase = max_shard_drop - min_shard_drop,\n cache_drop_increase = max_cache_drop - min_cache_drop\n// Alert when any new dropped writes occurred (data loss risk)\n| WHERE shard_drop_increase > 0 OR cache_drop_increase > 0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB dropped writes\n\n### What fired\nDropped-write 
counters increased during the window.\n\n### Why it matters\nDropped points are silent data loss. Usually caused by shard saturation, malformed points, or cardinality limits.\n\n### Triage\n1. Check ingest pipeline for retries and errors.\n2. Inspect cardinality growth.\n3. Look at shard health.\n\n### Remediation\n- Fix ingest source.\n- Scale InfluxDB or shard differently.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Shard dropped points (cumulative) and cache dropped writes (cumulative)\n| WHERE COALESCE(metrics.storage_shard_write_dropped_sum, storage_shard_write_dropped_sum) IS NOT NULL\n OR COALESCE(metrics.storage_cache_writes_dropped, storage_cache_writes_dropped) IS NOT NULL\n// Detect increase in dropped points over the window\n| STATS min_shard_drop = MIN(COALESCE(metrics.storage_shard_write_dropped_sum, storage_shard_write_dropped_sum)),\n max_shard_drop = MAX(COALESCE(metrics.storage_shard_write_dropped_sum, storage_shard_write_dropped_sum)),\n min_cache_drop = MIN(COALESCE(metrics.storage_cache_writes_dropped, storage_cache_writes_dropped)),\n max_cache_drop = MAX(COALESCE(metrics.storage_cache_writes_dropped, storage_cache_writes_dropped))\n BY host.name\n| EVAL shard_drop_increase = max_shard_drop - min_shard_drop,\n cache_drop_increase = max_cache_drop - min_cache_drop\n// Alert when any new dropped writes occurred (data loss risk)\n| WHERE shard_drop_increase > 0 OR cache_drop_increase > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-go-goroutine-leak.json 
b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-go-goroutine-leak.json index 552730e01ba..f43692ef588 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-go-goroutine-leak.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-go-goroutine-leak.json @@ -1,40 +1,53 @@ { - "id": "influxdb_otel-go-goroutine-leak", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Go goroutine leak suspected", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "runtime" - ], - "schedule": { - "interval": "10m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Go runtime goroutine count (gauge)\n| WHERE COALESCE(metrics.go_goroutines, go_goroutines) IS NOT NULL\n// Get max goroutine count per host over the window\n| STATS max_goroutines = MAX(COALESCE(metrics.go_goroutines, go_goroutines))\n BY host.name\n// Alert when goroutines exceed threshold (e.g. 10000)\n// High count may indicate leak; adjust for your baseline\n| WHERE max_goroutines > 10000" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "influxdb_otel-go-goroutine-leak", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `go_goroutines > 10000` on the InfluxDB process. 
Indicates a goroutine leak that will eventually crash the server.", + "name": "[InfluxDB OTel] Go goroutine leak suspected", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "runtime" + ], + "schedule": { + "interval": "10m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB goroutine leak suspected\n\n### What fired\n`go_goroutines` exceeded 10,000 during the window.\n\n### Why it matters\nHealthy InfluxDB instances run with hundreds to low thousands of goroutines. Counts above 10,000 indicate leaks \u2014 usually unclosed HTTP handlers, stuck queries, or misbehaving tasks.\n\n### Triage\n1. Sample pprof/goroutine profile from the admin endpoint.\n2. Look at recent task execution failures.\n\n### Remediation\n- Restart the server to recover.\n- Root-cause via profile and file an issue.\n\n### Tuning\n- 10,000 threshold; tune to your baseline.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Go runtime goroutine count (gauge)\n| WHERE COALESCE(metrics.go_goroutines, go_goroutines) IS NOT NULL\n// Get max goroutine count per host over the window\n| STATS max_goroutines = MAX(COALESCE(metrics.go_goroutines, go_goroutines))\n BY host.name\n// Alert when goroutines exceed threshold (e.g. 
10000)\n// High count may indicate leak; adjust for your baseline\n| WHERE max_goroutines > 10000" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-http-api-high-error-rate.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-http-api-high-error-rate.json index d7a1ff1b1ac..48dff75fe3e 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-http-api-high-error-rate.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-http-api-high-error-rate.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-http-api-high-error-rate", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] HTTP API high error rate", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "api" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-http-api-high-error-rate", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the HTTP API error rate (non-2xx responses) exceeds the computed threshold. 
Client-visible API failures.", + "name": "[InfluxDB OTel] HTTP API high error rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "api" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-http-storage" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// HTTP API requests with response code/status\n| WHERE COALESCE(metrics.http_api_requests_total, http_api_requests_total) IS NOT NULL\n// Aggregate total and error counts per host (status != 2XX = errors)\n| STATS total = SUM(COALESCE(metrics.http_api_requests_total, http_api_requests_total)),\n errors = SUM(COALESCE(metrics.http_api_requests_total, http_api_requests_total))\n WHERE COALESCE(attributes.status, status) != \"2XX\"\n BY host.name\n// Minimum sample size to avoid noisy low-traffic hosts\n| WHERE total > 10\n// Calculate error rate as percentage\n| EVAL error_rate_pct = ROUND(errors / total * 100.0, 2)\n// Alert threshold: adjust for your environment (e.g. 5%)\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB HTTP API high error rate\n\n### What fired\nThe ratio of non-2xx responses exceeded the threshold over a minimum 10-request sample.\n\n### Why it matters\nHTTP errors cover auth failures, malformed writes, query failures, and overload. 
Sustained high rates indicate client-visible API degradation.\n\n### Triage\n1. Split errors by status code to classify the failure mode.\n2. Correlate with server-side metrics (compaction, cache errors).\n\n### Remediation\n- Fix the dominant failure mode.\n\n### Tuning\n- Threshold and sample size are tuned in the query; adjust to match traffic.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// HTTP API requests with response code/status\n| WHERE COALESCE(metrics.http_api_requests_total, http_api_requests_total) IS NOT NULL\n// Aggregate total and error counts per host (status != 2XX = errors)\n| STATS total = SUM(COALESCE(metrics.http_api_requests_total, http_api_requests_total)),\n errors = SUM(COALESCE(metrics.http_api_requests_total, http_api_requests_total))\n WHERE COALESCE(attributes.status, status) != \"2XX\"\n BY host.name\n// Minimum sample size to avoid noisy low-traffic hosts\n| WHERE total > 10\n// Calculate error rate as percentage\n| EVAL error_rate_pct = ROUND(errors / total * 100.0, 2)\n// Alert threshold: adjust for your environment (e.g. 
5%)\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-query-controller-saturated.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-query-controller-saturated.json index e4935072656..6c5365e4f2e 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-query-controller-saturated.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-query-controller-saturated.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-query-controller-saturated", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Query controller saturated", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "query" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-query-controller-saturated", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when query queueing is high (>= 5) or the query memory budget is nearly exhausted (< 10 MB unused). 
Query controller saturation.", + "name": "[InfluxDB OTel] Query controller saturated", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "query" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-query-tasks" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Query controller queueing and memory budget\n| WHERE COALESCE(metrics.qc_queueing_active, qc_queueing_active) IS NOT NULL\n OR COALESCE(metrics.qc_memory_unused_bytes, qc_memory_unused_bytes) IS NOT NULL\n// Get latest values per host (gauges)\n| STATS max_queueing = MAX(COALESCE(metrics.qc_queueing_active, qc_queueing_active)),\n min_memory_unused = MIN(COALESCE(metrics.qc_memory_unused_bytes, qc_memory_unused_bytes))\n BY host.name\n// Alert when queueing is high (>= 5) or memory budget exhausted (< 10MB)\n// Adjust thresholds for your workload\n| WHERE max_queueing >= 5 OR (min_memory_unused IS NOT NULL AND min_memory_unused < 10485760)" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB query controller saturated\n\n### What fired\n`qc_queueing_active >= 5` or `qc_memory_unused_bytes < 10 MB` during the window.\n\n### Why it matters\nThe query controller queues queries when resources are tight. Queueing plus exhausted memory budget leads to query timeouts.\n\n### Triage\n1. Check for long-running or memory-hungry queries.\n2. 
Inspect concurrent-query limits.\n\n### Remediation\n- Kill offending queries.\n- Raise concurrency / memory budget.\n\n### Tuning\n- Thresholds tune in the query.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Query controller queueing and memory budget\n| WHERE COALESCE(metrics.qc_queueing_active, qc_queueing_active) IS NOT NULL\n OR COALESCE(metrics.qc_memory_unused_bytes, qc_memory_unused_bytes) IS NOT NULL\n// Get latest values per host (gauges)\n| STATS max_queueing = MAX(COALESCE(metrics.qc_queueing_active, qc_queueing_active)),\n min_memory_unused = MIN(COALESCE(metrics.qc_memory_unused_bytes, qc_memory_unused_bytes))\n BY host.name\n// Alert when queueing is high (>= 5) or memory budget exhausted (< 10MB)\n// Adjust thresholds for your workload\n| WHERE max_queueing >= 5 OR (min_memory_unused IS NOT NULL AND min_memory_unused < 10485760)" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-scrape-down.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-scrape-down.json index 7406688bc29..71a17bea2b2 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-scrape-down.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-scrape-down.json @@ -1,39 +1,52 @@ { - "id": "influxdb_otel-scrape-down", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Scrape target down", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb" - ], - "schedule": { - "interval": "2m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, 
- "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Scrape health: up=0 means target unreachable\n| WHERE COALESCE(metrics.up, up) IS NOT NULL AND host.name IS NOT NULL\n// Max up in window: 0 means host was down for entire window\n| STATS max_up = MAX(COALESCE(metrics.up, up)),\n target = MAX(COALESCE(server.address, \"unknown\"))\n BY host.name\n| WHERE max_up == 0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + "id": "influxdb_otel-scrape-down", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the scrape target `up == 0`. The collector cannot reach the InfluxDB metrics endpoint.", + "name": "[InfluxDB OTel] Scrape target down", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb" + ], + "schedule": { + "interval": "2m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB scrape target down\n\n### What fired\n`up == 0` for the entire window.\n\n### Why it matters\nA down target means either the InfluxDB process is gone, its metrics endpoint is blocked, or the collector is misconfigured.\n\n### Triage\n1. Confirm whether the process is running.\n2. 
Check network/firewall between collector and InfluxDB.\n\n### Remediation\n- Restore process / network / collector config.\n\n### Tuning\n- Fires when `up == 0` for the whole window.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Scrape health: up=0 means target unreachable\n| WHERE COALESCE(metrics.up, up) IS NOT NULL AND host.name IS NOT NULL\n// Max up in window: 0 means host was down for entire window\n| STATS max_up = MAX(COALESCE(metrics.up, up)),\n target = MAX(COALESCE(server.address, \"unknown\"))\n BY host.name\n| WHERE max_up == 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-shard-write-errors.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-shard-write-errors.json index c12890fb27c..565e59cc482 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-shard-write-errors.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-shard-write-errors.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-shard-write-errors", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Shard write errors", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-shard-write-errors", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when shard write error counters increase. 
Indicates failures to persist points to shards.", + "name": "[InfluxDB OTel] Shard write errors", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-http-storage" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Shard write error counters (cumulative)\n| WHERE COALESCE(metrics.storage_shard_write_err_count, storage_shard_write_err_count) IS NOT NULL\n OR COALESCE(metrics.storage_shard_write_err_sum, storage_shard_write_err_sum) IS NOT NULL\n// Detect increase in errors over the window, by host and shard\n| STATS min_count = MIN(COALESCE(metrics.storage_shard_write_err_count, storage_shard_write_err_count)),\n max_count = MAX(COALESCE(metrics.storage_shard_write_err_count, storage_shard_write_err_count)),\n min_sum = MIN(COALESCE(metrics.storage_shard_write_err_sum, storage_shard_write_err_sum)),\n max_sum = MAX(COALESCE(metrics.storage_shard_write_err_sum, storage_shard_write_err_sum))\n BY host.name, COALESCE(attributes.id, attributes.bucket, \"default\")\n| EVAL count_increase = max_count - min_count,\n sum_increase = max_sum - min_sum\n// Alert when any new shard write errors occurred\n| WHERE count_increase > 0 OR sum_increase > 0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB shard write errors\n\n### What fired\nShard write error counters increased during 
the window.\n\n### Why it matters\nShard-level write errors are the last stop before data loss. Often caused by disk pressure, cardinality explosion, or corrupted TSI.\n\n### Triage\n1. Check disk space and I/O health.\n2. Inspect shard directory sizes.\n\n### Remediation\n- Grow disk / reduce retention.\n- Repair or rebuild corrupted shards.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Shard write error counters (cumulative)\n| WHERE COALESCE(metrics.storage_shard_write_err_count, storage_shard_write_err_count) IS NOT NULL\n OR COALESCE(metrics.storage_shard_write_err_sum, storage_shard_write_err_sum) IS NOT NULL\n// Detect increase in errors over the window, by host and shard\n| STATS min_count = MIN(COALESCE(metrics.storage_shard_write_err_count, storage_shard_write_err_count)),\n max_count = MAX(COALESCE(metrics.storage_shard_write_err_count, storage_shard_write_err_count)),\n min_sum = MIN(COALESCE(metrics.storage_shard_write_err_sum, storage_shard_write_err_sum)),\n max_sum = MAX(COALESCE(metrics.storage_shard_write_err_sum, storage_shard_write_err_sum))\n BY host.name, COALESCE(attributes.id, attributes.bucket, \"default\")\n| EVAL count_increase = max_count - min_count,\n sum_increase = max_sum - min_sum\n// Alert when any new shard write errors occurred\n| WHERE count_increase > 0 OR sum_increase > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-task-execution-failures.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-task-execution-failures.json index 47f8f94f8d1..74e039067a7 100644 --- 
a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-task-execution-failures.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-task-execution-failures.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-task-execution-failures", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Task execution failures", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "tasks" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-task-execution-failures", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `task_scheduler_total_execute_failure` increases. Scheduled tasks (downsamples, continuous queries) are failing.", + "name": "[InfluxDB OTel] Task execution failures", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "tasks" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-query-tasks" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Task scheduler execution failure counter (cumulative)\n| WHERE COALESCE(metrics.task_scheduler_total_execute_failure, task_scheduler_total_execute_failure) IS NOT NULL\n// Detect increase in failures over the window\n| STATS min_fail = MIN(COALESCE(metrics.task_scheduler_total_execute_failure, task_scheduler_total_execute_failure)),\n max_fail = MAX(COALESCE(metrics.task_scheduler_total_execute_failure, task_scheduler_total_execute_failure))\n BY host.name\n| EVAL failure_increase = max_fail - min_fail\n// Alert when any new task execution failures occurred\n| WHERE failure_increase > 0" - }, - "size": 0, - "threshold": 
[0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB task execution failures\n\n### What fired\nCounter `task_scheduler_total_execute_failure` increased during the window.\n\n### Why it matters\nFailed tasks mean downsampling, rollups, or alerting flows did not run. Depending on the task, this may cause stale downsamples or missed alerts.\n\n### Triage\n1. Identify failing tasks via the Tasks API.\n2. Inspect task error logs.\n\n### Remediation\n- Fix the task script or its dependencies.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Task scheduler execution failure counter (cumulative)\n| WHERE COALESCE(metrics.task_scheduler_total_execute_failure, task_scheduler_total_execute_failure) IS NOT NULL\n// Detect increase in failures over the window\n| STATS min_fail = MIN(COALESCE(metrics.task_scheduler_total_execute_failure, task_scheduler_total_execute_failure)),\n max_fail = MAX(COALESCE(metrics.task_scheduler_total_execute_failure, task_scheduler_total_execute_failure))\n BY host.name\n| EVAL failure_increase = max_fail - min_fail\n// Alert when any new task execution failures occurred\n| WHERE failure_increase > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-wal-write-errors.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-wal-write-errors.json index 981a1a805d2..c5060d9a519 100644 --- 
a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-wal-write-errors.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-wal-write-errors.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-wal-write-errors", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] WAL write errors", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-wal-write-errors", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `storage_wal_writes_err` increases. WAL write failures risk data loss on restart.", + "name": "[InfluxDB OTel] WAL write errors", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-http-storage" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// WAL write errors counter (cumulative)\n| WHERE COALESCE(metrics.storage_wal_writes_err, storage_wal_writes_err) IS NOT NULL\n// Detect increase in errors over the window\n| STATS min_err = MIN(COALESCE(metrics.storage_wal_writes_err, storage_wal_writes_err)),\n max_err = MAX(COALESCE(metrics.storage_wal_writes_err, storage_wal_writes_err))\n BY host.name\n| EVAL error_increase = max_err - min_err\n// Alert when any new WAL write errors occurred\n| WHERE error_increase > 0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - 
"excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB WAL write errors\n\n### What fired\n`storage_wal_writes_err` increased during the window.\n\n### Why it matters\nWAL write errors mean crash recovery would not replay all writes. A serious durability signal.\n\n### Triage\n1. Check disk health and space.\n2. Inspect disk I/O errors in dmesg / cloud provider logs.\n\n### Remediation\n- Repair or replace the underlying disk.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// WAL write errors counter (cumulative)\n| WHERE COALESCE(metrics.storage_wal_writes_err, storage_wal_writes_err) IS NOT NULL\n// Detect increase in errors over the window\n| STATS min_err = MIN(COALESCE(metrics.storage_wal_writes_err, storage_wal_writes_err)),\n max_err = MAX(COALESCE(metrics.storage_wal_writes_err, storage_wal_writes_err))\n BY host.name\n| EVAL error_increase = max_err - min_err\n// Alert when any new WAL write errors occurred\n| WHERE error_increase > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-writer-timeouts.json b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-writer-timeouts.json index 434d367f83b..8d666967bbf 100644 --- a/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-writer-timeouts.json +++ b/packages/influxdb_otel/kibana/alerting_rule_template/influxdb_otel-writer-timeouts.json @@ -1,40 +1,56 @@ { - "id": "influxdb_otel-writer-timeouts", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[InfluxDB OTel] Storage writer 
timeouts", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "influxdb", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "influxdb_otel-writer-timeouts", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `storage_writer_timeouts` increases. Storage writer timeouts indicate disk or replication pressure.", + "name": "[InfluxDB OTel] Storage writer timeouts", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "influxdb", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "influxdb_otel-http-storage" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-influxdb.otel-*\n// Storage writer timeout counter (cumulative)\n| WHERE COALESCE(metrics.storage_writer_timeouts, storage_writer_timeouts) IS NOT NULL\n// Detect increase in timeouts over the window\n| STATS min_timeout = MIN(COALESCE(metrics.storage_writer_timeouts, storage_writer_timeouts)),\n max_timeout = MAX(COALESCE(metrics.storage_writer_timeouts, storage_writer_timeouts))\n BY host.name\n| EVAL timeout_increase = max_timeout - min_timeout\n// Alert when any new writer timeouts occurred\n| WHERE timeout_increase > 0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "influxdb_otel-overview" } + ], + "investigation_guide": { + "blob": "## InfluxDB storage writer timeouts\n\n### What fired\nCounter `storage_writer_timeouts` increased during the window.\n\n### Why it matters\nWriter timeouts usually mean the storage 
subsystem took too long to acknowledge a write. Consequences: rejected writes and client retries.\n\n### Triage\n1. Check disk IOPS and latency.\n2. Correlate with cache and WAL errors.\n\n### Remediation\n- Upgrade storage or reduce write volume.\n\n### Tuning\n- Fires on any increase.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-influxdb.otel-*\n// Storage writer timeout counter (cumulative)\n| WHERE COALESCE(metrics.storage_writer_timeouts, storage_writer_timeouts) IS NOT NULL\n// Detect increase in timeouts over the window\n| STATS min_timeout = MIN(COALESCE(metrics.storage_writer_timeouts, storage_writer_timeouts)),\n max_timeout = MAX(COALESCE(metrics.storage_writer_timeouts, storage_writer_timeouts))\n BY host.name\n| EVAL timeout_increase = max_timeout - min_timeout\n// Alert when any new writer timeouts occurred\n| WHERE timeout_increase > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/influxdb_otel/manifest.yml b/packages/influxdb_otel/manifest.yml index 84d72cd5aac..43a00279b56 100644 --- a/packages/influxdb_otel/manifest.yml +++ b/packages/influxdb_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: influxdb_otel title: "InfluxDb OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "InfluxDb Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/kafka_otel/changelog.yml b/packages/kafka_otel/changelog.yml index 0fc0d2d9120..c985aa21f36 100644 --- a/packages/kafka_otel/changelog.yml +++ b/packages/kafka_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top 
+- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the Kafka OTel content package diff --git a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-broker-count-low.json b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-broker-count-low.json index 0b3b2b5eaa6..ffe6ace985c 100644 --- a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-broker-count-low.json +++ b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-broker-count-low.json @@ -1,40 +1,54 @@ { - "id": "kafka_otel-broker-count-low", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Kafka OTel] Broker count below threshold", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "kafka" - ], - "schedule": { - "interval": "5m" + "id": "kafka_otel-broker-count-low", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `kafka.brokers` drops below 2. 
A broker loss weakens replication and triggers partition reassignments.", + "name": "[Kafka OTel] Broker count below threshold", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "kafka" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "kafka_otel-overview" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Cluster-level broker count (no attributes)\n| WHERE kafka.brokers IS NOT NULL\n| STATS latest_brokers = MAX(kafka.brokers)\n// Threshold: alert when broker count < expected minimum.\n// Customise the value below for your cluster (e.g. 2 for HA, 1 for any loss)\n| WHERE latest_brokers < 2" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "all", - "excludeHitsFromPreviousRun": false + { + "id": "kafka_otel-replication" } + ], + "investigation_guide": { + "blob": "## Kafka broker count below threshold\n\n### What fired\n`kafka.brokers` metric fell below 2 during the window.\n\n### Why it matters\nA reduced broker count shrinks available capacity and can leave partitions under-replicated. If replication factor equals lost brokers, data becomes unavailable.\n\n### Triage\n1. Check which broker is missing (host-level monitoring).\n2. Correlate with under-replicated partitions.\n3. Inspect controller election events.\n\n### Remediation\n- Bring the missing broker back or replace it.\n- Let the cluster re-replicate affected partitions.\n\n### Tuning\n- Adjust the minimum count (2) to match your cluster size (e.g. 
3 for HA tiers).\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Cluster-level broker count (no attributes)\n| WHERE kafka.brokers IS NOT NULL\n| STATS latest_brokers = MAX(kafka.brokers)\n// Threshold: alert when broker count < expected minimum.\n// Customise the value below for your cluster (e.g. 2 for HA, 1 for any loss)\n| WHERE latest_brokers < 2" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "all", + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-by-topic.json b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-by-topic.json index 3b9865065e9..ea22bd72e1b 100644 --- a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-by-topic.json +++ b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-by-topic.json @@ -1,42 +1,56 @@ { - "id": "kafka_otel-consumer-lag-high-by-topic", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Kafka OTel] High consumer lag by topic", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "kafka" - ], - "schedule": { - "interval": "5m" + "id": "kafka_otel-consumer-lag-high-by-topic", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `kafka.consumer_group.lag_sum > 50000` for a group+topic. 
Aggregate lag indicates consumer processing delay.", + "name": "[Kafka OTel] High consumer lag by topic", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "kafka" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "kafka_otel-consumer-groups" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Topic-level lag sum (aggregate across partitions for a consumer group)\n| WHERE kafka.consumer_group.lag_sum IS NOT NULL\n| STATS lag_sum = MAX(kafka.consumer_group.lag_sum)\n BY attributes.group, attributes.topic\n// Threshold: total lag for group on topic. Adjust for expected throughput\n| WHERE lag_sum > 50000\n| SORT lag_sum DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.group", - "termSize": 20, - "excludeHitsFromPreviousRun": false + { + "id": "kafka_otel-overview" } + ], + "investigation_guide": { + "blob": "## Kafka high consumer lag by topic\n\n### What fired\n`kafka.consumer_group.lag_sum` exceeded 50,000 messages on a group+topic.\n\n### Why it matters\nAggregate lag across partitions is the canonical latency proxy. Sustained growth means consumption cannot keep up with production; downstream SLAs will miss.\n\n### Triage\n1. Check if lag is concentrated on specific partitions.\n2. Inspect consumer instance count and member stability.\n3. 
Look for consumer restarts or pod evictions.\n\n### Remediation\n- Scale the consumer group.\n- Fix slow consumer code paths.\n\n### Tuning\n- 50,000 is a generic starting point; tune to your throughput.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Topic-level lag sum (aggregate across partitions for a consumer group)\n| WHERE kafka.consumer_group.lag_sum IS NOT NULL\n| STATS lag_sum = MAX(kafka.consumer_group.lag_sum)\n BY attributes.group, attributes.topic\n// Threshold: total lag for group on topic. Adjust for expected throughput\n| WHERE lag_sum > 50000\n| SORT lag_sum DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.group", + "termSize": 20, + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-per-partition.json b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-per-partition.json index f77d209a4e0..170f353c59b 100644 --- a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-per-partition.json +++ b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-consumer-lag-high-per-partition.json @@ -1,42 +1,56 @@ { - "id": "kafka_otel-consumer-lag-high-per-partition", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Kafka OTel] High consumer lag per partition", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "kafka" - ], - "schedule": { - "interval": "5m" + "id": "kafka_otel-consumer-lag-high-per-partition", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when per-partition `kafka.consumer_group.lag > 10000`. 
Per-partition lag surfaces hot-spotting and stuck consumers.", + "name": "[Kafka OTel] High consumer lag per partition", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "kafka" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "kafka_otel-consumer-groups" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Per-partition consumer group lag (offset distance behind log-end)\n| WHERE kafka.consumer_group.lag IS NOT NULL\n| STATS max_lag = MAX(kafka.consumer_group.lag)\n BY attributes.group, attributes.topic, attributes.partition\n// Threshold: lag in messages. Adjust for your throughput (e.g. 10000, 100000)\n| WHERE max_lag > 10000\n| SORT max_lag DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.group", - "termSize": 20, - "excludeHitsFromPreviousRun": false + { + "id": "kafka_otel-topics-partitions" } + ], + "investigation_guide": { + "blob": "## Kafka high consumer lag per partition\n\n### What fired\n`kafka.consumer_group.lag > 10,000` on at least one partition.\n\n### Why it matters\nPer-partition lag isolates the specific partitions / consumers struggling. Useful for detecting key-skew or specific consumer crashes.\n\n### Triage\n1. Look at partition assignment (is one consumer owning the hot partition?).\n2. 
Check recent rebalances in the consumer group.\n\n### Remediation\n- Rebalance manually or fix the slow consumer.\n- Re-key producer data to distribute load.\n\n### Tuning\n- 10,000 per partition is generic; tune to throughput.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Per-partition consumer group lag (offset distance behind log-end)\n| WHERE kafka.consumer_group.lag IS NOT NULL\n| STATS max_lag = MAX(kafka.consumer_group.lag)\n BY attributes.group, attributes.topic, attributes.partition\n// Threshold: lag in messages. Adjust for your throughput (e.g. 10000, 100000)\n| WHERE max_lag > 10000\n| SORT max_lag DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.group", + "termSize": 20, + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-empty-consumer-groups.json b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-empty-consumer-groups.json index 0fa4c13d4d1..2946923eb2b 100644 --- a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-empty-consumer-groups.json +++ b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-empty-consumer-groups.json @@ -1,42 +1,56 @@ { - "id": "kafka_otel-empty-consumer-groups", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Kafka OTel] Empty consumer groups", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "kafka" - ], - "schedule": { - "interval": "5m" + "id": "kafka_otel-empty-consumer-groups", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `kafka.consumer_group.members == 0`. 
A group with no members is not consuming \u2014 the group is offline.", + "name": "[Kafka OTel] Empty consumer groups", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "kafka" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "kafka_otel-consumer-groups" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Consumer group member count per group\n| WHERE kafka.consumer_group.members IS NOT NULL\n| STATS members = MAX(kafka.consumer_group.members) BY attributes.group\n// Groups with zero members: no active consumers, consumption paused\n| WHERE members == 0\n| SORT attributes.group ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.group", - "termSize": 20, - "excludeHitsFromPreviousRun": false + { + "id": "kafka_otel-overview" } + ], + "investigation_guide": { + "blob": "## Kafka empty consumer groups\n\n### What fired\n`kafka.consumer_group.members == 0` during the window.\n\n### Why it matters\nAn empty consumer group means no clients are consuming \u2014 messages accumulate, lag grows, and downstream systems stall.\n\n### Triage\n1. Verify whether the group is intentional (one-off migration) or unexpected.\n2. Check consumer instance health and deployment state.\n\n### Remediation\n- Restart consumers.\n- Delete the group if it is truly abandoned.\n\n### Tuning\n- Fires on any empty group. 
Filter out expected one-off groups by name if needed.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Consumer group member count per group\n| WHERE kafka.consumer_group.members IS NOT NULL\n| STATS members = MAX(kafka.consumer_group.members) BY attributes.group\n// Groups with zero members: no active consumers, consumption paused\n| WHERE members == 0\n| SORT attributes.group ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.group", + "termSize": 20, + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-under-replicated-partitions.json b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-under-replicated-partitions.json index 44b703680de..3eb9aeb6231 100644 --- a/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-under-replicated-partitions.json +++ b/packages/kafka_otel/kibana/alerting_rule_template/kafka_otel-under-replicated-partitions.json @@ -1,42 +1,56 @@ { - "id": "kafka_otel-under-replicated-partitions", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Kafka OTel] Under-replicated partitions", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "kafka" - ], - "schedule": { - "interval": "5m" + "id": "kafka_otel-under-replicated-partitions", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `kafka.partition.replicas > kafka.partition.replicas_in_sync` on any partition. 
Replication is degraded and durability is weakened.", + "name": "[Kafka OTel] Under-replicated partitions", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "kafka" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "kafka_otel-replication" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Partition metrics: replicas vs in-sync replicas\n| WHERE kafka.partition.replicas IS NOT NULL AND kafka.partition.replicas_in_sync IS NOT NULL\n| STATS replicas = MAX(kafka.partition.replicas),\n replicas_in_sync = MAX(kafka.partition.replicas_in_sync)\n BY attributes.topic, attributes.partition\n// Alert when any replica is out of sync (replication degraded)\n// Indicates broker failures, network issues, or overloaded followers\n| WHERE replicas > replicas_in_sync\n| SORT replicas_in_sync ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.topic", - "termSize": 20, - "excludeHitsFromPreviousRun": false + { + "id": "kafka_otel-topics-partitions" } + ], + "investigation_guide": { + "blob": "## Kafka under-replicated partitions\n\n### What fired\nAt least one partition has `replicas_in_sync < replicas`.\n\n### Why it matters\nUnder-replicated partitions have weakened durability: a leader loss while under-replicated can lose messages. Usually caused by broker failures or overloaded followers.\n\n### Triage\n1. Check broker count \u2014 is a broker down?\n2. Inspect broker-level saturation metrics (if available outside this receiver).\n3. 
Look at controller logs for replication events.\n\n### Remediation\n- Restore the lost broker.\n- Scale followers if they are CPU/disk bound.\n\n### Tuning\n- Fires on any under-replication. Always P1.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-kafkametricsreceiver.otel-*\n// Partition metrics: replicas vs in-sync replicas\n| WHERE kafka.partition.replicas IS NOT NULL AND kafka.partition.replicas_in_sync IS NOT NULL\n| STATS replicas = MAX(kafka.partition.replicas),\n replicas_in_sync = MAX(kafka.partition.replicas_in_sync)\n BY attributes.topic, attributes.partition\n// Alert when any replica is out of sync (replication degraded)\n// Indicates broker failures, network issues, or overloaded followers\n| WHERE replicas > replicas_in_sync\n| SORT replicas_in_sync ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.topic", + "termSize": 20, + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/kafka_otel/manifest.yml b/packages/kafka_otel/manifest.yml index b9bbffed169..5cad81416e7 100644 --- a/packages/kafka_otel/manifest.yml +++ b/packages/kafka_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.0 name: kafka_otel title: "Kafka OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Kafka Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/memcached_otel/changelog.yml b/packages/memcached_otel/changelog.yml index a73716398c3..b376101b633 100644 --- a/packages/memcached_otel/changelog.yml +++ b/packages/memcached_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields 
to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: First release of the Memcached OpenTelemetry content package diff --git a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-connection-count.json b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-connection-count.json index 52059845cb6..879b8144ce8 100644 --- a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-connection-count.json +++ b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-connection-count.json @@ -1,40 +1,53 @@ { - "id": "memcached_otel-high-connection-count", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Memcached OTel] High connection count", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "memcached" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.connections.current IS NOT NULL\n// Current connections (gauge) — use max over window for peak detection\n| STATS max_connections = MAX(LAST_OVER_TIME(memcached.connections.current))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when connections exceed 800 — adjust for your -c limit\n| WHERE max_connections > 800\n| SORT max_connections DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "memcached_otel-high-connection-count", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts 
when `memcached.connections.current > 800`. Approaching the `-c` limit means new connections will be refused.", + "name": "[Memcached OTel] High connection count", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "memcached" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "memcached_otel-overview" } + ], + "investigation_guide": { + "blob": "## Memcached high connection count\n\n### What fired\nCurrent connections exceeded 800 during the window.\n\n### Why it matters\nMemcached has a fixed connection limit (`-c`, default 1024). Above ~80% of the limit, new connections can be refused silently. Usually caused by client-side connection churn or missing pooling.\n\n### Triage\n1. Check client connection churn with `connections.total` rate.\n2. Inspect client pool sizing.\n\n### Remediation\n- Enable/tune client-side pooling.\n- Raise `-c` if legitimately needed.\n\n### Tuning\n- 800 assumes 1024 limit; tune to your `-c` setting.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.connections.current IS NOT NULL\n// Current connections (gauge) \u2014 use max over window for peak detection\n| STATS max_connections = MAX(LAST_OVER_TIME(memcached.connections.current))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when connections exceed 800 \u2014 adjust for your -c limit\n| WHERE max_connections > 800\n| SORT max_connections DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git 
a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-cpu-usage.json b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-cpu-usage.json index 5b48759055f..87f0fb668c0 100644 --- a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-cpu-usage.json +++ b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-cpu-usage.json @@ -1,40 +1,53 @@ { - "id": "memcached_otel-high-cpu-usage", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Memcached OTel] High CPU usage", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "memcached" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.cpu.usage IS NOT NULL\n// CPU usage is cumulative seconds (system + user) — RATE gives cores in use\n| STATS cpu_cores = SUM(RATE(memcached.cpu.usage))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when CPU exceeds 0.8 cores — memcached is typically CPU-light\n| WHERE cpu_cores > 0.8\n| SORT cpu_cores DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "memcached_otel-high-cpu-usage", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when Memcached CPU usage exceeds 0.8 cores. 
Memcached is typically CPU-light; high CPU indicates heavy workload or pathological access patterns.", + "name": "[Memcached OTel] High CPU usage", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "memcached" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "memcached_otel-overview" } + ], + "investigation_guide": { + "blob": "## Memcached high CPU usage\n\n### What fired\nCPU usage rate exceeded 0.8 cores during the window.\n\n### Why it matters\nBecause command processing runs on a fixed pool of worker threads (`-t`, default 4), Memcached's CPU usage tracks roughly linearly with command rate. High CPU is unusual and often indicates very large values, heavy traffic, or pathological access patterns.\n\n### Triage\n1. Correlate with command throughput.\n2. Check item sizes and large-value usage.\n\n### Remediation\n- Scale horizontally.\n- Reduce value size or client-side work-split.\n\n### Tuning\n- 0.8 cores for a busy cache; tune to your thread count (`-t`).\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.cpu.usage IS NOT NULL\n// CPU usage is cumulative seconds (system + user) \u2014 RATE gives cores in use\n| STATS cpu_cores = SUM(RATE(memcached.cpu.usage))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when CPU exceeds 0.8 cores \u2014 memcached is typically CPU-light\n| WHERE cpu_cores > 0.8\n| SORT cpu_cores DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-eviction-rate.json 
b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-eviction-rate.json index 8f1e2122d0c..e06c95f61f3 100644 --- a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-eviction-rate.json +++ b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-high-eviction-rate.json @@ -1,40 +1,53 @@ { - "id": "memcached_otel-high-eviction-rate", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Memcached OTel] High eviction rate", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "memcached" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.evictions IS NOT NULL\n// Eviction rate (evictions/sec) — rising evictions indicate memory pressure\n| STATS eviction_rate = SUM(RATE(memcached.evictions))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when eviction rate exceeds 10/sec — tune for your workload\n| WHERE eviction_rate > 10.0\n| SORT eviction_rate DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "memcached_otel-high-eviction-rate", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the eviction rate exceeds 10/sec. 
Rising evictions signal memory pressure and will drop the hit ratio.", + "name": "[Memcached OTel] High eviction rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "memcached" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "memcached_otel-overview" } + ], + "investigation_guide": { + "blob": "## Memcached high eviction rate\n\n### What fired\n`memcached.evictions` rate exceeded 10/sec over the window.\n\n### Why it matters\nEvictions happen when memory is full. Elevated eviction rates indicate the working set no longer fits, degrading hit ratio and driving load on the backing store.\n\n### Triage\n1. Compare against hit-ratio trend.\n2. Check for recent traffic or key-distribution changes.\n\n### Remediation\n- Increase memory (`-m`).\n- Reduce TTL or remove large keys.\n\n### Tuning\n- 10/sec is a generic threshold; tune to workload.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.evictions IS NOT NULL\n// Eviction rate (evictions/sec) \u2014 rising evictions indicate memory pressure\n| STATS eviction_rate = SUM(RATE(memcached.evictions))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when eviction rate exceeds 10/sec \u2014 tune for your workload\n| WHERE eviction_rate > 10.0\n| SORT eviction_rate DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-command-throughput.json 
b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-command-throughput.json index 35cb12fb7f0..8ca544c5c25 100644 --- a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-command-throughput.json +++ b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-command-throughput.json @@ -1,40 +1,53 @@ { - "id": "memcached_otel-low-command-throughput", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Memcached OTel] Low command throughput", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "memcached" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.commands IS NOT NULL\n// Total command rate across all types (get, set, flush, touch)\n| STATS cmd_rate = SUM(RATE(memcached.commands))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when throughput drops below 1 cmd/sec — indicates cache is idle or unreachable\n| WHERE cmd_rate < 1.0\n| SORT cmd_rate ASC\n| LIMIT 20" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "memcached_otel-low-command-throughput", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the command rate drops below 1/sec. 
The cache is idle or unreachable by clients.", + "name": "[Memcached OTel] Low command throughput", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "memcached" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "memcached_otel-overview" } + ], + "investigation_guide": { + "blob": "## Memcached low command throughput\n\n### What fired\nCommand rate dropped below 1/sec during the window.\n\n### Why it matters\nA near-zero command rate either means clients have all disconnected or stopped issuing requests. Application layer almost certainly has an outage.\n\n### Triage\n1. Check Memcached connection count.\n2. Inspect client-side application health.\n3. Verify network between clients and Memcached.\n\n### Remediation\n- Fix the client or network issue.\n\n### Tuning\n- Fires on < 1 cmd/sec. Filter out maintenance windows.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-memcachedreceiver.otel-*\n| WHERE memcached.commands IS NOT NULL\n// Total command rate across all types (get, set, flush, touch)\n| STATS cmd_rate = SUM(RATE(memcached.commands))\n BY host.name, COALESCE(service.instance.id, host.name)\n// Alert when throughput drops below 1 cmd/sec \u2014 indicates cache is idle or unreachable\n| WHERE cmd_rate < 1.0\n| SORT cmd_rate ASC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-hit-ratio.json b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-hit-ratio.json index a9f0be05988..37b9defa2df 100644 --- 
a/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-hit-ratio.json +++ b/packages/memcached_otel/kibana/alerting_rule_template/memcached_otel-low-hit-ratio.json @@ -1,40 +1,53 @@ { - "id": "memcached_otel-low-hit-ratio", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Memcached OTel] Low hit ratio for get operations", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "memcached" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "enabled": true, - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-memcachedreceiver.otel-*\n| WHERE memcached.operation_hit_ratio IS NOT NULL\n| WHERE operation == \"get\" OR attributes.operation == \"get\"\n// Bug: operation_hit_ratio actually reports miss ratio (see OTel contrib #30695)\n// True hit ratio = 100 - miss_ratio. Alert when hit ratio < 80% (miss > 20%)\n| STATS miss_ratio_pct = AVG(memcached.operation_hit_ratio), hit_ratio = 100.0 - AVG(memcached.operation_hit_ratio)\n BY host.name, COALESCE(service.instance.id, host.name)\n| WHERE miss_ratio_pct > 20.0\n| SORT miss_ratio_pct DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10, - "excludeHitsFromPreviousRun": true + "id": "memcached_otel-low-hit-ratio", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the get-operation miss ratio exceeds 20% (hit ratio < 80%). 
Low hit ratios mean the cache is ineffective.", + "name": "[Memcached OTel] Low hit ratio for get operations", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "memcached" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "memcached_otel-overview" } + ], + "investigation_guide": { + "blob": "## Memcached low hit ratio for get operations\n\n### What fired\nTrue hit ratio (`100 - operation_hit_ratio`) dropped below 80%.\n\n### Why it matters\nLow cache hit ratios mean the backing store absorbs more traffic, raising latency and load. Causes: cold cache post-restart, working-set shift, insufficient memory.\n\n### Triage\n1. Check evictions trend \u2014 high evictions correlate with low hit ratio.\n2. Inspect recent restarts / deploys.\n3. Look at key-distribution patterns.\n\n### Remediation\n- Increase memory.\n- Adjust TTLs.\n\n### Tuning\n- `operation_hit_ratio` actually reports miss ratio (upstream bug); the rule accounts for this.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-memcachedreceiver.otel-*\n| WHERE memcached.operation_hit_ratio IS NOT NULL\n| WHERE operation == \"get\" OR attributes.operation == \"get\"\n// Bug: operation_hit_ratio actually reports miss ratio (see OTel contrib #30695)\n// True hit ratio = 100 - miss_ratio. 
Alert when hit ratio < 80% (miss > 20%)\n| STATS miss_ratio_pct = AVG(memcached.operation_hit_ratio), hit_ratio = 100.0 - AVG(memcached.operation_hit_ratio)\n BY host.name, COALESCE(service.instance.id, host.name)\n| WHERE miss_ratio_pct > 20.0\n| SORT miss_ratio_pct DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/memcached_otel/manifest.yml b/packages/memcached_otel/manifest.yml index 70278fd8958..99c49246281 100644 --- a/packages/memcached_otel/manifest.yml +++ b/packages/memcached_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: memcached_otel title: "Memcached OpenTelemetry assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Memcached Assets for OpenTelemetry Collector" @@ -11,7 +11,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/microsoft_sqlserver_otel/changelog.yml b/packages/microsoft_sqlserver_otel/changelog.yml index df13fcf10bc..54eda1c1bec 100644 --- a/packages/microsoft_sqlserver_otel/changelog.yml +++ b/packages/microsoft_sqlserver_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: First release of the Microsoft SQL Server Assets for OpenTelemetry Collector. 
diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-blocked-processes.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-blocked-processes.json index bfdd8d1c997..79a5dc3706f 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-blocked-processes.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-blocked-processes.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.processes.blocked > 0`. Any active blocking chain degrades throughput for multiple sessions.", "name": "[SQL Server OTel] Blocked processes", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-concurrency-errors" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server blocked processes\n\n### What fired\n`sqlserver.processes.blocked > 0` during the window.\n\n### Why it matters\nBlocked processes are queries waiting on locks held by others. Long or frequent blocking cascades into thread exhaustion and timeouts.\n\n### Triage\n1. Use the Concurrency dashboard to see blocking chains.\n2. 
Check `sqlserver.lock.wait.rate` for corroboration.\n\n### Remediation\n- Kill the head blocker if appropriate.\n- Optimise transactions to reduce lock hold time.\n\n### Tuning\n- Fires on any non-zero value.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Non-zero blocked processes means active blocking chains\n| WHERE metrics.sqlserver.processes.blocked IS NOT NULL\n| STATS max_blocked = MAX(metrics.sqlserver.processes.blocked),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE max_blocked > 0\n| KEEP instance_id, max_blocked" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-buffer-cache-hit-ratio-low.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-buffer-cache-hit-ratio-low.json index 720e77de44e..67c7926a4e4 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-buffer-cache-hit-ratio-low.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-buffer-cache-hit-ratio-low.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the buffer cache hit ratio drops below 95%. 
A low ratio means disk reads are replacing cache hits \u2014 memory pressure.", "name": "[SQL Server OTel] Buffer cache hit ratio low", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-memory-saturation" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server buffer cache hit ratio low\n\n### What fired\nBuffer cache hit ratio dropped below 95% during the window.\n\n### Why it matters\nSQL Server serves most pages from the buffer pool. A dropping hit ratio means more reads are forced to disk \u2014 a classic memory-pressure signal.\n\n### Triage\n1. Check page life expectancy and free-list stalls.\n2. Inspect memory grants pending.\n3. Look for large queries pushing buffer pool.\n\n### Remediation\n- Add memory.\n- Tune problem queries.\n\n### Tuning\n- 95% threshold; some workloads tolerate 90-95%.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Instance-level metric: buffer cache hit ratio as percentage (0-100)\n| WHERE metrics.sqlserver.page.buffer_cache.hit_ratio IS NOT NULL\n| STATS min_ratio = MIN(metrics.sqlserver.page.buffer_cache.hit_ratio),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n// Ideally > 95%; drops mean disk reads replacing cache hits\n| WHERE min_ratio < 95.0\n| KEEP instance_id, min_ratio" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-database-io-latency-high.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-database-io-latency-high.json index 
4791ebe24e6..6b01aa63913 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-database-io-latency-high.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-database-io-latency-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when average per-operation I/O latency exceeds 100 ms. Storage pressure degrades overall database performance.", "name": "[SQL Server OTel] Database I/O latency high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-database-io" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server database I/O latency high\n\n### What fired\n`INCREASE(sqlserver.database.latency) / INCREASE(sqlserver.database.operations) > 0.1 s` during the window.\n\n### Why it matters\nHigh average I/O latency per operation means the storage backend is the bottleneck. Every query reading or writing waits longer.\n\n### Triage\n1. Split by file_type (data, log, filestream) and direction to localise.\n2. Check OS wait types for PAGEIOLATCH* or WRITELOG.\n3. 
Inspect storage layer metrics.\n\n### Remediation\n- Migrate files to faster storage.\n- Reduce I/O-heavy workloads.\n\n### Tuning\n- 100 ms threshold; tighten for OLTP workloads.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-sqlserverreceiver.otel-default\n// Per-database file I/O latency (counter); INCREASE over window indicates storage pressure\n| WHERE metrics.sqlserver.database.latency IS NOT NULL\n AND metrics.sqlserver.database.operations IS NOT NULL\n| STATS lat_increase = SUM(INCREASE(metrics.sqlserver.database.latency)),\n ops_increase = SUM(INCREASE(metrics.sqlserver.database.operations))\n BY instance_id = COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\"),\n database_name = COALESCE(resource.attributes.sqlserver.database.name, \"\"),\n file_type = COALESCE(attributes.file_type, file_type, \"\"),\n direction = COALESCE(attributes.direction, direction, \"\")\n// Avg latency per op > 0.1s (100ms) when we have operations in the window\n| WHERE ops_increase > 0 AND (lat_increase / ops_increase) > 0.1\n| KEEP instance_id, database_name, file_type, direction, lat_increase, ops_increase" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-deadlocks-detected.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-deadlocks-detected.json index fa787588587..42511dcc186 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-deadlocks-detected.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-deadlocks-detected.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.deadlock.rate > 0`. 
Sustained non-zero deadlock rate is a serious concurrency problem.", "name": "[SQL Server OTel] Deadlocks detected", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-concurrency-errors" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server deadlocks detected\n\n### What fired\n`sqlserver.deadlock.rate > 0` during the window.\n\n### Why it matters\nDeadlocks abort one or more transactions to break a cycle. Sustained deadlocks point to inconsistent locking order or overly broad transactions.\n\n### Triage\n1. Examine deadlock XML graphs from Extended Events.\n2. Look for hot tables / indexes.\n\n### Remediation\n- Enforce consistent access order.\n- Use appropriate isolation levels and smaller transactions.\n\n### Tuning\n- Fires on any non-zero rate.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Any non-zero sustained deadlock rate is a serious concurrency problem\n| WHERE metrics.sqlserver.deadlock.rate IS NOT NULL\n| STATS max_deadlocks = MAX(metrics.sqlserver.deadlock.rate),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE max_deadlocks > 0\n| KEEP instance_id, max_deadlocks" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-execution-errors.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-execution-errors.json index 43beeae14f0..8fa4ab237ec 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-execution-errors.json +++ 
b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-execution-errors.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.database.execution.errors > 0`. Any engine-reported execution errors warrant investigation.", "name": "[SQL Server OTel] Execution errors", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-concurrency-errors" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server execution errors\n\n### What fired\nExecution error counter increased during the window.\n\n### Why it matters\nThe engine counts failed executions across the instance. Sustained counts usually correlate with deadlocks, permission issues, or application bugs.\n\n### Triage\n1. Correlate with deadlocks, lock timeouts, and memory grant pending counts.\n2. 
Inspect application logs for the affected queries.\n\n### Remediation\n- Fix the root cause in the application or DB config.\n\n### Tuning\n- Fires on any non-zero total.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Direct error count from the engine\n| WHERE metrics.sqlserver.database.execution.errors IS NOT NULL\n| STATS total_errors = SUM(metrics.sqlserver.database.execution.errors),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE total_errors > 0\n| KEEP instance_id, total_errors" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-free-list-stalls.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-free-list-stalls.json index fbc83616157..f2683355582 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-free-list-stalls.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-free-list-stalls.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.page.buffer_cache.free_list.stalls.rate > 0`. 
Free-list stalls mean extreme memory pressure on the buffer pool.", "name": "[SQL Server OTel] Free list stalls", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-memory-saturation" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server free list stalls\n\n### What fired\nFree-list stall rate exceeded 0 during the window.\n\n### Why it matters\nSQL Server stalls when it cannot find a free page in the buffer pool \u2014 a near-catastrophic memory pressure event.\n\n### Triage\n1. Check page life expectancy and lazy writer activity.\n2. Look at memory grants pending and query memory waits.\n\n### Remediation\n- Add memory immediately.\n- Kill or throttle memory-intensive queries.\n\n### Tuning\n- Fires on any non-zero rate.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Buffer pool cannot find a free page — indicates extreme memory pressure\n| WHERE metrics.sqlserver.page.buffer_cache.free_list.stalls.rate IS NOT NULL\n| STATS max_stalls = MAX(metrics.sqlserver.page.buffer_cache.free_list.stalls.rate),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE max_stalls > 0\n| KEEP instance_id, max_stalls" + "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Buffer pool cannot find a free page \u2014 indicates extreme memory pressure\n| WHERE metrics.sqlserver.page.buffer_cache.free_list.stalls.rate IS NOT NULL\n| STATS max_stalls = MAX(metrics.sqlserver.page.buffer_cache.free_list.stalls.rate),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE max_stalls > 0\n| KEEP instance_id, max_stalls" }, "size": 0, - 
"threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-timeouts.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-timeouts.json index 418e5e1f80b..0678e731273 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-timeouts.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-timeouts.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.lock.timeout.rate > 0`. Queries giving up waiting for locks indicate contention beyond clients' patience.", "name": "[SQL Server OTel] Lock timeouts", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-concurrency-errors" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server lock timeouts\n\n### What fired\nLock timeout rate exceeded 0 during the window.\n\n### Why it matters\nTimeouts mean queries failed rather than waited \u2014 worse user experience than blocking alone. Usually paired with rising lock waits and blocked processes.\n\n### Triage\n1. Look at blocked processes and lock-wait rate on the same host.\n2. 
Check application-side lock timeout settings.\n\n### Remediation\n- Reduce transaction hold time.\n- Apply row-versioning / snapshot isolation where safe.\n\n### Tuning\n- Fires on any non-zero rate.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Lock timeouts: queries giving up waiting for locks\n| WHERE metrics.sqlserver.lock.timeout.rate IS NOT NULL\n| STATS max_timeouts = MAX(metrics.sqlserver.lock.timeout.rate),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE max_timeouts > 0\n| KEEP instance_id, max_timeouts" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-wait-rate-high.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-wait-rate-high.json index 076fceef84f..60ae3ccb306 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-wait-rate-high.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-lock-wait-rate-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when lock waits exceed 10 requests/sec. 
High wait rate means queries routinely block on each other.", "name": "[SQL Server OTel] Lock wait rate high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-concurrency-errors" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server lock wait rate high\n\n### What fired\nAverage `sqlserver.lock.wait.rate` exceeded 10/sec over the window.\n\n### Why it matters\nLock waits indicate contention even when no timeouts have occurred. Sustained high wait rates correlate with blocked processes and throughput degradation.\n\n### Triage\n1. Profile top wait types.\n2. Identify hot tables / indexes.\n\n### Remediation\n- Tune indexes / query plans.\n- Use snapshot isolation where appropriate.\n\n### Tuning\n- 10/sec is a moderate starting point.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Lock requests resulting in waits per second; high values mean queries blocking each other\n| WHERE metrics.sqlserver.lock.wait.rate IS NOT NULL\n| STATS avg_wait_rate = AVG(metrics.sqlserver.lock.wait.rate),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n// Adjust threshold (requests/s) based on expected workload; 10 is a starting point\n| WHERE avg_wait_rate > 10.0\n| KEEP instance_id, avg_wait_rate" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-login-rate-spike.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-login-rate-spike.json index aa1204dc8ef..2cca4bba08c 100644 --- 
a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-login-rate-spike.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-login-rate-spike.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.login.rate > 50/sec`. A sudden login spike often indicates connection-pool misconfiguration or a connection storm.", "name": "[SQL Server OTel] Login rate spike", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server login rate spike\n\n### What fired\nAverage login rate exceeded 50/sec over the window.\n\n### Why it matters\nApplication clients should reuse pooled connections. A login storm suggests a pool is disabled or cycling rapidly, which is expensive (auth, reset, etc.) and can exhaust the worker thread budget.\n\n### Triage\n1. Identify which clients are spiking.\n2. 
Check pool settings (min/max, connection TTL).\n\n### Remediation\n- Fix client pooling.\n\n### Tuning\n- 50/sec assumes an OLTP baseline near zero; tune to your workload.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// High login rate may indicate connection pool misconfiguration or connection storm\n| WHERE metrics.sqlserver.login.rate IS NOT NULL\n| STATS avg_login_rate = AVG(metrics.sqlserver.login.rate),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n// Adjust threshold (logins/s) based on baseline; 50/s is a starting point for investigation\n| WHERE avg_login_rate > 50.0\n| KEEP instance_id, avg_login_rate" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-memory-grants-pending.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-memory-grants-pending.json index fb116fc68dd..03f7d6e1d5e 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-memory-grants-pending.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-memory-grants-pending.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.memory.grants.pending.count > 0`. 
Queries unable to get memory grants cannot execute.", "name": "[SQL Server OTel] Memory grants pending", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-memory-saturation" + }, + { + "id": "microsoft_sqlserver_otel-query-performance" + } + ], + "investigation_guide": { + "blob": "## SQL Server memory grants pending\n\n### What fired\nMemory grants pending > 0 during the window.\n\n### Why it matters\nQueries request memory for sorts / hashes before executing. Pending grants mean the workspace memory is exhausted \u2014 queries wait and may time out.\n\n### Triage\n1. Check memory configuration.\n2. Look at queries with large memory grants (sorts, hashes).\n\n### Remediation\n- Add memory or tune queries.\n\n### Tuning\n- Fires on any non-zero value.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Queries waiting for memory grants cannot execute; any sustained non-zero is critical\n| WHERE metrics.sqlserver.memory.grants.pending.count IS NOT NULL\n| STATS max_pending = MAX(metrics.sqlserver.memory.grants.pending.count),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n| WHERE max_pending > 0\n| KEEP instance_id, max_pending" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-os-wait-duration-high.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-os-wait-duration-high.json index 9fafe4ab95d..6b219d84dfb 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-os-wait-duration-high.json +++ 
b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-os-wait-duration-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when OS wait time delta exceeds 60 seconds within the window. Sustained waits point to storage/CPU pressure.", "name": "[SQL Server OTel] OS wait duration high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-database-io" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server OS wait duration high\n\n### What fired\n`INCREASE(sqlserver.os.wait.duration) > 60s` during the window.\n\n### Why it matters\nOS wait types (PAGEIOLATCH_SH, WRITELOG, SOS_SCHEDULER_YIELD) expose the dominant resource constraint. Sustained waits mean the DB is I/O- or CPU-bound.\n\n### Triage\n1. Split by wait_type to find the dominant category.\n2. 
Correlate with disk latency / CPU metrics.\n\n### Remediation\n- Address the underlying resource pressure.\n\n### Tuning\n- 60s across all wait types in the window; dive deeper per wait_type.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-sqlserverreceiver.otel-default\n// Cumulative wait time by wait type (counter); PAGEIOLATCH_SH, WRITELOG, SOS_SCHEDULER_YIELD indicate I/O/CPU pressure\n| WHERE metrics.sqlserver.os.wait.duration IS NOT NULL\n| STATS wait_increase = SUM(INCREASE(metrics.sqlserver.os.wait.duration))\n BY instance_id = COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\"),\n wait_type = COALESCE(attributes.wait.type, wait.type, \"\")\n// Alert when wait time increased by > 60s in the window (indicates sustained waits)\n| WHERE wait_increase > 60.0\n| KEEP instance_id, wait_type, wait_increase" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-page-life-expectancy-low.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-page-life-expectancy-low.json index 970cf75b3e1..190e4d07315 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-page-life-expectancy-low.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-page-life-expectancy-low.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `sqlserver.page.life_expectancy < 300` seconds. 
PLE below 300 is the canonical memory-pressure threshold.", "name": "[SQL Server OTel] Page life expectancy low", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-memory-saturation" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server page life expectancy low\n\n### What fired\nPLE dropped below 300 seconds during the window.\n\n### Why it matters\nPage life expectancy is how long a page stays in the buffer pool. The well-known 300s threshold is a severe memory pressure indicator.\n\n### Triage\n1. Correlate with free-list stalls, buffer hit ratio, memory grants pending.\n\n### Remediation\n- Add memory.\n- Optimise bad queries / indexes.\n\n### Tuning\n- 300s; raise for larger memory footprints (some DBAs use 300 * GB_of_buffer_pool / 4).\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Instance-level metric: buffer pool page lifetime in seconds\n| WHERE metrics.sqlserver.page.life_expectancy IS NOT NULL\n| STATS min_ple = MIN(metrics.sqlserver.page.life_expectancy),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n// PLE below 300s indicates severe memory pressure; adjust threshold for your environment\n| WHERE min_ple < 300\n| KEEP instance_id, min_ple" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-space-low.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-space-low.json index 9ea0a5f856c..ab42b55a230 100644 --- 
a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-space-low.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-space-low.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when TempDB free space drops below 100 MB. TempDB exhaustion affects sorts, hashes, snapshots, and version store.", "name": "[SQL Server OTel] TempDB space low", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-database-io" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server TempDB space low\n\n### What fired\nTempDB free space dropped below 100 MB (102400 KB) during the window.\n\n### Why it matters\nTempDB is used by every connection for sorts, spills, and version store. Running out crashes many query patterns.\n\n### Triage\n1. Check version store size and long-running open transactions.\n2. 
Inspect large sorts / hash joins.\n\n### Remediation\n- Kill the long-running transaction holding the version store.\n- Grow TempDB files.\n\n### Tuning\n- 100 MB default; tune to TempDB total size.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// TempDB space by state (free/used); filter for free space\n| WHERE metrics.sqlserver.database.tempdb.space IS NOT NULL\n| WHERE COALESCE(attributes.tempdb.state, tempdb.state) == \"free\"\n| STATS min_free_kb = MIN(metrics.sqlserver.database.tempdb.space),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n// Alert when free space below 100MB (102400 KB); adjust for your TempDB size\n| WHERE min_free_kb < 102400\n| KEEP instance_id, min_free_kb" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-version-store-large.json b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-version-store-large.json index 31b62340782..14b18ad7086 100644 --- a/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-version-store-large.json +++ b/packages/microsoft_sqlserver_otel/kibana/alerting_rule_template/microsoft_sqlserver_otel-tempdb-version-store-large.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when TempDB version store exceeds 1 GB. 
Large version stores indicate a long-running transaction preventing cleanup.", "name": "[SQL Server OTel] TempDB version store large", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "microsoft_sqlserver_otel-database-io" + }, + { + "id": "microsoft_sqlserver_otel-overview" + } + ], + "investigation_guide": { + "blob": "## SQL Server TempDB version store large\n\n### What fired\nVersion store size exceeded 1 GB during the window.\n\n### Why it matters\nVersion store rows are created for snapshot isolation and read-committed snapshot. They persist until the oldest open transaction ends. An unbounded growth indicates a leaked or zombie transaction.\n\n### Triage\n1. Find the oldest open transaction.\n2. Check for stuck application instances.\n\n### Remediation\n- Kill the stuck transaction after confirming.\n\n### Tuning\n- 1 GB default; tune to TempDB capacity.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-sqlserverreceiver.otel-default\n// Version store growth can exhaust TempDB; affects snapshot isolation, sorts, spills\n| WHERE metrics.sqlserver.database.tempdb.version_store.size IS NOT NULL\n| STATS max_version_store_kb = MAX(metrics.sqlserver.database.tempdb.version_store.size),\n instance_id = MAX(resource.attributes.service.instance.id)\n BY COALESCE(resource.attributes.service.instance.id, host.name, \"unknown\")\n// Alert when version store exceeds 1GB (1048576 KB); adjust for your TempDB capacity\n| WHERE max_version_store_kb > 1048576\n| KEEP instance_id, max_version_store_kb" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 10, diff --git a/packages/microsoft_sqlserver_otel/manifest.yml b/packages/microsoft_sqlserver_otel/manifest.yml index 3a066a0a1a6..42785f84d81 100644 --- a/packages/microsoft_sqlserver_otel/manifest.yml 
+++ b/packages/microsoft_sqlserver_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: microsoft_sqlserver_otel title: "Microsoft SQL Server Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Microsoft SQL Server Assets" @@ -12,7 +12,7 @@ categories: - observability conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/mongodb_otel/changelog.yml b/packages/mongodb_otel/changelog.yml index 3c0d28f8fd6..e8d749d63a2 100644 --- a/packages/mongodb_otel/changelog.yml +++ b/packages/mongodb_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial release of the MongoDB OpenTelemetry content pack with dashboards, alert rules, and SLO templates diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-connection-exhaustion.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-connection-exhaustion.json index c5610068d90..e01a79dd79b 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-connection-exhaustion.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-connection-exhaustion.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-connection-exhaustion", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when `mongodb.connection.count (type=available) < 10`. 
Low available connections mean clients are about to hit the pool limit.", "name": "[MongoDB OTel] Connection exhaustion", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-capacity" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB connection exhaustion\n\n### What fired\nMinimum `mongodb.connection.count` with `type=available` dropped below 10 during the window.\n\n### Why it matters\nEach Mongo host has a bounded connection pool. Running out means clients get connection errors and retry storms can overwhelm the server.\n\n### Triage\n1. Check client pool configuration.\n2. Identify clients holding many idle connections.\n\n### Remediation\n- Raise the pool limit.\n- Fix client-side connection leaks.\n\n### Tuning\n- Threshold 10; tune to your pool size.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-cursor-timeouts.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-cursor-timeouts.json index 68db702ab72..c45a3e92d12 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-cursor-timeouts.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-cursor-timeouts.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-cursor-timeouts", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when `mongodb.cursor.timeout.count` increases. 
Cursor timeouts mean clients can't consume result batches in time.", "name": "[MongoDB OTel] Cursor timeouts", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-operations" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB cursor timeouts\n\n### What fired\n`mongodb.cursor.timeout.count` increased during the window.\n\n### Why it matters\nMongoDB cursors expire after 10 minutes of inactivity. Timeouts mean the application did not iterate fast enough or held the cursor without use.\n\n### Triage\n1. Check application query patterns and batch sizes.\n2. Look for cursor leaks.\n\n### Remediation\n- Fix application to iterate cursors promptly or use `noCursorTimeout` carefully.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-global-lock-contention.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-global-lock-contention.json index 1c2fd8397ea..b13798683be 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-global-lock-contention.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-global-lock-contention.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-global-lock-contention", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when the global lock is held more than 500 ms/sec on average. 
High lock time severely limits throughput.", "name": "[MongoDB OTel] Global lock contention", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-operations" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB global lock contention\n\n### What fired\n`RATE(mongodb.global_lock.time) > 500 ms/sec` during the window.\n\n### Why it matters\nMongoDB uses reader-writer locks at various scopes. Sustained high global lock time means operations are queued behind each other \u2014 throughput collapses.\n\n### Triage\n1. Look at write-heavy collections.\n2. Check WT cache pressure.\n\n### Remediation\n- Shard to spread locking.\n- Reduce write contention in hot collections.\n\n### Tuning\n- 500 ms/sec threshold; tune to workload.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cache-miss-rate.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cache-miss-rate.json index 68a582df708..07535e19b93 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cache-miss-rate.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cache-miss-rate.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-high-cache-miss-rate", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when the WiredTiger cache miss rate rises. 
Low cache hit ratios mean reads are going to disk.", "name": "[MongoDB OTel] High cache miss rate", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-capacity" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB high cache miss rate\n\n### What fired\nPer-server cache miss rate rose above the computed threshold.\n\n### Why it matters\nWiredTiger's internal cache is the main performance buffer. Rising miss rate means working set has exceeded cache or cache has been evicted \u2014 reads slow dramatically.\n\n### Triage\n1. Check cache sizing.\n2. Look at recent workload changes.\n\n### Remediation\n- Add memory / raise `wiredTigerCacheSizeGB`.\n- Tune queries / indexes to reduce scanned data.\n\n### Tuning\n- Fires on relative miss rate growth; thresholds tune in the query.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cursor-count.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cursor-count.json index 1142aaa781e..b935f351001 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cursor-count.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-cursor-count.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-high-cursor-count", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when open cursor count exceeds 1000. 
Possible cursor leak.", "name": "[MongoDB OTel] High cursor count", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-operations" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB high cursor count\n\n### What fired\nCursor count exceeded 1000 during the window.\n\n### Why it matters\nLarge open-cursor counts usually mean applications are forgetting to close cursors or iterating them very slowly. The server will time them out eventually but in the meantime they hold memory and connection resources.\n\n### Triage\n1. Sample `db.serverStatus().metrics.cursor` to find leakers.\n2. Inspect application query patterns.\n\n### Remediation\n- Fix cursor handling in the application.\n\n### Tuning\n- 1000 threshold; tune to workload.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-memory-usage.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-memory-usage.json index ae2fbcbe4b2..96b696b6dc7 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-memory-usage.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-memory-usage.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-high-memory-usage", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when resident memory exceeds 4 GB. 
Adjust threshold to match host RAM.", "name": "[MongoDB OTel] High memory usage", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-capacity" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB high memory usage\n\n### What fired\nResident memory exceeded 4 GB during the window.\n\n### Why it matters\nMongoDB's memory footprint is dominated by WiredTiger cache and connections. Crossing the host ceiling causes OOM or swap thrashing.\n\n### Triage\n1. Compare against host RAM.\n2. Inspect cache sizing and working-set size.\n\n### Remediation\n- Scale host.\n- Reduce cache size or connections.\n\n### Tuning\n- 4 GB is placeholder; set to host RAM specific threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-operation-latency.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-operation-latency.json index eb8787675a8..f9276b4ef3c 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-operation-latency.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-operation-latency.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-high-operation-latency", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when average operation time per op exceeds threshold by operation type. 
Identifies slow operations.", "name": "[MongoDB OTel] High operation latency", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-operations" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB high operation latency\n\n### What fired\nPer-operation-type latency rose above the computed threshold.\n\n### Why it matters\nRising average operation latency indicates server- or storage-side slowdowns. Break down by operation type (insert/query/update/command) to localise.\n\n### Triage\n1. Check disk I/O and WT cache hit.\n2. Look for slow queries in profiler logs.\n\n### Remediation\n- Fix slow queries / missing indexes.\n- Scale storage.\n\n### Tuning\n- Thresholds tune in the query.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-session-count.json b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-session-count.json index 386b4519aee..b1bbac800de 100644 --- a/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-session-count.json +++ b/packages/mongodb_otel/kibana/alerting_rule_template/mongodb_otel-high-session-count.json @@ -2,15 +2,36 @@ "id": "mongodb_otel-high-session-count", "type": "alerting_rule_template", "attributes": { + "description": "Alerts when active sessions exceed 500. 
Possible connection pool misconfiguration.", "name": "[MongoDB OTel] High session count", + "ruleTypeId": ".es-query", "tags": [ "observability", "mongodb" ], - "ruleTypeId": ".es-query", "schedule": { "interval": "5m" }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mongodb_otel-capacity" + }, + { + "id": "mongodb_otel-overview" + } + ], + "investigation_guide": { + "blob": "## MongoDB high session count\n\n### What fired\nSession count exceeded 500 during the window.\n\n### Why it matters\nMongoDB opens a logical session per unique driver session. Sustained high counts usually indicate short-lived client sessions without reuse or driver misconfigurations.\n\n### Triage\n1. Inspect client driver versions and settings.\n\n### Remediation\n- Enable session caching / reuse.\n\n### Tuning\n- 500 threshold; tune to driver load.\n" + } + }, "params": { "searchType": "esqlQuery", "timeWindowSize": 15, @@ -20,13 +41,6 @@ }, "groupBy": "row", "timeField": "@timestamp" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 } }, "managed": true, diff --git a/packages/mongodb_otel/manifest.yml b/packages/mongodb_otel/manifest.yml index 6cda7d13a67..dab28aeb322 100644 --- a/packages/mongodb_otel/manifest.yml +++ b/packages/mongodb_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: mongodb_otel title: "MongoDB OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "MongoDB Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/mysql_otel/changelog.yml b/packages/mysql_otel/changelog.yml index 4474b725fa7..f2058692a36 100644 --- a/packages/mysql_otel/changelog.yml +++ b/packages/mysql_otel/changelog.yml @@ -1,4 
+1,9 @@ # newer versions go on top +- version: "0.5.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.4.0" changes: - description: Add alerting rule templates and SLO templates. diff --git a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-buffer-pool-dirty-ratio.json b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-buffer-pool-dirty-ratio.json index 824d4ea0ed5..3c1697decfc 100644 --- a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-buffer-pool-dirty-ratio.json +++ b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-buffer-pool-dirty-ratio.json @@ -1,39 +1,61 @@ { - "id": "mysql_otel-high-buffer-pool-dirty-ratio", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[MySQL OTel] High buffer pool dirty page ratio", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "mysql" - ], - "schedule": { - "interval": "5m" + "id": "mysql_otel-high-buffer-pool-dirty-ratio", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the InnoDB buffer pool dirty ratio exceeds 75%. 
InnoDB will flush aggressively above this threshold.", + "name": "[MySQL OTel] High buffer pool dirty page ratio", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "mysql" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mysql_otel-14f9bee0-b03e-4e6d-b386-c89d5e7589c8" }, - "alertDelay": { - "active": 3 + { + "id": "mysql_otel-233fa1a2-7c05-46b9-8ae6-112d42610e12" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 + { + "id": "mysql_otel-49452773-8358-4394-827e-d15d1e3f8467" }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-mysqlreceiver.otel-*\n// Buffer pool usage (bytes) by status: clean vs dirty; gauges\n| WHERE metrics.mysql.buffer_pool.usage IS NOT NULL\n AND attributes.status IN (\"dirty\", \"clean\")\n| STATS\n dirty = MAX(metrics.mysql.buffer_pool.usage) WHERE attributes.status == \"dirty\",\n clean = MAX(metrics.mysql.buffer_pool.usage) WHERE attributes.status == \"clean\"\n BY resource.attributes.mysql.instance.endpoint\n| EVAL total = dirty + clean, ratio = CASE(total > 0, dirty / total, 0)\n// Alert when dirty ratio > 75%; InnoDB will flush aggressively above this\n| WHERE ratio > 0.75 AND total > 0\n| SORT ratio DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.mysql.instance.endpoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "mysql_otel-71977e60-f126-4c18-8769-639e686585a3" } + ], + "investigation_guide": { + "blob": "## MySQL high buffer pool dirty page ratio\n\n### What fired\nDirty pages / (dirty + clean) > 0.75 on an instance during the window.\n\n### Why it matters\nDirty pages are modified pages awaiting flush. 
Above 75%, InnoDB enters aggressive async flush mode, which tanks write latency and saturates disk.\n\n### Triage\n1. Check disk write IOPS.\n2. Inspect InnoDB flush and log settings.\n\n### Remediation\n- Upgrade disk.\n- Tune `innodb_io_capacity` to match hardware.\n\n### Tuning\n- 75% threshold matches InnoDB's documented aggressive-flush band.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-mysqlreceiver.otel-*\n// Buffer pool usage (bytes) by status: clean vs dirty; gauges\n| WHERE metrics.mysql.buffer_pool.usage IS NOT NULL\n AND attributes.status IN (\"dirty\", \"clean\")\n| STATS\n dirty = MAX(metrics.mysql.buffer_pool.usage) WHERE attributes.status == \"dirty\",\n clean = MAX(metrics.mysql.buffer_pool.usage) WHERE attributes.status == \"clean\"\n BY resource.attributes.mysql.instance.endpoint\n| EVAL total = dirty + clean, ratio = CASE(total > 0, dirty / total, 0)\n// Alert when dirty ratio > 75%; InnoDB will flush aggressively above this\n| WHERE ratio > 0.75 AND total > 0\n| SORT ratio DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.mysql.instance.endpoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-connection-error-rate.json b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-connection-error-rate.json index c937c051559..0fb13d94ed7 100644 --- a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-connection-error-rate.json +++ b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-connection-error-rate.json @@ -1,39 +1,61 @@ { - "id": "mysql_otel-high-connection-error-rate", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[MySQL OTel] High connection error rate", - 
"ruleTypeId": ".es-query", - "tags": [ - "observability", - "mysql" - ], - "schedule": { - "interval": "5m" + "id": "mysql_otel-high-connection-error-rate", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when connection errors exceed 5 in the window. Auth, host-cache, or network issues.", + "name": "[MySQL OTel] High connection error rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "mysql" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mysql_otel-14f9bee0-b03e-4e6d-b386-c89d5e7589c8" }, - "alertDelay": { - "active": 3 + { + "id": "mysql_otel-233fa1a2-7c05-46b9-8ae6-112d42610e12" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 + { + "id": "mysql_otel-49452773-8358-4394-827e-d15d1e3f8467" }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-mysqlreceiver.otel-*\n// Connection errors are a monotonic counter; use INCREASE for delta over the time window\n| WHERE metrics.mysql.connection.errors IS NOT NULL\n| STATS increase = SUM(INCREASE(metrics.mysql.connection.errors))\n BY resource.attributes.mysql.instance.endpoint, attributes.error\n// Alert when more than 5 connection errors in the window; adjust threshold as needed\n| WHERE increase > 5\n| SORT increase DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.mysql.instance.endpoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "mysql_otel-71977e60-f126-4c18-8769-639e686585a3" } + ], + "investigation_guide": { + "blob": "## MySQL high connection error rate\n\n### What fired\n`INCREASE(mysql.connection.errors) > 5` in the window.\n\n### Why it matters\nMySQL tracks 
connection errors by type (max_connections, accept, select, etc.). Any sustained rate indicates clients cannot connect reliably.\n\n### Triage\n1. Split by `attributes.error` to categorise.\n2. Check `max_connections` utilisation.\n3. Inspect host cache / DNS.\n\n### Remediation\n- Raise `max_connections`.\n- Fix DNS / auth.\n\n### Tuning\n- 5/window threshold; tune to baseline.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-mysqlreceiver.otel-*\n// Connection errors are a monotonic counter; use INCREASE for delta over the time window\n| WHERE metrics.mysql.connection.errors IS NOT NULL\n| STATS increase = SUM(INCREASE(metrics.mysql.connection.errors))\n BY resource.attributes.mysql.instance.endpoint, attributes.error\n// Alert when more than 5 connection errors in the window; adjust threshold as needed\n| WHERE increase > 5\n| SORT increase DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.mysql.instance.endpoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-row-lock-contention.json b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-row-lock-contention.json index 0400bea3b15..b4383edc3c1 100644 --- a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-row-lock-contention.json +++ b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-row-lock-contention.json @@ -1,39 +1,61 @@ { - "id": "mysql_otel-high-row-lock-contention", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[MySQL OTel] High row lock contention", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "mysql" - ], - "schedule": { - "interval": "5m" + "id": "mysql_otel-high-row-lock-contention", + "type": 
"alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when InnoDB row lock waits exceed 100 in 15 minutes. Indicates application-level contention.", + "name": "[MySQL OTel] High row lock contention", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "mysql" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mysql_otel-14f9bee0-b03e-4e6d-b386-c89d5e7589c8" }, - "alertDelay": { - "active": 3 + { + "id": "mysql_otel-233fa1a2-7c05-46b9-8ae6-112d42610e12" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 + { + "id": "mysql_otel-49452773-8358-4394-827e-d15d1e3f8467" }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-mysqlreceiver.otel-*\n// Row lock waits are a monotonic counter; use INCREASE for delta over the time window\n| WHERE metrics.mysql.row_locks IS NOT NULL\n AND attributes.kind == \"waits\"\n| STATS increase = SUM(INCREASE(metrics.mysql.row_locks))\n BY resource.attributes.mysql.instance.endpoint\n// Alert when more than 100 lock waits in the window; tune based on workload\n| WHERE increase > 100\n| SORT increase DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.mysql.instance.endpoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "mysql_otel-71977e60-f126-4c18-8769-639e686585a3" } + ], + "investigation_guide": { + "blob": "## MySQL high row lock contention\n\n### What fired\n`INCREASE(mysql.row_locks)` with `kind=waits` exceeded 100 during the window.\n\n### Why it matters\nRow lock waits mean transactions are serialising on the same rows. Heavy waits cause latency and possible deadlocks.\n\n### Triage\n1. 
Identify the hot rows / indexes.\n2. Inspect slow queries causing long transactions.\n\n### Remediation\n- Shorten transactions.\n- Adjust isolation / access order.\n\n### Tuning\n- 100 threshold; tune to workload.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-mysqlreceiver.otel-*\n// Row lock waits are a monotonic counter; use INCREASE for delta over the time window\n| WHERE metrics.mysql.row_locks IS NOT NULL\n AND attributes.kind == \"waits\"\n| STATS increase = SUM(INCREASE(metrics.mysql.row_locks))\n BY resource.attributes.mysql.instance.endpoint\n// Alert when more than 100 lock waits in the window; tune based on workload\n| WHERE increase > 100\n| SORT increase DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.mysql.instance.endpoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-slow-query-rate.json b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-slow-query-rate.json index d180bf2b474..737b80b041d 100644 --- a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-slow-query-rate.json +++ b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-high-slow-query-rate.json @@ -1,39 +1,61 @@ { - "id": "mysql_otel-high-slow-query-rate", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[MySQL OTel] High slow query rate", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "mysql" - ], - "schedule": { - "interval": "5m" + "id": "mysql_otel-high-slow-query-rate", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when slow queries exceed 10 in the window. 
Slow queries indicate query plan or indexing problems.", + "name": "[MySQL OTel] High slow query rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "mysql" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mysql_otel-14f9bee0-b03e-4e6d-b386-c89d5e7589c8" }, - "alertDelay": { - "active": 3 + { + "id": "mysql_otel-233fa1a2-7c05-46b9-8ae6-112d42610e12" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 + { + "id": "mysql_otel-49452773-8358-4394-827e-d15d1e3f8467" }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "TS metrics-mysqlreceiver.otel-*\n// Slow query count is a monotonic counter; use INCREASE for delta over the time window\n| WHERE metrics.mysql.query.slow.count IS NOT NULL\n| STATS increase = SUM(INCREASE(metrics.mysql.query.slow.count))\n BY resource.attributes.mysql.instance.endpoint\n// Alert when more than 10 slow queries in the window; tune based on long_query_time and traffic\n| WHERE increase > 10\n| SORT increase DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.mysql.instance.endpoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "mysql_otel-71977e60-f126-4c18-8769-639e686585a3" } + ], + "investigation_guide": { + "blob": "## MySQL high slow query rate\n\n### What fired\n`INCREASE(mysql.query.slow.count) > 10` in the window.\n\n### Why it matters\nSlow queries exceed `long_query_time`. A spike suggests missing indexes, stale stats, or a change in traffic pattern.\n\n### Triage\n1. Enable slow query log and inspect the dominant queries.\n2. 
Check EXPLAIN plans and index usage.\n\n### Remediation\n- Add missing indexes.\n- Refactor queries.\n\n### Tuning\n- 10/window threshold; tune to `long_query_time`.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "TS metrics-mysqlreceiver.otel-*\n// Slow query count is a monotonic counter; use INCREASE for delta over the time window\n| WHERE metrics.mysql.query.slow.count IS NOT NULL\n| STATS increase = SUM(INCREASE(metrics.mysql.query.slow.count))\n BY resource.attributes.mysql.instance.endpoint\n// Alert when more than 10 slow queries in the window; tune based on long_query_time and traffic\n| WHERE increase > 10\n| SORT increase DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.mysql.instance.endpoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-replication-lag.json b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-replication-lag.json index 9488f8139a0..ff3689db78d 100644 --- a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-replication-lag.json +++ b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-replication-lag.json @@ -1,39 +1,61 @@ { - "id": "mysql_otel-replication-lag", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[MySQL OTel] Replication lag", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "mysql" - ], - "schedule": { - "interval": "5m" + "id": "mysql_otel-replication-lag", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when replica `time_behind_source > 60` seconds. 
Stale replicas risk read-after-write issues and data loss on failover.", + "name": "[MySQL OTel] Replication lag", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "mysql" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mysql_otel-14f9bee0-b03e-4e6d-b386-c89d5e7589c8" }, - "alertDelay": { - "active": 3 + { + "id": "mysql_otel-233fa1a2-7c05-46b9-8ae6-112d42610e12" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 + { + "id": "mysql_otel-49452773-8358-4394-827e-d15d1e3f8467" }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-mysqlreceiver.otel-*\n// Replica lag is a gauge (seconds behind source); only present on replica instances\n| WHERE metrics.mysql.replica.time_behind_source IS NOT NULL\n| STATS lag_seconds = MAX(metrics.mysql.replica.time_behind_source)\n BY resource.attributes.mysql.instance.endpoint\n// Alert when lag exceeds 60 seconds; adjust for your RPO requirements\n| WHERE lag_seconds > 60\n| SORT lag_seconds DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.mysql.instance.endpoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "mysql_otel-71977e60-f126-4c18-8769-639e686585a3" } + ], + "investigation_guide": { + "blob": "## MySQL replication lag\n\n### What fired\n`mysql.replica.time_behind_source > 60s` during the window.\n\n### Why it matters\nReplication lag directly exposes stale reads from replicas and risks data loss during failover.\n\n### Triage\n1. Check network between primary and replica.\n2. Inspect replica CPU / disk.\n3. 
Look for long-running transactions on the primary.\n\n### Remediation\n- Scale replica or use parallel SQL threads.\n- Break up long-running transactions.\n\n### Tuning\n- 60s threshold; tune to RPO.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-mysqlreceiver.otel-*\n// Replica lag is a gauge (seconds behind source); only present on replica instances\n| WHERE metrics.mysql.replica.time_behind_source IS NOT NULL\n| STATS lag_seconds = MAX(metrics.mysql.replica.time_behind_source)\n BY resource.attributes.mysql.instance.endpoint\n// Alert when lag exceeds 60 seconds; adjust for your RPO requirements\n| WHERE lag_seconds > 60\n| SORT lag_seconds DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.mysql.instance.endpoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-thread-saturation.json b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-thread-saturation.json index 7dc5b3f08f4..c76c58b913b 100644 --- a/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-thread-saturation.json +++ b/packages/mysql_otel/kibana/alerting_rule_template/mysql_otel-thread-saturation.json @@ -1,39 +1,61 @@ { - "id": "mysql_otel-thread-saturation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[MySQL OTel] Thread saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "mysql" - ], - "schedule": { - "interval": "5m" + "id": "mysql_otel-thread-saturation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `threads_running / threads_connected > 0.9` with at least 5 threads connected. 
Indicates CPU-bound workload.", + "name": "[MySQL OTel] Thread saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "mysql" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "mysql_otel-14f9bee0-b03e-4e6d-b386-c89d5e7589c8" }, - "alertDelay": { - "active": 3 + { + "id": "mysql_otel-233fa1a2-7c05-46b9-8ae6-112d42610e12" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 + { + "id": "mysql_otel-49452773-8358-4394-827e-d15d1e3f8467" }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-mysqlreceiver.otel-*\n// Threads are gauges; pivot running vs connected by instance\n| WHERE metrics.mysql.threads IS NOT NULL\n AND attributes.kind IN (\"running\", \"connected\")\n| STATS\n running = MAX(metrics.mysql.threads) WHERE attributes.kind == \"running\",\n connected = MAX(metrics.mysql.threads) WHERE attributes.kind == \"connected\"\n BY resource.attributes.mysql.instance.endpoint\n// Saturation = running threads close to connected (CPU-bound workload)\n| EVAL ratio = CASE(connected > 0, running / connected, 0)\n// Alert when ratio > 0.9 and we have meaningful thread count; adjust threshold as needed\n| WHERE ratio > 0.9 AND connected >= 5\n| SORT ratio DESC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "resource.attributes.mysql.instance.endpoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "mysql_otel-71977e60-f126-4c18-8769-639e686585a3" } + ], + "investigation_guide": { + "blob": "## MySQL thread saturation\n\n### What fired\nRunning threads over connected threads exceeded 90% with at least 5 connections.\n\n### Why it matters\nWhen nearly all connections are actively executing, MySQL is 
CPU-bound. Query throughput stalls and queues form.\n\n### Triage\n1. Inspect top SQL by CPU.\n2. Look at InnoDB concurrency settings.\n\n### Remediation\n- Tune `innodb_thread_concurrency` / add CPU.\n- Optimise heavy queries.\n\n### Tuning\n- 0.9 threshold; 5 connections minimum avoids noise.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-mysqlreceiver.otel-*\n// Threads are gauges; pivot running vs connected by instance\n| WHERE metrics.mysql.threads IS NOT NULL\n AND attributes.kind IN (\"running\", \"connected\")\n| STATS\n running = MAX(metrics.mysql.threads) WHERE attributes.kind == \"running\",\n connected = MAX(metrics.mysql.threads) WHERE attributes.kind == \"connected\"\n BY resource.attributes.mysql.instance.endpoint\n// Saturation = running threads close to connected (CPU-bound workload)\n| EVAL ratio = CASE(connected > 0, running / connected, 0)\n// Alert when ratio > 0.9 and we have meaningful thread count; adjust threshold as needed\n| WHERE ratio > 0.9 AND connected >= 5\n| SORT ratio DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "resource.attributes.mysql.instance.endpoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/mysql_otel/manifest.yml b/packages/mysql_otel/manifest.yml index 6088de3b241..7b5edeb8016 100644 --- a/packages/mysql_otel/manifest.yml +++ b/packages/mysql_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: mysql_otel title: MySQL OpenTelemetry Assets -version: 0.4.0 +version: 0.5.0 source: license: "Elastic-2.0" description: "MySQL Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/nginx_otel/changelog.yml 
b/packages/nginx_otel/changelog.yml index a24e09a0600..cba2a25a81f 100644 --- a/packages/nginx_otel/changelog.yml +++ b/packages/nginx_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.4.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.3.0" changes: - description: Add SRE-aligned dashboard hierarchy with Request Health, Server Internals, and Traffic & Capacity dashboards covering access logs, error logs, and metrics. SLO and alerting rule templates diff --git a/packages/nginx_otel/kibana/alerting_rule_template/nginx-active-connections-high.json b/packages/nginx_otel/kibana/alerting_rule_template/nginx-active-connections-high.json new file mode 100644 index 00000000000..be5e7c2dd39 --- /dev/null +++ b/packages/nginx_otel/kibana/alerting_rule_template/nginx-active-connections-high.json @@ -0,0 +1,59 @@ +{ + "id": "nginx-active-connections-high", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when service error rate exceeds 5%. 
(Rule is a generic placeholder that should be renamed for NGINX.)", + "name": "High error rate by service", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "myservice" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "enabled": true, + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nginx_otel-a1b2c3d4-e5f6-7890-abcd-ef1234567890" + }, + { + "id": "nginx_otel-b2c3d4e5-f6a7-8901-bcde-f12345678901" + }, + { + "id": "nginx_otel-c3d4e5f6-a7b8-9012-cdef-123456789012" + } + ], + "investigation_guide": { + "blob": "## NGINX: generic error-rate rule (placeholder)\n\n### What fired\n`error_rate_pct > 5%` on a service in the log-based query.\n\n### Why it matters\nThis rule appears to be a leftover myservice template and should be reworked as a real NGINX rule. Treat any alert from it with skepticism until it is corrected.\n\n### Triage\n1. Inspect the rule query \u2014 confirm it targets NGINX datasets.\n2. Consider disabling until corrected.\n\n### Remediation\n- Replace the rule query with an NGINX-specific one (e.g. 
5xx error rate).\n\n### Tuning\n- Placeholder \u2014 should be retired or reworked.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM logs-myservicereceiver.otel-*\n// Flag each log entry as an error or not\n| EVAL is_error = CASE(log.level == \"error\", 1, 0)\n// Aggregate total logs and error count per service\n| STATS total = COUNT(*), errors = SUM(is_error) BY service.name\n// Minimum sample size to avoid noisy low-traffic services\n// Adjust this threshold based on expected traffic volume\n| WHERE total > 50\n// Calculate error rate as a percentage\n| EVAL error_rate_pct = ROUND(errors / total * 100.0, 2)\n// Alert threshold: adjust this to tune sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "service.name", + "termSize": 10, + "excludeHitsFromPreviousRun": true + } + }, + "managed": true +} diff --git a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-dropped-connections.json b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-dropped-connections.json index 596af2348e5..310c7ad05d8 100644 --- a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-dropped-connections.json +++ b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-dropped-connections.json @@ -1,37 +1,54 @@ { - "id": "nginx_otel-dropped-connections", - "type": "alerting_rule_template", - "attributes": { - "name": "[Nginx OTel] Dropped connections detected", - "tags": [ - "Nginx OTel" - ], - "ruleTypeId": ".es-query", - "schedule": { - "interval": "1m" + "id": "nginx_otel-dropped-connections", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when `connections_accepted - connections_handled > 0`. 
NGINX is dropping connections \u2014 worker_connections or FDs exhausted.", + "name": "[Nginx OTel] Dropped connections detected", + "ruleTypeId": ".es-query", + "tags": [ + "Nginx OTel" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "nginx_otel-a1b2c3d4-e5f6-7890-abcd-ef1234567890" }, - "params": { - "searchType": "esqlQuery", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "esqlQuery": { - "esql": "FROM metrics-nginxreceiver.otel-*\n// Filter to connection counter documents (exclude gauge docs with state attribute)\n| WHERE state IS NULL\n// Cast counters to long for aggregation support\n| EVAL accepted = TO_LONG(nginx.connections_accepted), handled = TO_LONG(nginx.connections_handled)\n// Compute delta over the time window per host\n| STATS max_accepted = MAX(accepted), min_accepted = MIN(accepted), max_handled = MAX(handled), min_handled = MIN(handled) BY host.name\n// Calculate new connections accepted vs handled in this window\n| EVAL delta_accepted = max_accepted - min_accepted, delta_handled = max_handled - min_handled\n| EVAL dropped = delta_accepted - delta_handled\n// Alert when any connections were dropped\n| WHERE dropped > 0\n| SORT dropped DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + { + "id": "nginx_otel-b2c3d4e5-f6a7-8901-bcde-f12345678901" }, - "alertDelay": { - "active": 3 + { + "id": "nginx_otel-c3d4e5f6-a7b8-9012-cdef-123456789012" } + ], + "investigation_guide": { + "blob": "## NGINX dropped connections detected\n\n### What fired\nWindow delta of accepted minus handled > 0.\n\n### Why it matters\nAny dropped connection is a definitive saturation signal: NGINX could not handle all incoming connections \u2014 typically because `worker_connections` is exhausted or the process hit its FD limit.\n\n### Triage\n1. 
Inspect `nginx.connections_current` by state.\n2. Check `worker_connections` config and FD ulimit.\n\n### Remediation\n- Raise `worker_connections` / `worker_processes`.\n- Raise FD limits.\n\n### Tuning\n- Fires on any drop.\n" + } }, - "managed": true, - "coreMigrationVersion": "8.8.0", - "typeMigrationVersion": "10.1.0" + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "FROM metrics-nginxreceiver.otel-*\n// Filter to connection counter documents (exclude gauge docs with state attribute)\n| WHERE state IS NULL\n// Cast counters to long for aggregation support\n| EVAL accepted = TO_LONG(nginx.connections_accepted), handled = TO_LONG(nginx.connections_handled)\n// Compute delta over the time window per host\n| STATS max_accepted = MAX(accepted), min_accepted = MIN(accepted), max_handled = MAX(handled), min_handled = MIN(handled) BY host.name\n// Calculate new connections accepted vs handled in this window\n| EVAL delta_accepted = max_accepted - min_accepted, delta_handled = max_handled - min_handled\n| EVAL dropped = delta_accepted - delta_handled\n// Alert when any connections were dropped\n| WHERE dropped > 0\n| SORT dropped DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" } diff --git a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-error-log-spike.json b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-error-log-spike.json index dc3d53fbe1b..a72ef6f2844 100644 --- a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-error-log-spike.json +++ b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-error-log-spike.json @@ -1,37 +1,54 @@ { - "id": "nginx_otel-error-log-spike", - "type": "alerting_rule_template", - 
"attributes": { - "name": "[Nginx OTel] Error log spike", - "tags": [ - "Nginx OTel" - ], - "ruleTypeId": ".es-query", - "schedule": { - "interval": "1m" + "id": "nginx_otel-error-log-spike", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when error/crit/alert/emerg log count exceeds 50 in 15 minutes. Operational errors beyond normal noise.", + "name": "[Nginx OTel] Error log spike", + "ruleTypeId": ".es-query", + "tags": [ + "Nginx OTel" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "nginx_otel-a1b2c3d4-e5f6-7890-abcd-ef1234567890" }, - "params": { - "searchType": "esqlQuery", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "esqlQuery": { - "esql": "FROM logs-nginx.error.otel-*\n// Filter to severe log entries only\n| WHERE log.level IN (\"error\", \"crit\", \"alert\", \"emerg\")\n// Count error entries per host\n| STATS error_count = COUNT(*) BY host.name\n// Alert when error count exceeds threshold in the time window\n// Adjust threshold based on expected baseline error volume\n| WHERE error_count > 50\n| SORT error_count DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + { + "id": "nginx_otel-b2c3d4e5-f6a7-8901-bcde-f12345678901" }, - "alertDelay": { - "active": 3 + { + "id": "nginx_otel-c3d4e5f6-a7b8-9012-cdef-123456789012" } + ], + "investigation_guide": { + "blob": "## NGINX error log spike\n\n### What fired\nMore than 50 high-severity error log entries in 15 minutes.\n\n### Why it matters\nBursts of severe errors signal upstream timeouts, configuration problems, FD exhaustion, or permission issues. Access log 5xx rates often lag behind error log spikes.\n\n### Triage\n1. Sample the dominant error message.\n2. 
Correlate with upstream / backend health.\n\n### Remediation\n- Fix the upstream or config issue.\n\n### Tuning\n- 50 errors/15m; tune to baseline noise.\n" + } }, - "managed": true, - "coreMigrationVersion": "8.8.0", - "typeMigrationVersion": "10.1.0" + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "FROM logs-nginx.error.otel-*\n// Filter to severe log entries only\n| WHERE log.level IN (\"error\", \"crit\", \"alert\", \"emerg\")\n// Count error entries per host\n| STATS error_count = COUNT(*) BY host.name\n// Alert when error count exceeds threshold in the time window\n// Adjust threshold based on expected baseline error volume\n| WHERE error_count > 50\n| SORT error_count DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" } diff --git a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-4xx-error-rate.json b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-4xx-error-rate.json index 1f4a349fcaf..9e1b2a6be67 100644 --- a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-4xx-error-rate.json +++ b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-4xx-error-rate.json @@ -1,37 +1,54 @@ { - "id": "nginx_otel-high-4xx-error-rate", - "type": "alerting_rule_template", - "attributes": { - "name": "[Nginx OTel] High 4xx error rate", - "tags": [ - "Nginx OTel" - ], - "ruleTypeId": ".es-query", - "schedule": { - "interval": "1m" + "id": "nginx_otel-high-4xx-error-rate", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when 4xx responses exceed 15% of access logs. 
High client-error rate indicates broken links or client/bot activity.", + "name": "[Nginx OTel] High 4xx error rate", + "ruleTypeId": ".es-query", + "tags": [ + "Nginx OTel" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "nginx_otel-a1b2c3d4-e5f6-7890-abcd-ef1234567890" }, - "params": { - "searchType": "esqlQuery", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "esqlQuery": { - "esql": "FROM logs-nginx.access.otel-*\n// Flag each access log entry as a client error (4xx) or not\n| EVAL is_4xx = CASE(http.response.status_code >= 400 AND http.response.status_code < 500, 1, 0)\n// Aggregate total requests and 4xx count per NGINX host\n| STATS total = COUNT(*), errors_4xx = SUM(is_4xx) BY host.name\n// Minimum sample size to avoid noisy low-traffic hosts\n| WHERE total > 50\n// Calculate 4xx error rate as a percentage\n| EVAL error_rate_pct = ROUND(TO_DOUBLE(errors_4xx) / TO_DOUBLE(total) * 100.0, 2)\n// Alert threshold: 15% is a reasonable starting point for client errors\n| WHERE error_rate_pct > 15.0\n| SORT error_rate_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + { + "id": "nginx_otel-b2c3d4e5-f6a7-8901-bcde-f12345678901" }, - "alertDelay": { - "active": 3 + { + "id": "nginx_otel-c3d4e5f6-a7b8-9012-cdef-123456789012" } + ], + "investigation_guide": { + "blob": "## NGINX high 4xx error rate\n\n### What fired\n4xx response rate exceeded 15% over the window (min 50 total requests).\n\n### Why it matters\n4xx errors are client-side \u2014 bad requests, missing resources, unauthorised access. Sudden spikes often reflect broken deployments or bot traffic.\n\n### Triage\n1. Inspect top URLs returning 4xx.\n2. 
Check client IP distribution for botting.\n\n### Remediation\n- Fix broken URLs / routing.\n- Block abusive clients at the edge.\n\n### Tuning\n- 15% threshold; tune to site.\n" + } }, - "managed": true, - "coreMigrationVersion": "8.8.0", - "typeMigrationVersion": "10.1.0" + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "FROM logs-nginx.access.otel-*\n// Flag each access log entry as a client error (4xx) or not\n| EVAL is_4xx = CASE(http.response.status_code >= 400 AND http.response.status_code < 500, 1, 0)\n// Aggregate total requests and 4xx count per NGINX host\n| STATS total = COUNT(*), errors_4xx = SUM(is_4xx) BY host.name\n// Minimum sample size to avoid noisy low-traffic hosts\n| WHERE total > 50\n// Calculate 4xx error rate as a percentage\n| EVAL error_rate_pct = ROUND(TO_DOUBLE(errors_4xx) / TO_DOUBLE(total) * 100.0, 2)\n// Alert threshold: 15% is a reasonable starting point for client errors\n| WHERE error_rate_pct > 15.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" } diff --git a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-5xx-error-rate.json b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-5xx-error-rate.json index 36bce4cd7c6..3830294eae1 100644 --- a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-5xx-error-rate.json +++ b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-5xx-error-rate.json @@ -1,37 +1,54 @@ { - "id": "nginx_otel-high-5xx-error-rate", - "type": "alerting_rule_template", - "attributes": { - "name": "[Nginx OTel] High 5xx error rate", - "tags": [ - "Nginx OTel" - ], - "ruleTypeId": ".es-query", - "schedule": { - "interval": "1m" + 
"id": "nginx_otel-high-5xx-error-rate", + "type": "alerting_rule_template", + "attributes": { + "description": "Alerts when 5xx responses exceed 5% of access logs. Server-side errors directly impact users.", + "name": "[Nginx OTel] High 5xx error rate", + "ruleTypeId": ".es-query", + "tags": [ + "Nginx OTel" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "nginx_otel-a1b2c3d4-e5f6-7890-abcd-ef1234567890" }, - "params": { - "searchType": "esqlQuery", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "esqlQuery": { - "esql": "FROM logs-nginx.access.otel-*\n// Flag each access log entry as a server error (5xx) or not\n| EVAL is_5xx = CASE(http.response.status_code >= 500, 1, 0)\n// Aggregate total requests and 5xx count per NGINX host\n| STATS total = COUNT(*), errors_5xx = SUM(is_5xx) BY host.name\n// Minimum sample size to avoid noisy low-traffic hosts\n| WHERE total > 50\n// Calculate 5xx error rate as a percentage\n| EVAL error_rate_pct = ROUND(TO_DOUBLE(errors_5xx) / TO_DOUBLE(total) * 100.0, 2)\n// Alert threshold: adjust to tune sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + { + "id": "nginx_otel-b2c3d4e5-f6a7-8901-bcde-f12345678901" }, - "alertDelay": { - "active": 3 + { + "id": "nginx_otel-c3d4e5f6-a7b8-9012-cdef-123456789012" } + ], + "investigation_guide": { + "blob": "## NGINX high 5xx error rate\n\n### What fired\n5xx response rate exceeded 5% over the window (min 50 total requests).\n\n### Why it matters\n5xx errors are server-side \u2014 upstream failures, misconfigured proxying, or NGINX internal errors. Usually client-visible outages.\n\n### Triage\n1. Inspect error log for correlated entries.\n2. 
Check upstream application health.\n\n### Remediation\n- Roll back or fix the offending upstream.\n\n### Tuning\n- 5% threshold; tighten per SLA.\n" + } }, - "managed": true, - "coreMigrationVersion": "8.8.0", - "typeMigrationVersion": "10.1.0" + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "FROM logs-nginx.access.otel-*\n// Flag each access log entry as a server error (5xx) or not\n| EVAL is_5xx = CASE(http.response.status_code >= 500, 1, 0)\n// Aggregate total requests and 5xx count per NGINX host\n| STATS total = COUNT(*), errors_5xx = SUM(is_5xx) BY host.name\n// Minimum sample size to avoid noisy low-traffic hosts\n| WHERE total > 50\n// Calculate 5xx error rate as a percentage\n| EVAL error_rate_pct = ROUND(TO_DOUBLE(errors_5xx) / TO_DOUBLE(total) * 100.0, 2)\n// Alert threshold: adjust to tune sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" } diff --git a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-active-connections.json b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-active-connections.json index c8829f81d8f..4e57f1403e9 100644 --- a/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-active-connections.json +++ b/packages/nginx_otel/kibana/alerting_rule_template/nginx_otel-high-active-connections.json @@ -1,37 +1,54 @@ { - "id": "nginx_otel-high-active-connections", - "type": "alerting_rule_template", - "attributes": { - "name": "[Nginx OTel] High active connections", - "tags": [ - "Nginx OTel" - ], - "ruleTypeId": ".es-query", - "schedule": { - "interval": "1m" + "id": "nginx_otel-high-active-connections", + "type": 
"alerting_rule_template", + "attributes": { + "description": "Alerts when average active connections exceed 256. Approaching `worker_connections` ceiling.", + "name": "[Nginx OTel] High active connections", + "ruleTypeId": ".es-query", + "tags": [ + "Nginx OTel" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "nginx_otel-a1b2c3d4-e5f6-7890-abcd-ef1234567890" }, - "params": { - "searchType": "esqlQuery", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "esqlQuery": { - "esql": "FROM metrics-nginxreceiver.otel-*\n// Filter to active connection state gauges\n| WHERE state == \"active\"\n// Aggregate active connections per host\n| STATS avg_active = AVG(nginx.connections_current), max_active = MAX(nginx.connections_current), sample_count = COUNT(*) BY host.name\n// Require minimum samples to avoid false positives during startup\n| WHERE sample_count >= 3\n// Alert when average active connections exceed capacity threshold\n// Adjust threshold based on your NGINX worker_connections setting\n| WHERE avg_active > 256\n| SORT avg_active DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + { + "id": "nginx_otel-b2c3d4e5-f6a7-8901-bcde-f12345678901" }, - "alertDelay": { - "active": 3 + { + "id": "nginx_otel-c3d4e5f6-a7b8-9012-cdef-123456789012" } + ], + "investigation_guide": { + "blob": "## NGINX high active connections\n\n### What fired\nAverage active connections exceeded 256 during the window (min 3 samples).\n\n### Why it matters\nActive connection saturation predicts dropped connections. Usually a traffic surge, slow upstream holding workers, or under-sized `worker_connections`.\n\n### Triage\n1. Inspect connection state breakdown (reading/writing/waiting).\n2. 
Check upstream latency.\n\n### Remediation\n- Raise `worker_connections`.\n- Fix slow upstream.\n\n### Tuning\n- 256 is a default-configuration threshold; tune to your `worker_connections`.\n" + } }, - "managed": true, - "coreMigrationVersion": "8.8.0", - "typeMigrationVersion": "10.1.0" + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "esqlQuery": { + "esql": "FROM metrics-nginxreceiver.otel-*\n// Filter to active connection state gauges\n| WHERE state == \"active\"\n// Aggregate active connections per host\n| STATS avg_active = AVG(nginx.connections_current), max_active = MAX(nginx.connections_current), sample_count = COUNT(*) BY host.name\n// Require minimum samples to avoid false positives during startup\n| WHERE sample_count >= 3\n// Alert when average active connections exceed capacity threshold\n// Adjust threshold based on your NGINX worker_connections setting\n| WHERE avg_active > 256\n| SORT avg_active DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 + } + }, + "managed": true, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" } diff --git a/packages/nginx_otel/manifest.yml b/packages/nginx_otel/manifest.yml index 9be61182d5b..dac8f1b54b4 100644 --- a/packages/nginx_otel/manifest.yml +++ b/packages/nginx_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: nginx_otel title: "NGINX OpenTelemetry Assets" -version: 0.3.0 +version: 0.4.0 source: license: "Elastic-2.0" description: "NGINX Assets from OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/nvidia_gpu_otel/changelog.yml b/packages/nvidia_gpu_otel/changelog.yml index 78482719beb..970a4b420f0 100644 --- a/packages/nvidia_gpu_otel/changelog.yml +++ 
b/packages/nvidia_gpu_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-clock-frequency-throttled.json b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-clock-frequency-throttled.json index f29a08eec83..72ed67f1f55 100644 --- a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-clock-frequency-throttled.json +++ b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-clock-frequency-throttled.json @@ -1,40 +1,51 @@ { - "id": "nvidia_gpu_otel-clock-frequency-throttled", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[NVIDIA GPU OTel] Clock frequency throttled under load", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "nvidia_gpu" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_SM_CLOCK IS NOT NULL\n AND DCGM_FI_DEV_GPU_UTIL IS NOT NULL\n AND DCGM_FI_DEV_GPU_TEMP IS NOT NULL\n| STATS\n avg_sm_clock = AVG(DCGM_FI_DEV_SM_CLOCK),\n max_sm_clock = MAX(DCGM_FI_DEV_SM_CLOCK),\n avg_util = AVG(DCGM_FI_DEV_GPU_UTIL),\n avg_temp = AVG(DCGM_FI_DEV_GPU_TEMP)\n BY Hostname, UUID, modelName\n// Detect throttling: GPU is under load (>50% util) but clock is low\n// relative to its observed max — indicates thermal or power throttling\n// Adjust the clock ratio threshold based on your GPU model's expected boost clocks\n| WHERE avg_util > 50 AND max_sm_clock > 0\n| EVAL clock_ratio = 
ROUND(avg_sm_clock / max_sm_clock * 100.0, 2)\n| WHERE clock_ratio < 70\n| SORT clock_ratio ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "UUID", - "termSize": 50 + "id": "nvidia_gpu_otel-clock-frequency-throttled", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when a GPU is under load (>50% utilization) but the SM clock is more than 30% below its observed maximum, indicating thermal or power throttling.", + "name": "[NVIDIA GPU OTel] Clock frequency throttled under load", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "nvidia_gpu" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nvidia_gpu_otel-overview" } + ], + "investigation_guide": { + "blob": "## NVIDIA GPU clock frequency throttled under load\n\n### What fired\nGPU `avg_util > 50%` while `avg_sm_clock / max_sm_clock < 70%` during the window.\n\n### Why it matters\nWhen a GPU is busy but its clock is held below its observed boost ceiling, the device is throttling \u2014 usually because of temperature, power cap, or PCIe / NVLink limits. Throttling silently reduces compute throughput, hurting model training time and inference QoS.\n\n### Triage\n1. Cross-check `DCGM_FI_DEV_GPU_TEMP` \u2014 temperatures above 83\u00b0C usually trigger thermal throttling.\n2. Inspect `DCGM_FI_DEV_POWER_USAGE` against the GPU's power cap.\n3. Look at host-level airflow and chassis temperature.\n4. 
Check whether the workload pattern matches the throttling intervals.\n\n### Remediation\n- Improve cooling or relocate workloads to less constrained nodes.\n- Raise the persistence-mode power limit (`nvidia-smi -pl`) if the policy allows.\n- Reduce concurrent GPU jobs on the host.\n\n### Tuning\n- 70% clock-ratio threshold; tune to your GPU model's expected boost behaviour.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_SM_CLOCK IS NOT NULL\n AND DCGM_FI_DEV_GPU_UTIL IS NOT NULL\n AND DCGM_FI_DEV_GPU_TEMP IS NOT NULL\n| STATS\n avg_sm_clock = AVG(DCGM_FI_DEV_SM_CLOCK),\n max_sm_clock = MAX(DCGM_FI_DEV_SM_CLOCK),\n avg_util = AVG(DCGM_FI_DEV_GPU_UTIL),\n avg_temp = AVG(DCGM_FI_DEV_GPU_TEMP)\n BY Hostname, UUID, modelName\n// Detect throttling: GPU is under load (>50% util) but clock is low\n// relative to its observed max \u2014 indicates thermal or power throttling\n// Adjust the clock ratio threshold based on your GPU model's expected boost clocks\n| WHERE avg_util > 50 AND max_sm_clock > 0\n| EVAL clock_ratio = ROUND(avg_sm_clock / max_sm_clock * 100.0, 2)\n| WHERE clock_ratio < 70\n| SORT clock_ratio ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "UUID", + "termSize": 50 } + } } diff --git a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-framebuffer-memory-high.json b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-framebuffer-memory-high.json index 05f99af4665..3c0003cef63 100644 --- a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-framebuffer-memory-high.json +++ b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-framebuffer-memory-high.json @@ -1,40 +1,51 @@ { - "id": "nvidia_gpu_otel-framebuffer-memory-high", - "type": 
"alerting_rule_template", - "managed": true, - "attributes": { - "name": "[NVIDIA GPU OTel] Framebuffer memory utilization high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "nvidia_gpu" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_FB_USED IS NOT NULL AND DCGM_FI_DEV_FB_FREE IS NOT NULL\n| EVAL total_fb = DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE\n| EVAL fb_used_pct = ROUND(DCGM_FI_DEV_FB_USED / total_fb * 100.0, 2)\n// Alert when framebuffer usage exceeds 90%\n// Adjust threshold based on workload — ML training may routinely use 85-95%\n| WHERE fb_used_pct > 90\n| STATS\n max_used_pct = MAX(fb_used_pct),\n avg_used_pct = AVG(fb_used_pct),\n min_free_mb = MIN(DCGM_FI_DEV_FB_FREE)\n BY Hostname, UUID, modelName\n| SORT max_used_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "UUID", - "termSize": 50 + "id": "nvidia_gpu_otel-framebuffer-memory-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when GPU framebuffer (VRAM) usage exceeds 90% (`DCGM_FI_DEV_FB_USED / total > 0.90`). 
Near-full VRAM risks OOM kills for ML workloads.", + "name": "[NVIDIA GPU OTel] Framebuffer memory utilization high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "nvidia_gpu" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nvidia_gpu_otel-overview" } + ], + "investigation_guide": { + "blob": "## NVIDIA GPU framebuffer memory utilization high\n\n### What fired\nFramebuffer usage exceeded 90% on at least one GPU during the window.\n\n### Why it matters\nGPU memory is the most common failure mode for ML and inference workloads. Above 90% utilisation, large allocations fail with `cudaErrorMemoryAllocation` and training jobs crash.\n\n### Triage\n1. Identify the GPU via Hostname / UUID / modelName.\n2. Inspect the workload \u2014 recent batch-size or model-size changes.\n3. Check for memory leaks in long-running processes.\n\n### Remediation\n- Reduce batch size or enable gradient checkpointing.\n- Move workloads to a GPU with more VRAM.\n- Restart leaking processes.\n\n### Tuning\n- 90% threshold. 
ML training routinely operates at 85-95%; tune to your workload tolerance.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_FB_USED IS NOT NULL AND DCGM_FI_DEV_FB_FREE IS NOT NULL\n| EVAL total_fb = DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE\n| EVAL fb_used_pct = ROUND(DCGM_FI_DEV_FB_USED / total_fb * 100.0, 2)\n// Alert when framebuffer usage exceeds 90%\n// Adjust threshold based on workload \u2014 ML training may routinely use 85-95%\n| WHERE fb_used_pct > 90\n| STATS\n max_used_pct = MAX(fb_used_pct),\n avg_used_pct = AVG(fb_used_pct),\n min_free_mb = MIN(DCGM_FI_DEV_FB_FREE)\n BY Hostname, UUID, modelName\n| SORT max_used_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "UUID", + "termSize": 50 } + } } diff --git a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-idle-during-expected-workload.json b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-idle-during-expected-workload.json index 48ea9886566..dda344fb421 100644 --- a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-idle-during-expected-workload.json +++ b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-idle-during-expected-workload.json @@ -1,40 +1,51 @@ { - "id": "nvidia_gpu_otel-gpu-idle-during-expected-workload", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[NVIDIA GPU OTel] GPU idle during expected workload", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "nvidia_gpu" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 6 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM 
metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_GPU_UTIL IS NOT NULL\n| STATS\n avg_util = AVG(DCGM_FI_DEV_GPU_UTIL),\n max_util = MAX(DCGM_FI_DEV_GPU_UTIL)\n BY Hostname, UUID, modelName\n// Detects GPUs that are completely idle — may indicate a stuck job,\n// misconfigured workload, or wasted resources\n// Adjust threshold and alertDelay based on expected idle periods\n| WHERE max_util < 1\n| SORT Hostname ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 30, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "UUID", - "termSize": 50 + "id": "nvidia_gpu_otel-gpu-idle-during-expected-workload", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when a GPU sees zero utilization (`max_util < 1`) over 30 minutes. An idle expensive resource often indicates a stuck or misconfigured workload.", + "name": "[NVIDIA GPU OTel] GPU idle during expected workload", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "nvidia_gpu" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 6 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nvidia_gpu_otel-overview" } + ], + "investigation_guide": { + "blob": "## NVIDIA GPU idle during expected workload\n\n### What fired\n`MAX(DCGM_FI_DEV_GPU_UTIL) < 1` over a 30-minute window.\n\n### Why it matters\nLong idle periods on a GPU that should be busy waste expensive infrastructure. Common causes: stuck training jobs waiting on data, application crashes that left the process running but not driving the GPU, scheduler misconfiguration.\n\n### Triage\n1. Identify which GPU and host.\n2. Check the running CUDA processes (`nvidia-smi`).\n3. Inspect the workload scheduler / orchestrator (Kubernetes / Slurm) for the pod / job state.\n4. 
Look at the input data pipeline \u2014 is the loader the bottleneck?\n\n### Remediation\n- Restart the stuck job.\n- Free the GPU for other workloads.\n- Tune data loading to keep the GPU fed.\n\n### Tuning\n- Filter on production hours / job state to avoid alerting on intentionally idle nodes.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_GPU_UTIL IS NOT NULL\n| STATS\n avg_util = AVG(DCGM_FI_DEV_GPU_UTIL),\n max_util = MAX(DCGM_FI_DEV_GPU_UTIL)\n BY Hostname, UUID, modelName\n// Detects GPUs that are completely idle \u2014 may indicate a stuck job,\n// misconfigured workload, or wasted resources\n// Adjust threshold and alertDelay based on expected idle periods\n| WHERE max_util < 1\n| SORT Hostname ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 30, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "UUID", + "termSize": 50 } + } } diff --git a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-temperature-critical.json b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-temperature-critical.json index de4fdb8d210..a12452dd351 100644 --- a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-temperature-critical.json +++ b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-temperature-critical.json @@ -1,40 +1,51 @@ { - "id": "nvidia_gpu_otel-gpu-temperature-critical", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[NVIDIA GPU OTel] GPU temperature critical", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "nvidia_gpu" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - 
"esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_GPU_TEMP IS NOT NULL\n// Threshold in Celsius — adjust based on your GPU model and data center cooling\n// Most data center GPUs throttle at 83-90°C\n| WHERE DCGM_FI_DEV_GPU_TEMP > 83\n| STATS\n max_temp = MAX(DCGM_FI_DEV_GPU_TEMP),\n avg_temp = AVG(DCGM_FI_DEV_GPU_TEMP)\n BY Hostname, UUID, modelName\n| SORT max_temp DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "UUID", - "termSize": 50 + "id": "nvidia_gpu_otel-gpu-temperature-critical", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when GPU temperature exceeds 83\u00b0C. Most data-center GPUs throttle between 83\u00b0C and 90\u00b0C; sustained high temperature damages the hardware.", + "name": "[NVIDIA GPU OTel] GPU temperature critical", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "nvidia_gpu" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nvidia_gpu_otel-overview" } + ], + "investigation_guide": { + "blob": "## NVIDIA GPU temperature critical\n\n### What fired\n`DCGM_FI_DEV_GPU_TEMP > 83\u00b0C` during the window.\n\n### Why it matters\nGPUs throttle to protect themselves above their configured thermal limit. Sustained operation near the limit reduces throughput, increases failure rates, and shortens hardware lifespan.\n\n### Triage\n1. Identify the affected host / GPU.\n2. Check chassis cooling, fan health, and ambient temperature.\n3. Correlate with clock-frequency throttling on the same GPU.\n4. 
Look at neighbour GPUs in the same chassis.\n\n### Remediation\n- Improve airflow / replace failed fans.\n- Reduce concurrent GPU jobs on the host.\n- Migrate the workload to a cooler chassis.\n\n### Tuning\n- 83\u00b0C threshold; tune to your GPU model's documented throttle threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_GPU_TEMP IS NOT NULL\n// Threshold in Celsius \u2014 adjust based on your GPU model and data center cooling\n// Most data center GPUs throttle at 83-90\u00b0C\n| WHERE DCGM_FI_DEV_GPU_TEMP > 83\n| STATS\n max_temp = MAX(DCGM_FI_DEV_GPU_TEMP),\n avg_temp = AVG(DCGM_FI_DEV_GPU_TEMP)\n BY Hostname, UUID, modelName\n| SORT max_temp DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "UUID", + "termSize": 50 } + } } diff --git a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-utilization-saturated.json b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-utilization-saturated.json index b969dc2d2f0..6c75f0a889d 100644 --- a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-utilization-saturated.json +++ b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-gpu-utilization-saturated.json @@ -1,40 +1,51 @@ { - "id": "nvidia_gpu_otel-gpu-utilization-saturated", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[NVIDIA GPU OTel] GPU utilization saturated", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "nvidia_gpu" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM 
metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_GPU_UTIL IS NOT NULL\n| STATS\n avg_util = AVG(DCGM_FI_DEV_GPU_UTIL),\n max_util = MAX(DCGM_FI_DEV_GPU_UTIL)\n BY Hostname, UUID, modelName\n// Sustained high utilization above 95% indicates compute saturation\n// Adjust threshold based on workload expectations\n| WHERE avg_util > 95\n| SORT avg_util DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "UUID", - "termSize": 50 + "id": "nvidia_gpu_otel-gpu-utilization-saturated", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when GPU utilization sustains above 95% over 15 minutes. Saturation may be desired for training, but for inference workloads it caps QPS.", + "name": "[NVIDIA GPU OTel] GPU utilization saturated", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "nvidia_gpu" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nvidia_gpu_otel-overview" } + ], + "investigation_guide": { + "blob": "## NVIDIA GPU utilization saturated\n\n### What fired\n`AVG(DCGM_FI_DEV_GPU_UTIL) > 95%` over the 15-minute window.\n\n### Why it matters\nSustained high utilisation is fine for batch training but for inference / serving it means tail latency is rising and capacity headroom is gone \u2014 any traffic burst will queue.\n\n### Triage\n1. Determine the workload type (training vs inference).\n2. For inference: check request queue depth and latency.\n3. 
Inspect framebuffer usage and clock throttling for compounding issues.\n\n### Remediation\n- Scale out by adding GPUs / nodes.\n- Optimise model (quantisation, batching) to reduce GPU time per request.\n\n### Tuning\n- 95% threshold; raise for training fleets where 100% is the goal.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_GPU_UTIL IS NOT NULL\n| STATS\n avg_util = AVG(DCGM_FI_DEV_GPU_UTIL),\n max_util = MAX(DCGM_FI_DEV_GPU_UTIL)\n BY Hostname, UUID, modelName\n// Sustained high utilization above 95% indicates compute saturation\n// Adjust threshold based on workload expectations\n| WHERE avg_util > 95\n| SORT avg_util DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "UUID", + "termSize": 50 } + } } diff --git a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-xid-errors-detected.json b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-xid-errors-detected.json index 4aafab63025..fe4c9397f67 100644 --- a/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-xid-errors-detected.json +++ b/packages/nvidia_gpu_otel/kibana/alerting_rule_template/nvidia_gpu_otel-xid-errors-detected.json @@ -1,40 +1,51 @@ { - "id": "nvidia_gpu_otel-xid-errors-detected", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[NVIDIA GPU OTel] XID errors detected", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "nvidia_gpu" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 1 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_XID_ERRORS IS NOT NULL\n// Non-zero XID errors 
indicate GPU hardware or software faults\n// err_code identifies the specific XID error type\n| WHERE DCGM_FI_DEV_XID_ERRORS > 0\n| STATS\n max_xid = MAX(DCGM_FI_DEV_XID_ERRORS),\n occurrences = COUNT(*)\n BY Hostname, UUID, modelName, err_code, err_msg\n| SORT occurrences DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "UUID", - "termSize": 50 + "id": "nvidia_gpu_otel-xid-errors-detected", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any NVIDIA XID errors are reported (`DCGM_FI_DEV_XID_ERRORS > 0`). XIDs indicate GPU hardware or software faults \u2014 often precede GPU failure.", + "name": "[NVIDIA GPU OTel] XID errors detected", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "nvidia_gpu" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 1 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "nvidia_gpu_otel-overview" } + ], + "investigation_guide": { + "blob": "## NVIDIA GPU XID errors detected\n\n### What fired\n`DCGM_FI_DEV_XID_ERRORS > 0` on a GPU during the window.\n\n### Why it matters\nXID errors are NVIDIA's primary fault signal. Some XIDs (e.g. 13, 31) are application bugs; others (e.g. 79, 119) indicate driver / hardware faults that often precede GPU failure or node loss. Dimensioned by `err_code` and `err_msg`.\n\n### Triage\n1. Identify the affected GPU and the specific XID code from `err_code`.\n2. Cross-reference the XID code against NVIDIA's XID error documentation.\n3. Check `dmesg` and `nvidia-smi` for further context.\n4. 
Inspect host-level logs around the timestamp.\n\n### Remediation\n- Restart the affected workload (most application-class XIDs are recoverable).\n- Reset or RMA the GPU for hardware-class XIDs.\n- Drain workloads from the affected host.\n\n### Tuning\n- Fires on any XID. All XIDs warrant at least a glance \u2014 do not loosen the threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-nvidia_gpu.otel-default\n| WHERE DCGM_FI_DEV_XID_ERRORS IS NOT NULL\n// Non-zero XID errors indicate GPU hardware or software faults\n// err_code identifies the specific XID error type\n| WHERE DCGM_FI_DEV_XID_ERRORS > 0\n| STATS\n max_xid = MAX(DCGM_FI_DEV_XID_ERRORS),\n occurrences = COUNT(*)\n BY Hostname, UUID, modelName, err_code, err_msg\n| SORT occurrences DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "UUID", + "termSize": 50 } + } } diff --git a/packages/nvidia_gpu_otel/manifest.yml b/packages/nvidia_gpu_otel/manifest.yml index ce0c26d07f9..8b3fe1955b2 100644 --- a/packages/nvidia_gpu_otel/manifest.yml +++ b/packages/nvidia_gpu_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: nvidia_gpu_otel title: "NVIDIA GPU OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "NVIDIA GPU Assets for OpenTelemetry Collector" @@ -13,7 +13,7 @@ categories: - os_system conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/oracle_otel/changelog.yml b/packages/oracle_otel/changelog.yml index d1cd3ce205d..1d7e3d80650 100644 --- a/packages/oracle_otel/changelog.yml +++ b/packages/oracle_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the Oracle OTel content package diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-buffer-cache-hit-ratio-low.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-buffer-cache-hit-ratio-low.json index fb24896f53f..7d4406c72f5 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-buffer-cache-hit-ratio-low.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-buffer-cache-hit-ratio-low.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the Oracle buffer cache hit ratio falls below 90%. Below 90% indicates cache thrashing and disk-bound reads.", "name": "[Oracle OTel] Buffer cache hit ratio low", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-overview" + }, + { + "id": "oracle_otel-resource-capacity" + } + ], + "investigation_guide": { + "blob": "## Oracle buffer cache hit ratio low\n\n### What fired\n`1 - physical_reads/logical_reads < 0.90` over the window.\n\n### Why it matters\nOracle's buffer cache absorbs most reads. A sub-90% hit ratio means many reads are going to disk \u2014 a significant performance regression and often a sizing problem.\n\n### Triage\n1. Check SGA sizing and automatic memory management settings.\n2. 
Inspect queries doing many physical reads.\n\n### Remediation\n- Increase SGA / buffer cache size.\n- Tune offending queries.\n\n### Tuning\n- 90% threshold; tune per environment.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-oracledbreceiver.otel-*\n// logical_reads and physical_reads are co-located in Reads TSDB group; INCREASE for counter deltas\n| WHERE oracledb.logical_reads IS NOT NULL\n AND oracledb.physical_reads IS NOT NULL\n| STATS delta_physical = SUM(INCREASE(oracledb.physical_reads)),\n delta_logical = SUM(INCREASE(oracledb.logical_reads))\n BY oracledb.instance.name, host.name\n| WHERE delta_logical > 0\n// Hit ratio = 1 - (physical / logical); < 90% indicates buffer cache thrashing\n// Adjust threshold (e.g. 0.95 for 95%) to tune sensitivity\n| EVAL hit_ratio_pct = ROUND((1.0 - delta_physical / delta_logical) * 100.0, 1)\n| WHERE hit_ratio_pct < 90\n| SORT hit_ratio_pct ASC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-dml-lock-utilisation-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-dml-lock-utilisation-high.json index 4bb03342de5..fdfd02fbaae 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-dml-lock-utilisation-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-dml-lock-utilisation-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when DML lock utilisation exceeds 85% (`dml_locks.usage / dml_locks.limit`). 
Precedes transaction queueing and lock errors.", "name": "[Oracle OTel] DML lock utilisation high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-transaction-health" + }, + { + "id": "oracle_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Oracle DML lock utilisation high\n\n### What fired\nDML lock utilisation exceeded 85% during the window.\n\n### Why it matters\nDML locks are exhausted by concurrent DDL-like operations. Nearing the limit causes session queueing and ORA-00055 errors.\n\n### Triage\n1. Inspect the workload for unusual DDL patterns.\n\n### Remediation\n- Raise `dml_locks` init parameter.\n\n### Tuning\n- 85% threshold; tighten for critical OLTP.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-oracledbreceiver.otel-*\n// DML locks usage and limit are in separate TSDB document groups; aggregate across time window\n| STATS usage = MAX(oracledb.dml_locks.usage),\n limit_val = MAX(oracledb.dml_locks.limit)\n BY oracledb.instance.name, host.name\n// Exclude unlimited (-1)\n| WHERE limit_val > 0\n// Utilisation > 85% precedes lock-related errors and transaction queuing\n// Adjust threshold to tune sensitivity\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-deadlocks-detected.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-deadlocks-detected.json index ea24f4cf40e..121604f5665 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-deadlocks-detected.json +++ 
b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-deadlocks-detected.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `oracledb.enqueue_deadlocks` increases. Deadlocks cause ORA-00060 and rolled-back transactions.", "name": "[Oracle OTel] Enqueue deadlocks detected", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-transaction-health" + }, + { + "id": "oracle_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Oracle enqueue deadlocks detected\n\n### What fired\nCounter `oracledb.enqueue_deadlocks` increased during the window.\n\n### Why it matters\nDeadlocks are resolved by aborting one transaction (ORA-00060). Sustained deadlocks point to inconsistent access order or overly broad transactions.\n\n### Triage\n1. Inspect the alert log for deadlock graphs.\n2. Identify the offending SQL.\n\n### Remediation\n- Enforce consistent access order.\n- Use finer-grained locking.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "TS metrics-oracledbreceiver.otel-*\n// enqueue_deadlocks is a cumulative counter; INCREASE gives new deadlocks in window\n| WHERE oracledb.enqueue_deadlocks IS NOT NULL\n| STATS delta = SUM(INCREASE(oracledb.enqueue_deadlocks))\n BY oracledb.instance.name, host.name\n// Alert on any increase — deadlocks cause ORA-00060 and rolled-back transactions\n// INCREASE handles counter resets on instance restart\n| WHERE delta > 0\n| SORT delta DESC" + "esql": "TS metrics-oracledbreceiver.otel-*\n// enqueue_deadlocks is a cumulative counter; INCREASE gives new deadlocks in window\n| WHERE oracledb.enqueue_deadlocks IS NOT NULL\n| STATS delta = SUM(INCREASE(oracledb.enqueue_deadlocks))\n BY oracledb.instance.name, host.name\n// Alert on any increase \u2014 deadlocks cause 
ORA-00060 and rolled-back transactions\n// INCREASE handles counter resets on instance restart\n| WHERE delta > 0\n| SORT delta DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-lock-utilisation-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-lock-utilisation-high.json index 572d6b69fec..a75e7932cd1 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-lock-utilisation-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-enqueue-lock-utilisation-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when enqueue lock utilisation exceeds 85%. Precedes enqueue wait events and contention.", "name": "[Oracle OTel] Enqueue lock utilisation high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-transaction-health" + }, + { + "id": "oracle_otel-resource-capacity" + } + ], + "investigation_guide": { + "blob": "## Oracle enqueue lock utilisation high\n\n### What fired\nEnqueue locks utilisation exceeded 85%.\n\n### Why it matters\nEnqueues are Oracle's shared-resource locks. Near-limit usage correlates with enqueue waits and ORA errors.\n\n### Triage\n1. 
Identify top enqueue waits from ASH / AWR.\n\n### Remediation\n- Raise enqueue_resources.\n- Address hot-row contention in the workload.\n\n### Tuning\n- 85% threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-oracledbreceiver.otel-*\n// Enqueue locks usage and limit are in separate TSDB document groups; aggregate across time window\n| STATS usage = MAX(oracledb.enqueue_locks.usage),\n limit_val = MAX(oracledb.enqueue_locks.limit)\n BY oracledb.instance.name, host.name\n| WHERE limit_val > 0\n// Utilisation > 85% precedes lock contention and enqueue wait events\n// Adjust threshold to tune sensitivity\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-hard-parse-ratio-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-hard-parse-ratio-high.json index ec03454850f..71ca45223c8 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-hard-parse-ratio-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-hard-parse-ratio-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when the hard-parse ratio exceeds 10%. 
Indicates shared pool pressure or absent bind variables.", "name": "[Oracle OTel] Hard parse ratio high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-sql-execution" + }, + { + "id": "oracle_otel-query-performance" + } + ], + "investigation_guide": { + "blob": "## Oracle hard parse ratio high\n\n### What fired\n`hard_parses / parse_calls > 10%` over the window.\n\n### Why it matters\nHard parses are expensive \u2014 they fully re-plan a statement. High ratios indicate shared pool churn, ad-hoc SQL, or missing bind variables. This is a classic performance antipattern.\n\n### Triage\n1. Look at top literal-heavy SQL in ASH.\n2. Check shared pool size.\n\n### Remediation\n- Force cursor sharing or adopt bind variables.\n- Raise shared pool size.\n\n### Tuning\n- 10% threshold; tighten for OLTP.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-oracledbreceiver.otel-*\n// hard_parses and parse_calls are co-located in Parses TSDB group; INCREASE for counter deltas\n| WHERE oracledb.parse_calls IS NOT NULL\n AND oracledb.hard_parses IS NOT NULL\n| STATS delta_hard = SUM(INCREASE(oracledb.hard_parses)),\n delta_total = SUM(INCREASE(oracledb.parse_calls))\n BY oracledb.instance.name, host.name\n| WHERE delta_total > 0\n// Ratio > 10% indicates shared pool pressure, ad-hoc SQL, or missing bind variables\n// Adjust threshold (e.g. 
0.05 for 5%) for stricter monitoring\n| EVAL hard_parse_ratio_pct = ROUND(delta_hard / delta_total * 100.0, 1)\n| WHERE hard_parse_ratio_pct > 10\n| SORT hard_parse_ratio_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-process-utilisation-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-process-utilisation-high.json index bb9f4901efb..9ec2581a6a6 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-process-utilisation-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-process-utilisation-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when Oracle process utilisation exceeds 85%. Nearing the process limit causes ORA-00020.", "name": "[Oracle OTel] Process utilisation high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-resource-capacity" + }, + { + "id": "oracle_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Oracle process utilisation high\n\n### What fired\n`processes.usage / processes.limit > 85%`.\n\n### Why it matters\nWhen Oracle hits its `processes` init limit, new connections fail with ORA-00020. Approaching the ceiling threatens availability.\n\n### Triage\n1. 
Inspect session / process count trend.\n\n### Remediation\n- Raise `processes` init parameter.\n- Tune client-side pooling.\n\n### Tuning\n- 85% threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-oracledbreceiver.otel-*\n// Process usage and limit are in separate TSDB document groups; aggregate across time window\n| STATS usage = MAX(oracledb.processes.usage),\n limit_val = MAX(oracledb.processes.limit)\n BY oracledb.instance.name, host.name\n| WHERE limit_val > 0\n// Utilisation threshold: 85% — approaching process limit causes ORA-00020 connection refusals\n// Adjust threshold to tune sensitivity (e.g. 80 for earlier warning)\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" + "esql": "FROM metrics-oracledbreceiver.otel-*\n// Process usage and limit are in separate TSDB document groups; aggregate across time window\n| STATS usage = MAX(oracledb.processes.usage),\n limit_val = MAX(oracledb.processes.limit)\n BY oracledb.instance.name, host.name\n| WHERE limit_val > 0\n// Utilisation threshold: 85% \u2014 approaching process limit causes ORA-00020 connection refusals\n// Adjust threshold to tune sensitivity (e.g. 
80 for earlier warning)\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-rollback-to-commit-ratio-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-rollback-to-commit-ratio-high.json index 758dd51807c..2db46a4028b 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-rollback-to-commit-ratio-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-rollback-to-commit-ratio-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when rollback-to-total ratio exceeds 10%. Systemic transaction failures.", "name": "[Oracle OTel] Rollback-to-commit ratio high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-transaction-health" + }, + { + "id": "oracle_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Oracle rollback-to-commit ratio high\n\n### What fired\n`rollbacks / (commits + rollbacks) > 10%`.\n\n### Why it matters\nHigh rollback ratios signal systematic transaction failures \u2014 constraint violations, timeouts, deadlocks. Not a performance metric but a correctness one.\n\n### Triage\n1. 
Inspect ASH / error logs for dominant failure modes.\n\n### Remediation\n- Fix the application logic causing rollbacks.\n\n### Tuning\n- 10% threshold; tighten to 1-5% for critical OLTP.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-oracledbreceiver.otel-*\n// user_commits and user_rollbacks are in separate TSDB groups; INCREASE for counter deltas\n| STATS delta_commits = SUM(INCREASE(oracledb.user_commits)),\n delta_rollbacks = SUM(INCREASE(oracledb.user_rollbacks))\n BY oracledb.instance.name, host.name\n| EVAL total_tx = delta_commits + delta_rollbacks\n| WHERE total_tx > 0\n// Ratio > 10% signals systematic transaction failures (constraint violations, timeouts)\n// Adjust threshold (e.g. 0.05 for 5%) to tune sensitivity\n| EVAL rollback_ratio_pct = ROUND(delta_rollbacks / total_tx * 100.0, 1)\n| WHERE rollback_ratio_pct > 10\n| SORT rollback_ratio_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-session-utilisation-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-session-utilisation-high.json index 78d444c2bf8..d7be03a2836 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-session-utilisation-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-session-utilisation-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when session utilisation exceeds 85%. 
Session exhaustion prevents new connections.", "name": "[Oracle OTel] Session utilisation high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-resource-capacity" + }, + { + "id": "oracle_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Oracle session utilisation high\n\n### What fired\nPeak session count exceeded 85% of the session limit.\n\n### Why it matters\nOracle `sessions` bounds total concurrent user and system sessions. Exhaustion causes connection errors for applications.\n\n### Triage\n1. Check application pool sizing.\n\n### Remediation\n- Raise `sessions` init parameter.\n- Tune pooling.\n\n### Tuning\n- 85% threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-oracledbreceiver.otel-*\n// Sessions usage is split by session_type/session_status; limit is separate document group\n// First aggregate per-timestamp to get total sessions at each scrape point\n| STATS sessions_at_ts = SUM(oracledb.sessions.usage),\n limit_val = MAX(oracledb.sessions.limit)\n BY @timestamp, oracledb.instance.name, host.name\n// Then take peak sessions across the time window\n| STATS peak_sessions = MAX(sessions_at_ts),\n limit_val = MAX(limit_val)\n BY oracledb.instance.name, host.name\n| WHERE limit_val > 0\n// Utilisation threshold: 85% — approaching session limit prevents new connections\n// Adjust threshold to tune sensitivity\n| EVAL utilisation_pct = ROUND(peak_sessions * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" + "esql": "FROM metrics-oracledbreceiver.otel-*\n// Sessions usage is split by session_type/session_status; limit is separate document group\n// First aggregate per-timestamp to get total sessions at each scrape point\n| STATS sessions_at_ts = SUM(oracledb.sessions.usage),\n limit_val = MAX(oracledb.sessions.limit)\n BY @timestamp, 
oracledb.instance.name, host.name\n// Then take peak sessions across the time window\n| STATS peak_sessions = MAX(sessions_at_ts),\n limit_val = MAX(limit_val)\n BY oracledb.instance.name, host.name\n| WHERE limit_val > 0\n// Utilisation threshold: 85% \u2014 approaching session limit prevents new connections\n// Adjust threshold to tune sensitivity\n| EVAL utilisation_pct = ROUND(peak_sessions * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-tablespace-utilisation-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-tablespace-utilisation-high.json index 1ec2c4946be..bd57d5e6ba4 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-tablespace-utilisation-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-tablespace-utilisation-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a tablespace utilisation exceeds 90%. Critical for SYSTEM/SYSAUX; write failures at full.", "name": "[Oracle OTel] Tablespace utilisation high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-resource-capacity" + }, + { + "id": "oracle_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Oracle tablespace utilisation high\n\n### What fired\nTablespace utilisation exceeded 90% during the window.\n\n### Why it matters\nFull tablespaces cause ORA-01653 / ORA-01654 write failures. SYSTEM/SYSAUX full is particularly dangerous.\n\n### Triage\n1. Identify large segments.\n2. 
Check autoextend and maxsize.\n\n### Remediation\n- Extend datafiles or add new ones.\n- Archive/purge historical data.\n\n### Tuning\n- 90% threshold; tighten for SYSTEM/SYSAUX.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-oracledbreceiver.otel-*\n// Tablespace metrics are dimensioned by attributes.tablespace_name\n| WHERE oracledb.tablespace_size.usage IS NOT NULL\n AND oracledb.tablespace_size.limit IS NOT NULL\n| STATS usage = MAX(oracledb.tablespace_size.usage),\n limit_val = MAX(oracledb.tablespace_size.limit)\n BY oracledb.instance.name, host.name, attributes.tablespace_name\n// Exclude unlimited tablespaces (limit -1)\n| WHERE limit_val > 0\n// Utilisation threshold: 90% — critical for SYSTEM/SYSAUX; write failures at full\n// Adjust threshold per environment (e.g. 85 for stricter monitoring)\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 90\n| SORT utilisation_pct DESC" + "esql": "FROM metrics-oracledbreceiver.otel-*\n// Tablespace metrics are dimensioned by attributes.tablespace_name\n| WHERE oracledb.tablespace_size.usage IS NOT NULL\n AND oracledb.tablespace_size.limit IS NOT NULL\n| STATS usage = MAX(oracledb.tablespace_size.usage),\n limit_val = MAX(oracledb.tablespace_size.limit)\n BY oracledb.instance.name, host.name, attributes.tablespace_name\n// Exclude unlimited tablespaces (limit -1)\n| WHERE limit_val > 0\n// Utilisation threshold: 90% \u2014 critical for SYSTEM/SYSAUX; write failures at full\n// Adjust threshold per environment (e.g. 
85 for stricter monitoring)\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 90\n| SORT utilisation_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-transaction-limit-utilisation-high.json b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-transaction-limit-utilisation-high.json index af2424c55a5..92f5d7b303e 100644 --- a/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-transaction-limit-utilisation-high.json +++ b/packages/oracle_otel/kibana/alerting_rule_template/oracle_otel-transaction-limit-utilisation-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when transaction utilisation exceeds 85%. Limit exhaustion prevents new transactions.", "name": "[Oracle OTel] Transaction limit utilisation high", "ruleTypeId": ".es-query", "tags": [ @@ -19,13 +20,28 @@ "lookBackWindow": 8, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "oracle_otel-resource-capacity" + }, + { + "id": "oracle_otel-transaction-health" + } + ], + "investigation_guide": { + "blob": "## Oracle transaction limit utilisation high\n\n### What fired\n`transactions.usage / transactions.limit > 85%`.\n\n### Why it matters\nThe `transactions` init parameter bounds concurrent active transactions. Exhaustion halts new DML.\n\n### Triage\n1. 
Look at the active transaction pattern.\n\n### Remediation\n- Raise `transactions` init parameter.\n\n### Tuning\n- 85% threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { - "esql": "FROM metrics-oracledbreceiver.otel-*\n// Transaction usage and limit are in separate TSDB document groups; aggregate across time window\n| STATS usage = MAX(oracledb.transactions.usage),\n limit_val = MAX(oracledb.transactions.limit)\n BY oracledb.instance.name, host.name\n// Exclude unlimited (-1)\n| WHERE limit_val > 0\n// Utilisation > 85% — transaction limit exhaustion prevents new transactions\n// Adjust threshold to tune sensitivity\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" + "esql": "FROM metrics-oracledbreceiver.otel-*\n// Transaction usage and limit are in separate TSDB document groups; aggregate across time window\n| STATS usage = MAX(oracledb.transactions.usage),\n limit_val = MAX(oracledb.transactions.limit)\n BY oracledb.instance.name, host.name\n// Exclude unlimited (-1)\n| WHERE limit_val > 0\n// Utilisation > 85% \u2014 transaction limit exhaustion prevents new transactions\n// Adjust threshold to tune sensitivity\n| EVAL utilisation_pct = ROUND(usage * 100.0 / limit_val, 1)\n| WHERE utilisation_pct > 85\n| SORT utilisation_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/oracle_otel/manifest.yml b/packages/oracle_otel/manifest.yml index ac6ebea4453..c8af316e5a9 100644 --- a/packages/oracle_otel/manifest.yml +++ b/packages/oracle_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.0 name: oracle_otel title: "Oracle OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Oracle Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + 
version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/postgresql_otel/changelog.yml b/packages/postgresql_otel/changelog.yml index 8cbe71e78de..147f468287e 100644 --- a/packages/postgresql_otel/changelog.yml +++ b/packages/postgresql_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.4.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.3.0" changes: - description: Add dashboards and add SLO, alert assets diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-backend-buffer-writes.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-backend-buffer-writes.json index a3ed94587ba..117dcf5e0ed 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-backend-buffer-writes.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-backend-buffer-writes.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when backend-sourced buffer writes exceed 100 in the window. Background writer cannot keep up \u2014 write pressure.", "name": "[PostgreSQL OTel] Backend buffer writes high", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-io-health" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL backend buffer writes high\n\n### What fired\n`postgresql.bgwriter.buffers.writes` with `source=backend` summed > 100 in the window.\n\n### Why it matters\nWhen the bgwriter cannot keep up, backends must flush dirty buffers themselves \u2014 hurting query latency. Sustained backend writes predict I/O contention.\n\n### Triage\n1. Inspect `bgwriter.maxwritten` stops.\n2. 
Check checkpoint frequency.\n\n### Remediation\n- Tune bgwriter parameters.\n- Upgrade disk IOPS.\n\n### Tuning\n- Threshold 100; tune per scrape cadence.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-bgwriter-maxwritten.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-bgwriter-maxwritten.json index 1221af99480..39594f1b492 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-bgwriter-maxwritten.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-bgwriter-maxwritten.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `postgresql.bgwriter.maxwritten > 0`. bgwriter stopped because it hit the per-round write ceiling \u2014 structural write pressure.", "name": "[PostgreSQL OTel] Bgwriter maxwritten stops", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 4, "statusChangeThreshold": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-io-health" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL bgwriter maxwritten stops\n\n### What fired\n`bgwriter.maxwritten` increased during the window.\n\n### Why it matters\nBgwriter stops writing when it exceeds `bgwriter_lru_maxpages` in a round. Sustained stops mean the write workload exceeds bgwriter capacity.\n\n### Triage\n1. 
Check `bgwriter_delay` and `_maxpages`.\n\n### Remediation\n- Tune bgwriter more aggressively.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-connection-utilization-high.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-connection-utilization-high.json index f8d91fbb778..594f71b8ed1 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-connection-utilization-high.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-connection-utilization-high.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `backends / connection.max > 80%`. Near-exhaustion blocks new clients.", "name": "[PostgreSQL OTel] Connection utilization high", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-connections" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL connection utilization high\n\n### What fired\n`total backends / connection.max > 80%`.\n\n### Why it matters\nExhausting `max_connections` prevents new clients from connecting \u2014 application outage. Usually indicates a pool misconfiguration.\n\n### Triage\n1. Inspect app-side pools.\n2. 
Look for long-idle backends.\n\n### Remediation\n- Fix pooling / add pgbouncer.\n- Raise `max_connections` only with memory headroom.\n\n### Tuning\n- 80% threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-deadlocks-detected.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-deadlocks-detected.json index 294fa677c82..24fb08a15c1 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-deadlocks-detected.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-deadlocks-detected.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `postgresql.deadlocks > 0` per database. Concurrency issues in the application.", "name": "[PostgreSQL OTel] Deadlocks detected", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 4, "statusChangeThreshold": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-locks" + }, + { + "id": "postgresql_otel-workload" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL deadlocks detected\n\n### What fired\n`postgresql.deadlocks` increased during the window.\n\n### Why it matters\nDeadlocks abort a transaction to break a cycle. Sustained deadlocks point to inconsistent access order.\n\n### Triage\n1. Inspect server log for deadlock details.\n2. 
Look at application access patterns.\n\n### Remediation\n- Fix access ordering.\n\n### Tuning\n- Fires on any non-zero count.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-exclusive-lock-contention.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-exclusive-lock-contention.json index 92f2300035b..7f6e99885b2 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-exclusive-lock-contention.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-exclusive-lock-contention.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `ExclusiveLock` or `AccessExclusiveLock` counts exceed 5. Exclusive locks block all other access.", "name": "[PostgreSQL OTel] Exclusive lock contention", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-locks" + }, + { + "id": "postgresql_otel-workload" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL exclusive lock contention\n\n### What fired\nCount of exclusive locks exceeded 5 during the window.\n\n### Why it matters\nExclusiveLock blocks most other lock modes; AccessExclusiveLock blocks everything. Accumulating exclusive locks usually means a DDL-like statement is running or stuck.\n\n### Triage\n1. 
Check for long-running DDL / maintenance.\n\n### Remediation\n- Finish or kill the long-running statement.\n\n### Tuning\n- 5 locks; tighten per environment.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-rollback-rate.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-rollback-rate.json index 7c756ecfc46..114dbabc155 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-rollback-rate.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-rollback-rate.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when rollbacks / total > 1% with at least 10 commits. Application errors or contention.", "name": "[PostgreSQL OTel] High rollback rate", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-workload" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL high rollback rate\n\n### What fired\nRollback rate exceeded 1% over the window (min 10 commits).\n\n### Why it matters\nRollbacks above baseline indicate application errors, deadlocks, or contention. Healthy systems run at near-zero rollback rate.\n\n### Triage\n1. Correlate with deadlocks and lock waits.\n2. 
Inspect application logs.\n\n### Remediation\n- Fix application logic.\n\n### Tuning\n- 1% threshold; tighten for critical workloads.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-sequential-scans.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-sequential-scans.json index fea5c9df00a..1307b91cff2 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-sequential-scans.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-sequential-scans.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when sequential scans exceed 1000 in the window. Missing indexes or bad query plans.", "name": "[PostgreSQL OTel] High sequential scans", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-query-performance" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL high sequential scans\n\n### What fired\nSummed sequential scans exceeded 1000 during the window.\n\n### Why it matters\nSustained seqscans on large tables indicate missing indexes or plans that ignore existing indexes. Hurts performance and disk I/O.\n\n### Triage\n1. Identify the top offending tables.\n2. 
Inspect query plans.\n\n### Remediation\n- Add appropriate indexes.\n- Refresh planner statistics.\n\n### Tuning\n- 1000 threshold; tune to table count.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-temp-io.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-temp-io.json index 26df24f6a80..ab5db73ea15 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-temp-io.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-high-temp-io.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `postgresql.temp.io > 100 MB` in the window. Queries spilling to disk due to insufficient work_mem.", "name": "[PostgreSQL OTel] High temp I/O volume", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-query-performance" + }, + { + "id": "postgresql_otel-io-health" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL high temp I/O volume\n\n### What fired\nTemp bytes written exceeded 100 MB during the window.\n\n### Why it matters\nTemp files mean sort or hash operations exceeded `work_mem` and spilled to disk. Slower by orders of magnitude.\n\n### Triage\n1. 
Identify the slow queries spilling.\n\n### Remediation\n- Raise `work_mem` for sessions/roles.\n- Rewrite queries to reduce memory need.\n\n### Tuning\n- 100 MB threshold.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-checkpoint-duration.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-checkpoint-duration.json index 520927b8c40..8f24ef947b4 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-checkpoint-duration.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-checkpoint-duration.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when total checkpoint duration exceeds 30 s in the window. Long checkpoints cause I/O spikes.", "name": "[PostgreSQL OTel] Long checkpoint duration", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-io-health" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL long checkpoint duration\n\n### What fired\nTotal `bgwriter.duration` exceeded 30,000 ms over the window.\n\n### Why it matters\nCheckpoints stall writes and stress disk. Long durations indicate write pressure or too-infrequent checkpoints.\n\n### Triage\n1. 
Tune checkpoint parameters.\n\n### Remediation\n- Raise `max_wal_size` / tune `checkpoint_timeout`.\n\n### Tuning\n- 30 s threshold; tune to cadence.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-running-queries.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-running-queries.json index 29ba063efee..4794a68e6b5 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-running-queries.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-long-running-queries.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when active queries run longer than 5 minutes (`db.server.query_sample`). Long queries may block others.", "name": "[PostgreSQL OTel] Long-running queries", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 4, "statusChangeThreshold": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-active-queries" + }, + { + "id": "postgresql_otel-query-performance" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL long-running queries\n\n### What fired\nActive query sample with `total_exec_time > 300000 ms` (5 min).\n\n### Why it matters\nLong-running queries hold locks, consume memory, and block VACUUM. Occasional long queries are expected (reports, maintenance); unexpected ones need intervention.\n\n### Triage\n1. Inspect the query and its plan.\n2. 
Check wait events for the session.\n\n### Remediation\n- Kill rogue queries.\n- Add timeouts (`statement_timeout`).\n\n### Tuning\n- 5 min threshold; tune to SLA.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-low-buffer-hit-ratio.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-low-buffer-hit-ratio.json index fc1e9d568aa..d9f8d3d402c 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-low-buffer-hit-ratio.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-low-buffer-hit-ratio.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when buffer cache hit ratio drops below 99%. Cache pressure pushes reads to disk.", "name": "[PostgreSQL OTel] Low buffer hit ratio", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-io-health" + }, + { + "id": "postgresql_otel-overview" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL low buffer hit ratio\n\n### What fired\n`blks_hit / (blks_hit + blks_read) < 99%` with at least 1000 reads.\n\n### Why it matters\nHealthy PostgreSQL runs at > 99% hit ratio. Sub-99% indicates shared_buffers pressure or a working-set change.\n\n### Triage\n1. Check `shared_buffers` sizing.\n2. 
Inspect recent workload changes.\n\n### Remediation\n- Raise `shared_buffers`.\n- Tune expensive queries.\n\n### Tuning\n- 99% threshold; lower for read-heavy analytic workloads.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-queries-waiting-on-locks.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-queries-waiting-on-locks.json index 00aa6e48181..27cd919a8fa 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-queries-waiting-on-locks.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-queries-waiting-on-locks.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when any queries are blocked on Lock wait events. Direct blocking indicator.", "name": "[PostgreSQL OTel] Queries waiting on locks", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 4, "statusChangeThreshold": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-locks" + }, + { + "id": "postgresql_otel-active-queries" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL queries waiting on locks\n\n### What fired\n`wait_event_type=Lock` samples > 0 during the window.\n\n### Why it matters\nQueries blocked on lock waits experience latency directly proportional to the blocker's duration. Cascading blocking storms can take down an app.\n\n### Triage\n1. 
Find the blocker via `pg_stat_activity` / `pg_locks`.\n\n### Remediation\n- Kill or finish the blocker.\n- Reduce transaction hold time.\n\n### Tuning\n- Fires on any blocked query.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-slow-top-queries.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-slow-top-queries.json index 2c5fbacf785..3dc5927904e 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-slow-top-queries.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-slow-top-queries.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a `db.server.top_query` delta total_exec_time exceeds 10 s. Top queries consuming disproportionate time.", "name": "[PostgreSQL OTel] Slow top queries", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-query-performance" + }, + { + "id": "postgresql_otel-active-queries" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL slow top queries\n\n### What fired\n`attributes.postgresql.total_exec_time > 10000 ms` delta per scrape on a top query.\n\n### Why it matters\nTop queries from pg_stat_statements dominate total workload. A query consuming 10+ seconds per scrape interval is a prime optimisation target.\n\n### Triage\n1. Inspect the query text and plan.\n2. 
Check for missing indexes / poor stats.\n\n### Remediation\n- Optimise the query.\n- Consider query rewrite or index changes.\n\n### Tuning\n- 10 s threshold; tune to scrape cadence.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-temp-files-created.json b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-temp-files-created.json index 18ac05d04ba..031ec6862c3 100644 --- a/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-temp-files-created.json +++ b/packages/postgresql_otel/kibana/alerting_rule_template/postgresql_otel-temp-files-created.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `postgresql.temp_files > 0`. Queries spilling to disk indicates work_mem pressure.", "name": "[PostgreSQL OTel] Temp files created", "ruleTypeId": ".es-query", "tags": [ @@ -20,6 +21,19 @@ "lookBackWindow": 4, "statusChangeThreshold": 2 }, + "artifacts": { + "dashboards": [ + { + "id": "postgresql_otel-query-performance" + }, + { + "id": "postgresql_otel-io-health" + } + ], + "investigation_guide": { + "blob": "## PostgreSQL temp files created\n\n### What fired\n`postgresql.temp_files` increased during the window.\n\n### Why it matters\nAny temp file creation means a query exceeded `work_mem` and spilled to disk. Frequent spilling is a strong tuning signal.\n\n### Triage\n1. 
Identify the queries causing temp files.\n\n### Remediation\n- Raise `work_mem`.\n- Rewrite queries to reduce memory.\n\n### Tuning\n- Fires on any creation.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { diff --git a/packages/postgresql_otel/manifest.yml b/packages/postgresql_otel/manifest.yml index a2150382863..c07b57a9d5a 100644 --- a/packages/postgresql_otel/manifest.yml +++ b/packages/postgresql_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.0 name: postgresql_otel title: "PostgreSQL OpenTelemetry Assets" -version: 0.3.0 +version: 0.4.0 source: license: "Elastic-2.0" description: "PostgreSQL Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/rabbitmq_otel/changelog.yml b/packages/rabbitmq_otel/changelog.yml index 36374b8c24a..8019a35042a 100644 --- a/packages/rabbitmq_otel/changelog.yml +++ b/packages/rabbitmq_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: First release of the RabbitMQ OpenTelemetry content package diff --git a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-disk-alarm.json b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-disk-alarm.json index db4261fad09..6d69b36dc56 100644 --- a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-disk-alarm.json +++ b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-disk-alarm.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when a RabbitMQ node raises the disk free alarm (`rabbitmq.node.disk_free_alarm > 0`). 
Publishing is halted on the affected node.", "name": "[RabbitMQ OTel] Disk alarm triggered", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "rabbitmq_otel-nodes" + }, + { + "id": "rabbitmq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## RabbitMQ disk alarm triggered\n\n### What fired\n`rabbitmq.node.disk_free_alarm > 0` on at least one node.\n\n### Why it matters\nWhen a node's free disk drops below the configured threshold, RabbitMQ halts all publishing on that node to protect against running out of space entirely.\n\n### Triage\n1. Identify the affected node and check disk usage.\n2. Inspect queue depths and retention.\n\n### Remediation\n- Free disk space (archive / delete old data).\n- Grow the volume.\n\n### Tuning\n- Always P1.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-rabbitmqreceiver.otel-default\n// Disk alarm: when triggered (1), publishing is halted\n// Requires node-level metrics enabled in receiver config\n| WHERE metrics.rabbitmq.node.disk_free_alarm IS NOT NULL\n AND metrics.rabbitmq.node.disk_free_alarm > 0\n| STATS alarm_count = COUNT(*)\n BY resource.attributes.rabbitmq.node.name\n| WHERE alarm_count > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-ready-queue-depth.json b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-ready-queue-depth.json index d064bad5590..7f4204b0e9c 100644 --- a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-ready-queue-depth.json +++ b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-ready-queue-depth.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": 
"Alerts when any queue has more than 1000 ready messages. Consumers are not keeping up.", "name": "[RabbitMQ OTel] High ready queue depth", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "rabbitmq_otel-queues" + }, + { + "id": "rabbitmq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## RabbitMQ high ready queue depth\n\n### What fired\nReady message count exceeded 1000 on a queue.\n\n### Why it matters\nReady messages are waiting for a consumer. Growing queue depth means consumption cannot keep up with production.\n\n### Triage\n1. Check consumer count and ack rate.\n2. Inspect flow state (is the queue under back-pressure?).\n\n### Remediation\n- Scale consumers.\n- Fix slow consumers.\n\n### Tuning\n- 1000 threshold; tune to workload.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-rabbitmqreceiver.otel-default\n// Filter to ready messages (waiting for delivery)\n| WHERE attributes.state == \"ready\"\n AND metrics.rabbitmq.message.current IS NOT NULL\n// Get max ready count per queue in the time window\n| STATS ready_count = MAX(metrics.rabbitmq.message.current)\n BY resource.attributes.rabbitmq.queue.name,\n resource.attributes.rabbitmq.vhost.name,\n resource.attributes.rabbitmq.node.name\n// Alert threshold: adjust 1000 to tune for expected backlog size\n| WHERE ready_count > 1000\n| SORT ready_count DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-unacknowledged-messages.json b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-unacknowledged-messages.json index daf8fbd1d85..98d0415edc4 100644 --- 
a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-unacknowledged-messages.json +++ b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-high-unacknowledged-messages.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when unacked messages exceed 500 on a queue. Consumers delivered but never acked.", "name": "[RabbitMQ OTel] High unacknowledged message count", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "rabbitmq_otel-queues" + }, + { + "id": "rabbitmq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## RabbitMQ high unacknowledged message count\n\n### What fired\nUnacknowledged message count exceeded 500 on a queue.\n\n### Why it matters\nUnacked messages are delivered but not yet confirmed by the consumer. Growing unacked counts mean consumers are slow, stuck, or crashing before ack.\n\n### Triage\n1. Check consumer processing time.\n2. 
Look for consumer crashes.\n\n### Remediation\n- Raise prefetch thoughtfully or reduce it to protect memory.\n- Fix consumer code.\n\n### Tuning\n- 500 threshold; tune to workload.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-rabbitmqreceiver.otel-default\n// Unacknowledged: delivered but not yet acked; buildup indicates consumer issues\n| WHERE attributes.state == \"unacknowledged\"\n AND metrics.rabbitmq.message.current IS NOT NULL\n| STATS unack_count = MAX(metrics.rabbitmq.message.current)\n BY resource.attributes.rabbitmq.queue.name,\n resource.attributes.rabbitmq.vhost.name,\n resource.attributes.rabbitmq.node.name\n// Alert threshold: adjust 500 to tune for consumer processing expectations\n| WHERE unack_count > 500\n| SORT unack_count DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-memory-alarm.json b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-memory-alarm.json index 81aef65ed69..a36ccd2ff76 100644 --- a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-memory-alarm.json +++ b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-memory-alarm.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `rabbitmq.node.mem_alarm > 0`. 
The node has raised the memory alarm \u2014 producers are blocked.", "name": "[RabbitMQ OTel] Memory alarm triggered", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "rabbitmq_otel-nodes" + }, + { + "id": "rabbitmq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## RabbitMQ memory alarm triggered\n\n### What fired\n`rabbitmq.node.mem_alarm > 0` during the window.\n\n### Why it matters\nWhen memory use exceeds the configured watermark, RabbitMQ blocks publishers. Publishers will appear to hang until memory is released.\n\n### Triage\n1. Check queue length and memory per queue.\n2. Look for unacked messages holding memory.\n\n### Remediation\n- Drain large queues / kill stuck consumers.\n- Raise `vm_memory_high_watermark`.\n\n### Tuning\n- Always P1.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-rabbitmqreceiver.otel-default\n// Memory alarm: when triggered (1), producers are blocked\n// Requires node-level metrics enabled in receiver config\n| WHERE metrics.rabbitmq.node.mem_alarm IS NOT NULL\n AND metrics.rabbitmq.node.mem_alarm > 0\n| STATS alarm_count = COUNT(*)\n BY resource.attributes.rabbitmq.node.name\n| WHERE alarm_count > 0" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 5, diff --git a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-queue-zero-consumers.json b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-queue-zero-consumers.json index 91c1e37492e..7edcdffe6d7 100644 --- a/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-queue-zero-consumers.json +++ b/packages/rabbitmq_otel/kibana/alerting_rule_template/rabbitmq_otel-queue-zero-consumers.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts 
when a queue has ready messages but zero consumers. No one is processing the queue.", "name": "[RabbitMQ OTel] Queue with zero consumers and ready messages", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 10, "statusChangeThreshold": 4 }, + "artifacts": { + "dashboards": [ + { + "id": "rabbitmq_otel-queues" + }, + { + "id": "rabbitmq_otel-overview" + } + ], + "investigation_guide": { + "blob": "## RabbitMQ queue with zero consumers and ready messages\n\n### What fired\nQueue has `ready > 0` and `consumer_count == 0`.\n\n### Why it matters\nReady messages with no consumer mean the workload is silently stalling \u2014 often a consumer deployment that never came back up.\n\n### Triage\n1. Check consumer services and deployments.\n\n### Remediation\n- Restart consumers.\n\n### Tuning\n- Filter out queues that are intentionally dormant.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-rabbitmqreceiver.otel-default\n| WHERE resource.attributes.rabbitmq.queue.name IS NOT NULL\n// Aggregate ready messages and consumer count per queue\n| STATS\n ready_count = MAX(metrics.rabbitmq.message.current) WHERE attributes.state == \"ready\",\n consumer_count = MAX(metrics.rabbitmq.consumer.count)\n BY resource.attributes.rabbitmq.queue.name,\n resource.attributes.rabbitmq.vhost.name,\n resource.attributes.rabbitmq.node.name\n// Alert when queue has messages but no consumers\n| WHERE ready_count > 0 AND COALESCE(consumer_count, TO_LONG(0)) == 0\n| SORT ready_count DESC\n| LIMIT 20" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/rabbitmq_otel/manifest.yml b/packages/rabbitmq_otel/manifest.yml index c711e4e5362..9a80e1663b9 100644 --- a/packages/rabbitmq_otel/manifest.yml +++ b/packages/rabbitmq_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: rabbitmq_otel title: "RabbitMQ OpenTelemetry assets" 
-version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "RabbitMQ Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/redis_otel/changelog.yml b/packages/redis_otel/changelog.yml index a1924fcd58b..700465cac7d 100644 --- a/packages/redis_otel/changelog.yml +++ b/packages/redis_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-high-eviction-rate.json b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-high-eviction-rate.json index c044957b426..87845ae6e0d 100644 --- a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-high-eviction-rate.json +++ b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-high-eviction-rate.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `INCREASE(redis.keys.evicted) > 100` in the window. Memory pressure forces key removal.", "name": "[Redis OTel] High eviction rate", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "redis_otel-memory-persistence" + }, + { + "id": "redis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Redis high eviction rate\n\n### What fired\nMore than 100 keys evicted during the window.\n\n### Why it matters\nEvictions happen when Redis reaches `maxmemory` and enforces its policy. Normal for cache workloads; problematic for data-store workloads (means data loss).\n\n### Triage\n1. 
Check memory trend vs `maxmemory`.\n2. Inspect key volume per database.\n\n### Remediation\n- Raise `maxmemory` or adopt an eviction policy that matches workload.\n\n### Tuning\n- 100 threshold; tune per workload.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-redisreceiver.otel-*\n| WHERE redis.keys.evicted IS NOT NULL\n// Counter: use INCREASE to get evictions over the time window\n| STATS evicted = SUM(INCREASE(redis.keys.evicted))\n BY server.address, server.port\n// Alert when evictions exceed threshold (adjust based on workload)\n// Sudden spikes indicate memory pressure; sustained evictions may be normal for cache workloads\n| WHERE evicted > 100\n| SORT evicted DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-fragmentation-swapping.json b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-fragmentation-swapping.json index dca10b2ddfa..17a8ad407fd 100644 --- a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-fragmentation-swapping.json +++ b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-fragmentation-swapping.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `redis.memory.fragmentation_ratio < 1.0`. 
RSS below used means Redis is swapping to disk.", "name": "[Redis OTel] Memory fragmentation indicates swapping", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "redis_otel-memory-persistence" + }, + { + "id": "redis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Redis memory fragmentation indicates swapping\n\n### What fired\n`redis.memory.fragmentation_ratio < 1.0` during the window.\n\n### Why it matters\nFragmentation ratio < 1.0 means the OS reports less RSS than Redis's logical used memory \u2014 Redis pages are in swap. Swap completely destroys Redis latency.\n\n### Triage\n1. Check host swap usage.\n2. Inspect memory pressure on the VM.\n\n### Remediation\n- Disable swap for Redis / add RAM.\n\n### Tuning\n- Always P1.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-redisreceiver.otel-*\n| WHERE data_stream.dataset == \"redisreceiver.otel\"\n AND redis.memory.fragmentation_ratio IS NOT NULL\n// Fragmentation ratio < 1.0 means RSS < used memory: Redis is swapping to disk\n| STATS avg_frag = AVG(redis.memory.fragmentation_ratio)\n BY server.address, server.port\n| WHERE avg_frag < 1.0\n| SORT avg_frag ASC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-near-max.json b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-near-max.json index a267f2b1a6c..e57e7d55070 100644 --- a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-near-max.json +++ b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-memory-near-max.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when `redis.memory.used / redis.maxmemory > 90%`. 
Approaching maxmemory triggers evictions or OOM.", "name": "[Redis OTel] Memory approaching maxmemory", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,28 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "redis_otel-memory-persistence" + }, + { + "id": "redis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Redis memory approaching maxmemory\n\n### What fired\nMemory usage exceeded 90% of `maxmemory`.\n\n### Why it matters\nAt 100% maxmemory, Redis either evicts keys (cache workloads) or OOMs (no eviction policy). Both are user-visible.\n\n### Triage\n1. Correlate with eviction and fragmentation trends.\n\n### Remediation\n- Raise `maxmemory` or reduce key set.\n\n### Tuning\n- 90% threshold; tune per workload.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-redisreceiver.otel-*\n| WHERE data_stream.dataset == \"redisreceiver.otel\"\n AND redis.memory.used IS NOT NULL\n AND redis.maxmemory IS NOT NULL\n AND redis.maxmemory > 0\n// Used memory as percentage of maxmemory\n| STATS used = MAX(redis.memory.used),\n max_mem = MAX(redis.maxmemory)\n BY server.address, server.port\n| EVAL usage_pct = ROUND(used / max_mem * 100.0, 2)\n// Alert when usage exceeds 90%; adjust threshold as needed\n| WHERE usage_pct > 90.0\n| SORT usage_pct DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-rejected-connections.json b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-rejected-connections.json index c1f4ae886d4..9b9a5806aad 100644 --- a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-rejected-connections.json +++ b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-rejected-connections.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + 
"description": "Alerts when `INCREASE(redis.connections.rejected) > 0`. `maxclients` reached \u2014 new clients refused.", "name": "[Redis OTel] Rejected connections", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "redis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Redis rejected connections\n\n### What fired\n`redis.connections.rejected` increased during the window.\n\n### Why it matters\nRejections happen when `maxclients` is reached. Clients hit connection errors and often retry storm, worsening the situation.\n\n### Triage\n1. Inspect current connected clients vs `maxclients`.\n2. Check client pool configurations.\n\n### Remediation\n- Raise `maxclients` and OS FD limits.\n- Fix client-side pooling.\n\n### Tuning\n- Fires on any increase.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "TS metrics-redisreceiver.otel-*\n| WHERE redis.connections.rejected IS NOT NULL\n// Counter: use INCREASE to get rejections over the time window\n| STATS rejected = SUM(INCREASE(redis.connections.rejected))\n BY server.address, server.port\n// Alert when any instance rejected at least one connection (maxclients reached)\n| WHERE rejected > 0\n| SORT rejected DESC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-replicas-disconnected.json b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-replicas-disconnected.json index fa8df7a3e3c..e46e5091525 100644 --- a/packages/redis_otel/kibana/alerting_rule_template/redis_otel-replicas-disconnected.json +++ b/packages/redis_otel/kibana/alerting_rule_template/redis_otel-replicas-disconnected.json @@ -3,6 +3,7 @@ "type": "alerting_rule_template", "managed": true, "attributes": { + "description": "Alerts when 
`redis.slaves.connected == 0`. Primary has no replicas \u2014 HA / read scaling lost.", "name": "[Redis OTel] Primary has no connected replicas", "ruleTypeId": ".es-query", "tags": [ @@ -20,13 +21,25 @@ "lookBackWindow": 6, "statusChangeThreshold": 3 }, + "artifacts": { + "dashboards": [ + { + "id": "redis_otel-overview" + } + ], + "investigation_guide": { + "blob": "## Redis primary has no connected replicas\n\n### What fired\n`redis.slaves.connected == 0` during the window.\n\n### Why it matters\nA primary without replicas cannot fail over and cannot serve replica reads. The failover SLA is compromised.\n\n### Triage\n1. Check replica processes / Sentinel / Cluster state.\n2. Inspect the replication link.\n\n### Remediation\n- Restore replicas or fix replication.\n\n### Tuning\n- Filter out standalone instances.\n" + } + }, "params": { "searchType": "esqlQuery", "esqlQuery": { "esql": "FROM metrics-redisreceiver.otel-*\n| WHERE data_stream.dataset == \"redisreceiver.otel\"\n AND redis.slaves.connected IS NOT NULL\n// Filter to primaries: role is primary when attributes.role exists, or check replication metrics\n// redis.slaves.connected is only relevant on primaries (replicas report 0)\n| STATS connected_replicas = MAX(redis.slaves.connected)\n BY server.address, server.port\n// Alert when a primary expects replicas but has none (adjust if standalone is acceptable)\n// Use 0 to alert on any instance with zero replicas; for HA setups this indicates risk\n| WHERE connected_replicas == 0\n| SORT server.address ASC" }, "size": 0, - "threshold": [0], + "threshold": [ + 0 + ], "thresholdComparator": ">", "timeField": "@timestamp", "timeWindowSize": 15, diff --git a/packages/redis_otel/manifest.yml b/packages/redis_otel/manifest.yml index 7a8b9474fe0..61ed2781292 100644 --- a/packages/redis_otel/manifest.yml +++ b/packages/redis_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.5 name: redis_otel title: "Redis OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 
source: license: "Elastic-2.0" description: "Redis Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/redisenterprise_otel/changelog.yml b/packages/redisenterprise_otel/changelog.yml index 48579674983..813ebfa8abb 100644 --- a/packages/redisenterprise_otel/changelog.yml +++ b/packages/redisenterprise_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-certificate-expiring.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-certificate-expiring.json index 24b228e9fa7..364ebff8859 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-certificate-expiring.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-certificate-expiring.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-certificate-expiring", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] TLS certificate expiring soon", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "1h" + "id": "redisenterprise_otel-certificate-expiring", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any Redis Enterprise certificate expires within 30 days. 
Expired certs break intra-cluster and client TLS.", + "name": "[Redis Enterprise OTel] TLS certificate expiring soon", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "1h" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 5, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-node-health" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 5, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_cert_expiration_seconds IS NOT NULL\n| STATS min_expiry_s = MIN(node_cert_expiration_seconds) BY node, path, cluster\n| EVAL days_until_expiry = ROUND(min_expiry_s / 86400, 1)\n// Alert when any certificate expires within 30 days\n// Expired certificates break intra-cluster communication and client TLS\n// Adjust threshold: 30 days for early warning, 7 days is critical\n// The 'path' dimension identifies which certificate file is expiring\n| WHERE days_until_expiry < 30\n| SORT days_until_expiry ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 6, - "timeWindowUnit": "h", - "groupBy": "row", - "termField": "node", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise TLS certificate expiring soon\n\n### What fired\nNode certificate expiration less than 30 days.\n\n### Why it matters\nRedis Enterprise uses certificates for intra-cluster mesh and client TLS. Expiration causes silent cluster partitioning and client handshake failures.\n\n### Triage\n1. Identify which certificate path is expiring.\n2. 
Check the certificate rotation tooling.\n\n### Remediation\n- Renew the cert via `rladmin` or the management UI.\n- Restart services if required.\n\n### Tuning\n- 30 days lead time; consider a 60-day warning.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_cert_expiration_seconds IS NOT NULL\n| STATS min_expiry_s = MIN(node_cert_expiration_seconds) BY node, path, cluster\n| EVAL days_until_expiry = ROUND(min_expiry_s / 86400, 1)\n// Alert when any certificate expires within 30 days\n// Expired certificates break intra-cluster communication and client TLS\n// Adjust threshold: 30 days for early warning, 7 days is critical\n// The 'path' dimension identifies which certificate file is expiring\n| WHERE days_until_expiry < 30\n| SORT days_until_expiry ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 6, + "timeWindowUnit": "h", + "groupBy": "row", + "termField": "node", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-down.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-down.json index 61bfc554ccb..c9f8a18dec1 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-down.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-down.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-database-down", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Database down", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "1m" + "id": "redisenterprise_otel-database-down", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when 
`bdb_up < 1`. The database is reported down \u2014 clients cannot read or write.", + "name": "[Redis Enterprise OTel] Database down", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-database-performance" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_up IS NOT NULL\n| STATS min_up = MIN(bdb_up) BY bdb, cluster\n// Alert when any database reports as down (bdb_up < 1)\n// This is a critical alert — a down database means clients cannot read or write\n| WHERE min_up < 1" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "bdb", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise database down\n\n### What fired\n`bdb_up < 1` during the window.\n\n### Why it matters\nA down database is a complete outage for its clients. Usually caused by shard failures or cluster management issues.\n\n### Triage\n1. Check shard status.\n2. 
Inspect node health.\n\n### Remediation\n- Restore shards / failover replicas.\n\n### Tuning\n- Always P1.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_up IS NOT NULL\n| STATS min_up = MIN(bdb_up) BY bdb, cluster\n// Alert when any database reports as down (bdb_up < 1)\n// This is a critical alert \u2014 a down database means clients cannot read or write\n| WHERE min_up < 1" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "bdb", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-high-latency.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-high-latency.json index ef666a29472..5808d0074d0 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-high-latency.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-high-latency.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-database-high-latency", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Database high latency", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "1m" + "id": "redisenterprise_otel-database-high-latency", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when average database latency exceeds 5 ms. 
Redis should be sub-millisecond.", + "name": "[Redis Enterprise OTel] Database high latency", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-database-performance" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_avg_latency IS NOT NULL\n| STATS avg_latency_s = AVG(bdb_avg_latency) BY bdb, cluster\n| EVAL avg_latency_ms = ROUND(avg_latency_s * 1000, 2)\n// Threshold: 5ms average latency — adjust based on your SLA\n// Redis should deliver sub-millisecond responses; sustained >5ms warrants investigation\n// Common causes: memory pressure, fork operations (AOF/RDB), hot shards\n| WHERE avg_latency_ms > 5\n| SORT avg_latency_ms DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "bdb", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise database high latency\n\n### What fired\nAverage `bdb_avg_latency` > 5 ms during the window.\n\n### Why it matters\nSustained latency above 5 ms indicates memory pressure, persistence (fork) activity, hot shards, or CPU contention.\n\n### Triage\n1. Look at shard CPU and fragmentation.\n2. 
Check persistence jobs and fork durations.\n\n### Remediation\n- Rebalance shards across nodes.\n- Tune persistence settings.\n\n### Tuning\n- 5 ms threshold; tune to SLA.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_avg_latency IS NOT NULL\n| STATS avg_latency_s = AVG(bdb_avg_latency) BY bdb, cluster\n| EVAL avg_latency_ms = ROUND(avg_latency_s * 1000, 2)\n// Threshold: 5ms average latency \u2014 adjust based on your SLA\n// Redis should deliver sub-millisecond responses; sustained >5ms warrants investigation\n// Common causes: memory pressure, fork operations (AOF/RDB), hot shards\n| WHERE avg_latency_ms > 5\n| SORT avg_latency_ms DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "bdb", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-memory-critical.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-memory-critical.json index fd30be2958f..a8cad6aef07 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-memory-critical.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-memory-critical.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-database-memory-critical", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Database memory utilization critical", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": "redisenterprise_otel-database-memory-critical", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when database memory utilization 
exceeds 90%. OOM rejections or evictions imminent.", + "name": "[Redis Enterprise OTel] Database memory utilization critical", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-database-performance" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_used_memory IS NOT NULL\n AND bdb_memory_limit IS NOT NULL\n AND bdb_memory_limit > 0\n| STATS\n avg_used = AVG(bdb_used_memory),\n avg_limit = AVG(bdb_memory_limit)\n BY bdb, cluster\n| EVAL utilization_pct = ROUND(avg_used / avg_limit * 100, 2)\n// Alert when memory utilization exceeds 90%\n// For non-caching workloads, exceeding the limit causes OOM rejections\n// For caching workloads with eviction policies, higher utilization is expected —\n// consider raising this threshold or disabling for cache-only databases\n| WHERE utilization_pct > 90\n| SORT utilization_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "bdb", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise database memory utilization critical\n\n### What fired\n`bdb_used_memory / bdb_memory_limit > 90%`.\n\n### Why it matters\nNon-caching workloads reject writes at the limit. Caching workloads evict aggressively above this point.\n\n### Triage\n1. 
Check eviction rate and data model.\n\n### Remediation\n- Raise `memory_limit` or shard out.\n\n### Tuning\n- 90% threshold; tune per workload.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_used_memory IS NOT NULL\n AND bdb_memory_limit IS NOT NULL\n AND bdb_memory_limit > 0\n| STATS\n avg_used = AVG(bdb_used_memory),\n avg_limit = AVG(bdb_memory_limit)\n BY bdb, cluster\n| EVAL utilization_pct = ROUND(avg_used / avg_limit * 100, 2)\n// Alert when memory utilization exceeds 90%\n// For non-caching workloads, exceeding the limit causes OOM rejections\n// For caching workloads with eviction policies, higher utilization is expected \u2014\n// consider raising this threshold or disabling for cache-only databases\n| WHERE utilization_pct > 90\n| SORT utilization_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "bdb", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-unexpected-evictions.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-unexpected-evictions.json index 8cbf92066dc..c30fe42a187 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-unexpected-evictions.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-database-unexpected-evictions.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-database-unexpected-evictions", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Database unexpected key evictions", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": 
"redisenterprise_otel-database-unexpected-evictions", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `bdb_evicted_objects > 0` on a database. For non-caching workloads, evictions are data loss.", + "name": "[Redis Enterprise OTel] Database unexpected key evictions", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-database-performance" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_evicted_objects IS NOT NULL\n| STATS max_evictions = MAX(bdb_evicted_objects) BY bdb, cluster\n// Alert when evictions are detected (peak eviction rate > 0)\n// For non-caching workloads, evictions indicate memory pressure and data loss risk\n// For caching workloads with eviction policies, this is expected — disable this rule\n// or raise the threshold to filter out normal eviction activity\n| WHERE max_evictions > 0\n| SORT max_evictions DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "bdb", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise database unexpected key evictions\n\n### What fired\nPeak `bdb_evicted_objects` exceeded 0 during the window.\n\n### Why it matters\nEvictions may be expected for caches but not for data stores. In the latter case they mean silent data loss.\n\n### Triage\n1. Determine the eviction policy applied.\n2. 
Check memory utilisation trend.\n\n### Remediation\n- Raise memory / change policy.\n\n### Tuning\n- Disable this rule for pure caching workloads.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE bdb_evicted_objects IS NOT NULL\n| STATS max_evictions = MAX(bdb_evicted_objects) BY bdb, cluster\n// Alert when evictions are detected (peak eviction rate > 0)\n// For non-caching workloads, evictions indicate memory pressure and data loss risk\n// For caching workloads with eviction policies, this is expected \u2014 disable this rule\n// or raise the threshold to filter out normal eviction activity\n| WHERE max_evictions > 0\n| SORT max_evictions DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "bdb", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-listener-auth-errors.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-listener-auth-errors.json index 539fb65f154..4e7945a2183 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-listener-auth-errors.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-listener-auth-errors.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-listener-auth-errors", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Authentication failures detected", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": "redisenterprise_otel-listener-auth-errors", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `listener_auth_errors > 0`. 
This indicates authentication failures on a listener endpoint.",
+    "name": "[Redis Enterprise OTel] Authentication failures detected",
+    "ruleTypeId": ".es-query",
+    "tags": [
+      "observability",
+      "redisenterprise"
+    ],
+    "schedule": {
+      "interval": "5m"
+    },
+    "alertDelay": {
+      "active": 2
+    },
+    "flapping": {
+      "lookBackWindow": 10,
+      "statusChangeThreshold": 4
+    },
+    "artifacts": {
+      "dashboards": [
+        {
+          "id": "redisenterprise_otel-proxy-listener"
         },
-    "alertDelay": {
-      "active": 2
-    },
-    "flapping": {
-      "lookBackWindow": 10,
-      "statusChangeThreshold": 4
-    },
-    "params": {
-      "searchType": "esqlQuery",
-      "esqlQuery": {
-        "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE listener_auth_errors IS NOT NULL\n| STATS max_auth_errors = MAX(listener_auth_errors) BY bdb, endpoint, node, cluster\n// Alert when authentication failures are detected at any listener endpoint\n// Possible causes: credential rotation issues, misconfigured clients,\n// or unauthorized access attempts\n// Adjust threshold if your environment has expected auth retries\n| WHERE max_auth_errors > 0\n| SORT max_auth_errors DESC"
-      },
-      "size": 0,
-      "threshold": [
-        0
-      ],
-      "thresholdComparator": ">",
-      "timeField": "@timestamp",
-      "timeWindowSize": 15,
-      "timeWindowUnit": "m",
-      "groupBy": "row",
-      "termField": "bdb",
-      "termSize": 100
+      {
+        "id": "redisenterprise_otel-overview"
       }
+    ],
+    "investigation_guide": {
+      "blob": "## Redis Enterprise authentication failures detected\n\n### What fired\nAuth error peaks > 0 on a listener during the window.\n\n### Why it matters\nAuth errors indicate credential rotations, misconfigured clients, or unauthorized access attempts.\n\n### Triage\n1. Cross-reference recent credential changes.\n2. 
Inspect source IP patterns for attacks.\n\n### Remediation\n- Fix client credentials / reset secrets.\n\n### Tuning\n- Fires on any non-zero peak.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE listener_auth_errors IS NOT NULL\n| STATS max_auth_errors = MAX(listener_auth_errors) BY bdb, endpoint, node, cluster\n// Alert when authentication failures are detected at any listener endpoint\n// Possible causes: credential rotation issues, misconfigured clients,\n// or unauthorized access attempts\n// Adjust threshold if your environment has expected auth retries\n| WHERE max_auth_errors > 0\n| SORT max_auth_errors DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "bdb", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-down.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-down.json index f862f454c26..59c1982afb3 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-down.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-down.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-node-down", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Node down", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "1m" + "id": "redisenterprise_otel-node-down", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `node_up < 1`. 
A down node triggers automatic shard failover.",
+    "name": "[Redis Enterprise OTel] Node down",
+    "ruleTypeId": ".es-query",
+    "tags": [
+      "observability",
+      "redisenterprise"
+    ],
+    "schedule": {
+      "interval": "1m"
+    },
+    "alertDelay": {
+      "active": 3
+    },
+    "flapping": {
+      "lookBackWindow": 10,
+      "statusChangeThreshold": 4
+    },
+    "artifacts": {
+      "dashboards": [
+        {
+          "id": "redisenterprise_otel-node-health"
        },
-    "alertDelay": {
-      "active": 3
-    },
-    "flapping": {
-      "lookBackWindow": 10,
-      "statusChangeThreshold": 4
-    },
-    "params": {
-      "searchType": "esqlQuery",
-      "esqlQuery": {
-        "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_up IS NOT NULL\n| STATS min_up = MIN(node_up) BY node, cluster\n// Alert when any node reports as down (node_up < 1)\n// A down node triggers automatic failover of its shards to replicas on other nodes\n// Investigate host health, network connectivity, and management service status\n| WHERE min_up < 1"
-      },
-      "size": 0,
-      "threshold": [
-        0
-      ],
-      "thresholdComparator": ">",
-      "timeField": "@timestamp",
-      "timeWindowSize": 5,
-      "timeWindowUnit": "m",
-      "groupBy": "row",
-      "termField": "node",
-      "termSize": 100
+      {
+        "id": "redisenterprise_otel-overview"
      }
+    ],
+    "investigation_guide": {
+      "blob": "## Redis Enterprise node down\n\n### What fired\n`node_up < 1` during the window.\n\n### Why it matters\nA down node reduces cluster capacity and triggers failover for its shards. Extended downtime risks availability and durability.\n\n### Triage\n1. 
Check host health and management service.\n\n### Remediation\n- Recover or replace the node.\n\n### Tuning\n- Always P1.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_up IS NOT NULL\n| STATS min_up = MIN(node_up) BY node, cluster\n// Alert when any node reports as down (node_up < 1)\n// A down node triggers automatic failover of its shards to replicas on other nodes\n// Investigate host health, network connectivity, and management service status\n| WHERE min_up < 1" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "node", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-high-cpu.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-high-cpu.json index de46340eb75..94db8f127a1 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-high-cpu.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-high-cpu.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-node-high-cpu", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Node CPU saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": "redisenterprise_otel-node-high-cpu", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when node CPU utilisation exceeds 85%. 
Saturated hosts cause cluster-wide performance loss.", + "name": "[Redis Enterprise OTel] Node CPU saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-node-health" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_cpu_user IS NOT NULL\n AND node_cpu_system IS NOT NULL\n AND node_cpu_idle IS NOT NULL\n| STATS\n avg_user = AVG(node_cpu_user),\n avg_system = AVG(node_cpu_system),\n avg_idle = AVG(node_cpu_idle)\n BY node, cluster\n| EVAL total = avg_user + avg_system + avg_idle\n| EVAL cpu_util_pct = ROUND((avg_user + avg_system) / total * 100, 2)\n// Alert when CPU utilization exceeds 85%\n// CPU is computed as (user + system) / (user + system + idle)\n// Sustained high CPU may indicate the node needs scaling or workload rebalancing\n// Adjust threshold based on your capacity planning targets\n| WHERE total > 0 AND cpu_util_pct > 85\n| SORT cpu_util_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "node", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise node CPU saturation\n\n### What fired\nNode CPU utilisation exceeded 85% during the window.\n\n### Why it matters\nSaturated node CPU slows every shard on that host and imperils persistence jobs.\n\n### Triage\n1. Inspect per-shard CPU usage.\n2. 
Check for background tasks (AOF rewrite, RDB save).\n\n### Remediation\n- Spread shards to other nodes.\n\n### Tuning\n- 85% threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_cpu_user IS NOT NULL\n AND node_cpu_system IS NOT NULL\n AND node_cpu_idle IS NOT NULL\n| STATS\n avg_user = AVG(node_cpu_user),\n avg_system = AVG(node_cpu_system),\n avg_idle = AVG(node_cpu_idle)\n BY node, cluster\n| EVAL total = avg_user + avg_system + avg_idle\n| EVAL cpu_util_pct = ROUND((avg_user + avg_system) / total * 100, 2)\n// Alert when CPU utilization exceeds 85%\n// CPU is computed as (user + system) / (user + system + idle)\n// Sustained high CPU may indicate the node needs scaling or workload rebalancing\n// Adjust threshold based on your capacity planning targets\n| WHERE total > 0 AND cpu_util_pct > 85\n| SORT cpu_util_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "node", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-low-memory.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-low-memory.json index 3c6803280dd..08928d5d25f 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-low-memory.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-low-memory.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-node-low-memory", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Node free memory low", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": "redisenterprise_otel-node-low-memory", + 
"type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when node free memory drops below 1 GB. Low memory risks OOM kills and fork failures.", + "name": "[Redis Enterprise OTel] Node free memory low", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-node-health" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_free_memory IS NOT NULL\n| STATS avg_free = AVG(node_free_memory) BY node, cluster\n| EVAL free_gb = ROUND(avg_free / 1073741824, 2)\n// Alert when free memory drops below 1 GB\n// Low node memory can cause OOM kills, swap thrashing, and fork failures\n// Adjust threshold based on your node's total RAM and provisioning model\n// Nodes with overbooking enabled may legitimately run with lower free memory\n| WHERE free_gb < 1\n| SORT free_gb ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "node", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise node free memory low\n\n### What fired\nNode free memory dropped below 1 GB during the window.\n\n### Why it matters\nLow free memory prevents persistence forks (copy-on-write needs headroom) and can lead to OOM kills.\n\n### Triage\n1. Check shard memory usage.\n2. 
Inspect OS-level memory pressure.\n\n### Remediation\n- Rebalance shards.\n- Grow node memory.\n\n### Tuning\n- 1 GB threshold; tune to node size.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_free_memory IS NOT NULL\n| STATS avg_free = AVG(node_free_memory) BY node, cluster\n| EVAL free_gb = ROUND(avg_free / 1073741824, 2)\n// Alert when free memory drops below 1 GB\n// Low node memory can cause OOM kills, swap thrashing, and fork failures\n// Adjust threshold based on your node's total RAM and provisioning model\n// Nodes with overbooking enabled may legitimately run with lower free memory\n| WHERE free_gb < 1\n| SORT free_gb ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "node", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-storage-low.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-storage-low.json index f66ca2f73bc..604e0be638f 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-storage-low.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-node-storage-low.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-node-storage-low", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Node storage running low", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "15m" + "id": "redisenterprise_otel-node-storage-low", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when persistent or ephemeral storage free drops below 5 GB. 
Persistence / logs will fail.", + "name": "[Redis Enterprise OTel] Node storage running low", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "15m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-node-health" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_persistent_storage_avail IS NOT NULL\n OR node_ephemeral_storage_avail IS NOT NULL\n| STATS\n avg_persistent = AVG(node_persistent_storage_avail),\n avg_ephemeral = AVG(node_ephemeral_storage_avail)\n BY node, cluster\n| EVAL persistent_gb = ROUND(avg_persistent / 1073741824, 2)\n| EVAL ephemeral_gb = ROUND(avg_ephemeral / 1073741824, 2)\n// Alert when available storage drops below 5 GB on either volume\n// Persistent storage holds AOF/RDB snapshots; exhaustion prevents persistence\n// Ephemeral storage holds temp files and logs; exhaustion can crash processes\n// Adjust thresholds based on your node's total disk capacity\n| WHERE (persistent_gb IS NOT NULL AND persistent_gb < 5)\n OR (ephemeral_gb IS NOT NULL AND ephemeral_gb < 5)\n| SORT persistent_gb ASC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "node", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise node storage running low\n\n### What fired\nPersistent or ephemeral storage free dropped below 5 GB during the window.\n\n### Why it matters\nPersistent storage holds RDB/AOF; exhaustion breaks durability. 
Ephemeral storage holds temp files and logs.\n\n### Triage\n1. Inspect which volume is saturated.\n\n### Remediation\n- Grow the volume / archive old data.\n\n### Tuning\n- 5 GB threshold; tune to node capacity.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE node_persistent_storage_avail IS NOT NULL\n OR node_ephemeral_storage_avail IS NOT NULL\n| STATS\n avg_persistent = AVG(node_persistent_storage_avail),\n avg_ephemeral = AVG(node_ephemeral_storage_avail)\n BY node, cluster\n| EVAL persistent_gb = ROUND(avg_persistent / 1073741824, 2)\n| EVAL ephemeral_gb = ROUND(avg_ephemeral / 1073741824, 2)\n// Alert when available storage drops below 5 GB on either volume\n// Persistent storage holds AOF/RDB snapshots; exhaustion prevents persistence\n// Ephemeral storage holds temp files and logs; exhaustion can crash processes\n// Adjust thresholds based on your node's total disk capacity\n| WHERE (persistent_gb IS NOT NULL AND persistent_gb < 5)\n OR (ephemeral_gb IS NOT NULL AND ephemeral_gb < 5)\n| SORT persistent_gb ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "node", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-proxy-fd-saturation.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-proxy-fd-saturation.json index f5227a49aa4..d9933c69006 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-proxy-fd-saturation.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-proxy-fd-saturation.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-proxy-fd-saturation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis 
Enterprise OTel] Proxy file descriptor saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": "redisenterprise_otel-proxy-fd-saturation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when DMC proxy FD utilisation exceeds 80%. FD exhaustion rejects new client connections.", + "name": "[Redis Enterprise OTel] Proxy file descriptor saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-proxy-listener" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE dmcproxy_process_open_fds IS NOT NULL\n AND dmcproxy_process_max_fds IS NOT NULL\n AND dmcproxy_process_max_fds > 0\n| STATS\n avg_open = AVG(dmcproxy_process_open_fds),\n avg_max = AVG(dmcproxy_process_max_fds)\n BY proxy, node, cluster\n| EVAL fd_util_pct = ROUND(avg_open / avg_max * 100, 2)\n// Alert when file descriptor utilization exceeds 80%\n// The DMC proxy uses one FD per client connection; exhaustion rejects new connections\n// Investigate connection leaks, increase system ulimits, or add connection pooling\n| WHERE fd_util_pct > 80\n| SORT fd_util_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "proxy", - "termSize": 100 + { + "id": "redisenterprise_otel-node-health" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise proxy file descriptor saturation\n\n### What 
fired\n`dmcproxy_process_open_fds / dmcproxy_process_max_fds > 80%`.\n\n### Why it matters\nThe DMC proxy uses one FD per client connection. FD exhaustion causes silent connection rejections and client errors.\n\n### Triage\n1. Check for connection leaks / lack of pooling.\n\n### Remediation\n- Raise FD limits.\n- Fix pooling.\n\n### Tuning\n- 80% threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE dmcproxy_process_open_fds IS NOT NULL\n AND dmcproxy_process_max_fds IS NOT NULL\n AND dmcproxy_process_max_fds > 0\n| STATS\n avg_open = AVG(dmcproxy_process_open_fds),\n avg_max = AVG(dmcproxy_process_max_fds)\n BY proxy, node, cluster\n| EVAL fd_util_pct = ROUND(avg_open / avg_max * 100, 2)\n// Alert when file descriptor utilization exceeds 80%\n// The DMC proxy uses one FD per client connection; exhaustion rejects new connections\n// Investigate connection leaks, increase system ulimits, or add connection pooling\n| WHERE fd_util_pct > 80\n| SORT fd_util_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "proxy", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-connection-saturation.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-connection-saturation.json index 349a7f27c12..f279f4b28a6 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-connection-saturation.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-connection-saturation.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-shard-connection-saturation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise 
OTel] Shard connection saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": "redisenterprise_otel-shard-connection-saturation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when shard connection utilisation exceeds 80% of `maxclients`. Rejections imminent.", + "name": "[Redis Enterprise OTel] Shard connection saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-shard-diagnostics" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_connected_clients IS NOT NULL\n AND redis_maxclients IS NOT NULL\n AND redis_maxclients > 0\n| STATS\n avg_clients = AVG(redis_connected_clients),\n avg_max = AVG(redis_maxclients)\n BY redis, bdb, node, cluster\n| EVAL conn_util_pct = ROUND(avg_clients / avg_max * 100, 2)\n// Alert when connection utilization exceeds 80%\n// Reaching maxclients causes new connections to be rejected\n// Investigate client connection pooling or increase maxclients\n| WHERE conn_util_pct > 80\n| SORT conn_util_pct DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 10, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "redis", - "termSize": 100 + { + "id": "redisenterprise_otel-proxy-listener" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise shard connection saturation\n\n### What fired\n`redis_connected_clients / redis_maxclients > 80%`.\n\n### Why it matters\nAbove 80%, 
bursts quickly exhaust the connection budget and rejections begin.\n\n### Triage\n1. Check client pool configuration.\n\n### Remediation\n- Raise `maxclients` or add shards.\n\n### Tuning\n- 80% threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_connected_clients IS NOT NULL\n AND redis_maxclients IS NOT NULL\n AND redis_maxclients > 0\n| STATS\n avg_clients = AVG(redis_connected_clients),\n avg_max = AVG(redis_maxclients)\n BY redis, bdb, node, cluster\n| EVAL conn_util_pct = ROUND(avg_clients / avg_max * 100, 2)\n// Alert when connection utilization exceeds 80%\n// Reaching maxclients causes new connections to be rejected\n// Investigate client connection pooling or increase maxclients\n| WHERE conn_util_pct > 80\n| SORT conn_util_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 10, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "redis", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-down.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-down.json index 926b372f66d..3d8141f8a5e 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-down.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-down.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-shard-down", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Shard down", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "1m" + "id": "redisenterprise_otel-shard-down", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `redis_up < 1`. 
Shard down triggers replica failover or reduces redundancy.", + "name": "[Redis Enterprise OTel] Shard down", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-shard-diagnostics" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_up IS NOT NULL\n| STATS min_up = MIN(redis_up) BY redis, bdb, node, role, cluster\n// Alert when any shard reports as down (redis_up < 1)\n// A primary shard going down triggers automatic failover to its replica\n// A replica going down reduces redundancy — the database continues but is less resilient\n| WHERE min_up < 1" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "redis", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise shard down\n\n### What fired\n`redis_up < 1` on a shard during the window.\n\n### Why it matters\nA primary shard going down triggers failover; a replica going down reduces redundancy.\n\n### Triage\n1. 
Identify the role and node of the down shard.\n\n### Remediation\n- Recover shard or migrate to healthy node.\n\n### Tuning\n- Always P1 for primary shards.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_up IS NOT NULL\n| STATS min_up = MIN(redis_up) BY redis, bdb, node, role, cluster\n// Alert when any shard reports as down (redis_up < 1)\n// A primary shard going down triggers automatic failover to its replica\n// A replica going down reduces redundancy \u2014 the database continues but is less resilient\n| WHERE min_up < 1" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "redis", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-high-memory-fragmentation.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-high-memory-fragmentation.json index 3cc0f0679d6..1749df43815 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-high-memory-fragmentation.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-high-memory-fragmentation.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-shard-high-memory-fragmentation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Shard high memory fragmentation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "15m" + "id": "redisenterprise_otel-shard-high-memory-fragmentation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `redis_mem_fragmentation_ratio > 1.5` on shards > 10 MB. 
High fragmentation wastes memory.", + "name": "[Redis Enterprise OTel] Shard high memory fragmentation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "15m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-shard-diagnostics" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_mem_fragmentation_ratio IS NOT NULL\n AND redis_used_memory IS NOT NULL\n| STATS\n avg_frag = AVG(redis_mem_fragmentation_ratio),\n avg_used = AVG(redis_used_memory)\n BY redis, bdb, node, cluster\n// Only evaluate shards with meaningful memory usage (>10 MB)\n// Small shards naturally exhibit high fragmentation ratios\n| WHERE avg_used > 10485760\n// Alert when fragmentation ratio exceeds 1.5\n// ratio = RSS / used_memory; values >> 1.0 indicate wasted memory\n// Consider enabling active defragmentation or scheduling a restart\n AND avg_frag > 1.5\n| SORT avg_frag DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 30, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "redis", - "termSize": 100 + { + "id": "redisenterprise_otel-database-performance" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise shard high memory fragmentation\n\n### What fired\nAverage fragmentation ratio > 1.5 on a shard with > 10 MB used.\n\n### Why it matters\nFragmentation ratio > 1.5 wastes memory. Chronic fragmentation may require a shard restart (memory compaction) to reclaim.\n\n### Triage\n1. 
Check shard growth and deletion patterns.\n\n### Remediation\n- Use `MEMORY PURGE` or restart the shard.\n\n### Tuning\n- 1.5 threshold; skip small shards.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_mem_fragmentation_ratio IS NOT NULL\n AND redis_used_memory IS NOT NULL\n| STATS\n avg_frag = AVG(redis_mem_fragmentation_ratio),\n avg_used = AVG(redis_used_memory)\n BY redis, bdb, node, cluster\n// Only evaluate shards with meaningful memory usage (>10 MB)\n// Small shards naturally exhibit high fragmentation ratios\n| WHERE avg_used > 10485760\n// Alert when fragmentation ratio exceeds 1.5\n// ratio = RSS / used_memory; values >> 1.0 indicate wasted memory\n// Consider enabling active defragmentation or scheduling a restart\n AND avg_frag > 1.5\n| SORT avg_frag DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 30, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "redis", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-persistence-failures.json b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-persistence-failures.json index 0d36cd7371f..85eb46efe84 100644 --- a/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-persistence-failures.json +++ b/packages/redisenterprise_otel/kibana/alerting_rule_template/redisenterprise_otel-shard-persistence-failures.json @@ -1,40 +1,54 @@ { - "id": "redisenterprise_otel-shard-persistence-failures", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Redis Enterprise OTel] Shard persistence failures", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "redisenterprise" - ], - "schedule": { - "interval": "5m" + "id": 
"redisenterprise_otel-shard-persistence-failures", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when consecutive RDB / AOF rewrite failures are detected. Durability is at risk.", + "name": "[Redis Enterprise OTel] Shard persistence failures", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "redisenterprise" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "redisenterprise_otel-shard-diagnostics" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_rdb_saves_consecutive_failures IS NOT NULL\n OR redis_aof_rewrites_consecutive_failures IS NOT NULL\n| STATS\n max_rdb_failures = MAX(redis_rdb_saves_consecutive_failures),\n max_aof_failures = MAX(redis_aof_rewrites_consecutive_failures)\n BY redis, bdb, node, cluster\n// Alert when consecutive RDB or AOF persistence failures are detected\n// Persistence failures risk data loss during shard restarts or failovers\n// Investigate disk space, I/O throughput, and memory for copy-on-write overhead\n| WHERE COALESCE(max_rdb_failures, 0) > 0\n OR COALESCE(max_aof_failures, 0) > 0\n| SORT COALESCE(max_rdb_failures, 0) DESC" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "redis", - "termSize": 100 + { + "id": "redisenterprise_otel-overview" } + ], + "investigation_guide": { + "blob": "## Redis Enterprise shard persistence failures\n\n### What fired\nMax of RDB or AOF consecutive failures > 0.\n\n### Why it matters\nPersistence failures risk data loss during shard restarts or failovers. 
Usually caused by disk space, IOPS, or COW overhead.\n\n### Triage\n1. Check disk space and I/O on the node.\n2. Inspect fork memory headroom.\n\n### Remediation\n- Free disk or grow node memory.\n\n### Tuning\n- Fires on any non-zero failure count.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-redisenterprise.otel-default\n| WHERE redis_rdb_saves_consecutive_failures IS NOT NULL\n OR redis_aof_rewrites_consecutive_failures IS NOT NULL\n| STATS\n max_rdb_failures = MAX(redis_rdb_saves_consecutive_failures),\n max_aof_failures = MAX(redis_aof_rewrites_consecutive_failures)\n BY redis, bdb, node, cluster\n// Alert when consecutive RDB or AOF persistence failures are detected\n// Persistence failures risk data loss during shard restarts or failovers\n// Investigate disk space, I/O throughput, and memory for copy-on-write overhead\n| WHERE COALESCE(max_rdb_failures, 0) > 0\n OR COALESCE(max_aof_failures, 0) > 0\n| SORT COALESCE(max_rdb_failures, 0) DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "redis", + "termSize": 100 } + } } diff --git a/packages/redisenterprise_otel/manifest.yml b/packages/redisenterprise_otel/manifest.yml index 89ff4281281..ad632a4aef0 100644 --- a/packages/redisenterprise_otel/manifest.yml +++ b/packages/redisenterprise_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: redisenterprise_otel title: "Redis Enterprise OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Redis Enterprise Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/traefik_otel/changelog.yml b/packages/traefik_otel/changelog.yml index 7cf782fcc6c..e8ca3ffc660 100644 --- 
a/packages/traefik_otel/changelog.yml +++ b/packages/traefik_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the package diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-file-descriptor-pressure.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-file-descriptor-pressure.json index f1efdf1a0df..8c36513f720 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-file-descriptor-pressure.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-file-descriptor-pressure.json @@ -1,38 +1,54 @@ { - "id": "traefik_otel-file-descriptor-pressure", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] File descriptor pressure", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik", - "saturation" - ], - "schedule": { - "interval": "5m" + "id": "traefik_otel-file-descriptor-pressure", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `process_open_fds / process_max_fds > 80%`. 
FD exhaustion prevents new connections.", + "name": "[Traefik OTel] File descriptor pressure", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik", + "saturation" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-process" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to process FD metrics\n| WHERE process_open_fds IS NOT NULL AND process_max_fds IS NOT NULL\n// Take latest values (gauges)\n| STATS open_fds = MAX(process_open_fds), max_fds = MAX(process_max_fds)\n// Compute utilisation; alert when > 80% (tune as needed)\n| EVAL fd_pct = ROUND(open_fds / max_fds * 100.0, 2)\n| WHERE fd_pct > 80.0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "all", - "excludeHitsFromPreviousRun": false + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik file descriptor pressure\n\n### What fired\nFD utilisation exceeded 80% during the window.\n\n### Why it matters\nTraefik uses FDs for every inbound and upstream connection. Exhaustion causes new connections to fail.\n\n### Triage\n1. 
Check concurrent connections and backends.\n\n### Remediation\n- Raise OS FD limits.\n\n### Tuning\n- 80% threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to process FD metrics\n| WHERE process_open_fds IS NOT NULL AND process_max_fds IS NOT NULL\n// Take latest values (gauges)\n| STATS open_fds = MAX(process_open_fds), max_fds = MAX(process_max_fds)\n// Compute utilisation; alert when > 80% (tune as needed)\n| EVAL fd_pct = ROUND(open_fds / max_fds * 100.0, 2)\n| WHERE fd_pct > 80.0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "all", + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-entrypoint.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-entrypoint.json index 134d153d134..c3228fcbce6 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-entrypoint.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-entrypoint.json @@ -1,39 +1,55 @@ { - "id": "traefik_otel-high-5xx-rate-by-entrypoint", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] High 5xx error rate by entrypoint", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik" - ], - "schedule": { - "interval": "5m" + "id": "traefik_otel-high-5xx-rate-by-entrypoint", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the 5xx error rate on a Traefik entrypoint exceeds threshold. 
Edge-level server errors.", + "name": "[Traefik OTel] High 5xx error rate by entrypoint", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-overview" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to entrypoint-level request counters (counter requires TS + INCREASE)\n| WHERE traefik_entrypoint_requests_total IS NOT NULL\n// Aggregate total and 5xx requests per entrypoint over the time window\n| STATS\n total = SUM(traefik_entrypoint_requests_total),\n errors_5xx = SUM(traefik_entrypoint_requests_total) WHERE attributes.code LIKE \"5*\"\n BY attributes.entrypoint\n// Minimum sample size; adjust based on expected traffic per entrypoint\n| WHERE total > 100\n| EVAL error_rate_pct = ROUND(errors_5xx / total * 100.0, 2)\n// Alert threshold: tune for sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.entrypoint", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "traefik_otel-services" } + ], + "investigation_guide": { + "blob": "## Traefik high 5xx error rate by entrypoint\n\n### What fired\nPer-entrypoint 5xx rate exceeded threshold (min 100 total requests).\n\n### Why it matters\nEntrypoint-level 5xx rates reflect edge-level failures, including both upstream problems and Traefik-internal errors.\n\n### Triage\n1. Split to service/router-level 5xx rules.\n2. 
Check upstream service health.\n\n### Remediation\n- Roll back / fix the failing upstream.\n\n### Tuning\n- Tune threshold to SLA.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to entrypoint-level request counters (counter requires TS + INCREASE)\n| WHERE traefik_entrypoint_requests_total IS NOT NULL\n// Aggregate total and 5xx requests per entrypoint over the time window\n| STATS\n total = SUM(traefik_entrypoint_requests_total),\n errors_5xx = SUM(traefik_entrypoint_requests_total) WHERE attributes.code LIKE \"5*\"\n BY attributes.entrypoint\n// Minimum sample size; adjust based on expected traffic per entrypoint\n| WHERE total > 100\n| EVAL error_rate_pct = ROUND(errors_5xx / total * 100.0, 2)\n// Alert threshold: tune for sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.entrypoint", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-router.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-router.json index cf6725d7d0b..d9580b72a52 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-router.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-router.json @@ -1,39 +1,55 @@ { - "id": "traefik_otel-high-5xx-rate-by-router", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] High 5xx error rate by router", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik" - ], - "schedule": { - "interval": "5m" + "id": "traefik_otel-high-5xx-rate-by-router", + "type": 
"alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when 5xx rate on a specific Traefik router exceeds threshold. Localises failures to a routing rule.", + "name": "[Traefik OTel] High 5xx error rate by router", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-services" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to router-level request counters (counter requires TS + INCREASE)\n| WHERE traefik_router_requests_total IS NOT NULL\n// Aggregate total and 5xx requests per router over the time window\n| STATS\n total = SUM(traefik_router_requests_total),\n errors_5xx = SUM(traefik_router_requests_total) WHERE attributes.code LIKE \"5*\"\n BY attributes.router\n// Minimum sample size; adjust based on expected traffic per router\n| WHERE total > 50\n| EVAL error_rate_pct = ROUND(errors_5xx / total * 100.0, 2)\n// Alert threshold: tune for sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.router", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik high 5xx error rate by router\n\n### What fired\nPer-router 5xx rate exceeded threshold (min 50 total requests).\n\n### Why it matters\nRouter-level analysis isolates failures to specific host / path rules. 
Usually points at a misbehaving upstream service.\n\n### Triage\n1. Check the matched service.\n\n### Remediation\n- Fix the upstream.\n\n### Tuning\n- Tune threshold per router criticality.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to router-level request counters (counter requires TS + INCREASE)\n| WHERE traefik_router_requests_total IS NOT NULL\n// Aggregate total and 5xx requests per router over the time window\n| STATS\n total = SUM(traefik_router_requests_total),\n errors_5xx = SUM(traefik_router_requests_total) WHERE attributes.code LIKE \"5*\"\n BY attributes.router\n// Minimum sample size; adjust based on expected traffic per router\n| WHERE total > 50\n| EVAL error_rate_pct = ROUND(errors_5xx / total * 100.0, 2)\n// Alert threshold: tune for sensitivity\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.router", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-service.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-service.json index 6e77a9a7d46..4049521959a 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-service.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-5xx-rate-by-service.json @@ -1,39 +1,55 @@ { - "id": "traefik_otel-high-5xx-rate-by-service", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] High 5xx error rate by service", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik" - ], - "schedule": { - "interval": "5m" + "id": 
"traefik_otel-high-5xx-rate-by-service", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when per-service 5xx rate exceeds threshold. Service-level error diagnosis.", + "name": "[Traefik OTel] High 5xx error rate by service", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-services" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to service-level request counters (counter requires TS + INCREASE)\n| WHERE traefik_service_requests_total IS NOT NULL\n// Aggregate total and 5xx requests per service over the time window\n| STATS\n total = SUM(traefik_service_requests_total),\n errors_5xx = SUM(traefik_service_requests_total) WHERE attributes.code LIKE \"5*\"\n BY attributes.service\n// Minimum sample size to avoid noisy low-traffic services; adjust based on expected volume\n| WHERE total > 100\n// Calculate error rate as percentage\n| EVAL error_rate_pct = ROUND(errors_5xx / total * 100.0, 2)\n// Alert threshold: tune for sensitivity (e.g. 
5.0 = 5%)\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "attributes.service", - "termSize": 10, - "excludeHitsFromPreviousRun": true + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik high 5xx error rate by service\n\n### What fired\nPer-service 5xx rate exceeded threshold (min 100 total requests).\n\n### Why it matters\nService-level 5xx indicates problems with the backend instances fronted by that service.\n\n### Triage\n1. Check backend health-checks.\n2. Inspect deployment state.\n\n### Remediation\n- Fix / roll back the backend.\n\n### Tuning\n- Tune threshold per SLA.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to service-level request counters (counter requires TS + INCREASE)\n| WHERE traefik_service_requests_total IS NOT NULL\n// Aggregate total and 5xx requests per service over the time window\n| STATS\n total = SUM(traefik_service_requests_total),\n errors_5xx = SUM(traefik_service_requests_total) WHERE attributes.code LIKE \"5*\"\n BY attributes.service\n// Minimum sample size to avoid noisy low-traffic services; adjust based on expected volume\n| WHERE total > 100\n// Calculate error rate as percentage\n| EVAL error_rate_pct = ROUND(errors_5xx / total * 100.0, 2)\n// Alert threshold: tune for sensitivity (e.g. 
5.0 = 5%)\n| WHERE error_rate_pct > 5.0\n| SORT error_rate_pct DESC\n| LIMIT 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.service", + "termSize": 10, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-goroutine-count.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-goroutine-count.json index 1347b74733d..e201bf7f4b6 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-goroutine-count.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-goroutine-count.json @@ -1,38 +1,54 @@ { - "id": "traefik_otel-high-goroutine-count", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] High goroutine count", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik", - "saturation" - ], - "schedule": { - "interval": "5m" + "id": "traefik_otel-high-goroutine-count", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `go_goroutines > 2000`. 
Possible goroutine leak in Traefik.", + "name": "[Traefik OTel] High goroutine count", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik", + "saturation" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-process" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to Go runtime goroutine gauge\n| WHERE go_goroutines IS NOT NULL\n// Take max goroutine count in window (gauge)\n| STATS goroutines = MAX(go_goroutines)\n// Alert when goroutines exceed threshold; tune for typical baseline (e.g. 500-2000)\n| WHERE goroutines > 2000" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "all", - "excludeHitsFromPreviousRun": false + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik high goroutine count\n\n### What fired\n`go_goroutines > 2000` during the window.\n\n### Why it matters\nTraefik uses one goroutine per active connection plus workers. Elevated counts usually mean connection leaks or a stuck plugin.\n\n### Triage\n1. Sample Go profile from Traefik admin endpoint.\n2. 
Check plugin / middleware behaviour.\n\n### Remediation\n- Restart Traefik.\n- File issue if reproducible.\n\n### Tuning\n- 2000 threshold; tune to baseline (typically 500-2000).\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to Go runtime goroutine gauge\n| WHERE go_goroutines IS NOT NULL\n// Take max goroutine count in window (gauge)\n| STATS goroutines = MAX(go_goroutines)\n// Alert when goroutines exceed threshold; tune for typical baseline (e.g. 500-2000)\n| WHERE goroutines > 2000" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "all", + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-memory-usage.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-memory-usage.json index d7a98076e26..2b11e4bdb0d 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-memory-usage.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-memory-usage.json @@ -1,38 +1,54 @@ { - "id": "traefik_otel-high-memory-usage", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] High memory usage", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik", - "saturation" - ], - "schedule": { - "interval": "5m" + "id": "traefik_otel-high-memory-usage", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when process RSS exceeds 2 GB. 
Traefik is usually memory-light; elevated RSS indicates workload growth or a leak.", + "name": "[Traefik OTel] High memory usage", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik", + "saturation" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-process" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to process resident memory gauge (RSS)\n| WHERE process_resident_memory_bytes IS NOT NULL\n// Take max RSS in window\n| STATS memory_bytes = MAX(process_resident_memory_bytes)\n// Convert to GB; alert when > 2GB (tune based on expected workload)\n| EVAL memory_gb = ROUND(memory_bytes / 1073741824.0, 2)\n| WHERE memory_gb > 2.0" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "all", - "excludeHitsFromPreviousRun": false + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik high memory usage\n\n### What fired\n`process_resident_memory_bytes > 2 GB` during the window.\n\n### Why it matters\nTypical Traefik footprint is 100\u2013500 MB. Multi-GB usage suggests a leak or very large config.\n\n### Triage\n1. 
Check config size and active connection count.\n\n### Remediation\n- Restart / scale Traefik.\n\n### Tuning\n- 2 GB threshold; tune per workload.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to process resident memory gauge (RSS)\n| WHERE process_resident_memory_bytes IS NOT NULL\n// Take max RSS in window\n| STATS memory_bytes = MAX(process_resident_memory_bytes)\n// Convert to GB; alert when > 2GB (tune based on expected workload)\n| EVAL memory_gb = ROUND(memory_bytes / 1073741824.0, 2)\n| WHERE memory_gb > 2.0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "all", + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-open-connections.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-open-connections.json index 57d7c1d40a8..1e568dd20fc 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-open-connections.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-high-open-connections.json @@ -1,38 +1,54 @@ { - "id": "traefik_otel-high-open-connections", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] High open connections", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik", - "saturation" - ], - "schedule": { - "interval": "5m" + "id": "traefik_otel-high-open-connections", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when total open connections exceed 1000. 
High connection count can exhaust FDs and workers.", + "name": "[Traefik OTel] High open connections", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik", + "saturation" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 8, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-process" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 8, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to open connections gauge (per entrypoint, protocol)\n| WHERE traefik_open_connections IS NOT NULL\n// Take latest value per entrypoint/protocol (gauge - last value in window)\n| STATS conns = MAX(traefik_open_connections) BY attributes.entrypoint, attributes.protocol\n// Sum across entrypoints for total, or alert per entrypoint\n| STATS total_conns = SUM(conns)\n// Alert when connections exceed threshold; tune based on expected capacity\n| WHERE total_conns > 1000" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "all", - "excludeHitsFromPreviousRun": false + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik high open connections\n\n### What fired\nTotal `traefik_open_connections > 1000` during the window.\n\n### Why it matters\nConnection counts that grow steadily beyond expected baseline usually mean slow backends or missing keep-alive tuning.\n\n### Triage\n1. 
Check backend latency.\n\n### Remediation\n- Tune idle timeouts.\n\n### Tuning\n- 1000 threshold; tune per capacity.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to open connections gauge (per entrypoint, protocol)\n| WHERE traefik_open_connections IS NOT NULL\n// Take latest value per entrypoint/protocol (gauge - last value in window)\n| STATS conns = MAX(traefik_open_connections) BY attributes.entrypoint, attributes.protocol\n// Sum across entrypoints for total, or alert per entrypoint\n| STATS total_conns = SUM(conns)\n// Alert when connections exceed threshold; tune based on expected capacity\n| WHERE total_conns > 1000" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "all", + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-stale-config-reload.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-stale-config-reload.json index f393578f8dc..38299ac1f4f 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-stale-config-reload.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-stale-config-reload.json @@ -1,38 +1,51 @@ { - "id": "traefik_otel-stale-config-reload", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] Stale config reload success", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik", - "configuration" - ], - "schedule": { - "interval": "15m" - }, - "alertDelay": { - "active": 1 - }, - "flapping": { - "lookBackWindow": 6, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to last successful reload timestamp (gauge, Unix epoch)\n| WHERE traefik_config_last_reload_success 
IS NOT NULL\n// Take latest value\n| STATS last_success_epoch = MAX(traefik_config_last_reload_success)\n// If last success was > 24h ago, config may be stuck or reloads failing\n// Adjust 24 to match expected reload frequency\n| EVAL last_success_dt = TO_DATETIME(last_success_epoch * 1000)\n| EVAL hours_since_reload = DATE_DIFF(\"hours\", last_success_dt, NOW())\n| WHERE hours_since_reload > 24" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 1, - "timeWindowUnit": "h", - "groupBy": "all", - "excludeHitsFromPreviousRun": false + "id": "traefik_otel-stale-config-reload", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the last successful config reload was more than 24 hours ago. Reloads may be stuck or failing.", + "name": "[Traefik OTel] Stale config reload success", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik", + "configuration" + ], + "schedule": { + "interval": "15m" + }, + "alertDelay": { + "active": 1 + }, + "flapping": { + "lookBackWindow": 6, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik stale config reload\n\n### What fired\nLast successful reload was more than 24 hours ago.\n\n### Why it matters\nTraefik should reload promptly on config changes. A stale last-success timestamp indicates reloads are failing silently or the config source is broken.\n\n### Triage\n1. Check Traefik logs for reload errors.\n2. 
Verify the config source (Kubernetes Ingress / Docker labels).\n\n### Remediation\n- Fix the config source / restart Traefik.\n\n### Tuning\n- 24h threshold; tune to expected reload cadence.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to last successful reload timestamp (gauge, Unix epoch)\n| WHERE traefik_config_last_reload_success IS NOT NULL\n// Take latest value\n| STATS last_success_epoch = MAX(traefik_config_last_reload_success)\n// If last success was > 24h ago, config may be stuck or reloads failing\n// Adjust 24 to match expected reload frequency\n| EVAL last_success_dt = TO_DATETIME(last_success_epoch * 1000)\n| EVAL hours_since_reload = DATE_DIFF(\"hours\", last_success_dt, NOW())\n| WHERE hours_since_reload > 24" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 1, + "timeWindowUnit": "h", + "groupBy": "all", + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-tls-cert-expiry-imminent.json b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-tls-cert-expiry-imminent.json index 452f29f34c5..fa9cacba851 100644 --- a/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-tls-cert-expiry-imminent.json +++ b/packages/traefik_otel/kibana/alerting_rule_template/traefik_otel-tls-cert-expiry-imminent.json @@ -1,40 +1,56 @@ { - "id": "traefik_otel-tls-cert-expiry-imminent", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[Traefik OTel] TLS certificate expiry imminent", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "traefik", - "tls" - ], - "schedule": { - "interval": "1h" + "id": "traefik_otel-tls-cert-expiry-imminent", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any TLS cert served by Traefik expires within 30 
days. Expiry breaks client TLS.", + "name": "[Traefik OTel] TLS certificate expiry imminent", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "traefik", + "tls" + ], + "schedule": { + "interval": "1h" + }, + "alertDelay": { + "active": 1 + }, + "flapping": { + "lookBackWindow": 5, + "statusChangeThreshold": 3 + }, + "artifacts": { + "dashboards": [ + { + "id": "traefik_otel-tls-config" }, - "alertDelay": { - "active": 1 - }, - "flapping": { - "lookBackWindow": 5, - "statusChangeThreshold": 3 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-traefik.otel-*\n// Limit to TLS cert expiry timestamps (Unix epoch seconds)\n| WHERE traefik_tls_certs_not_after IS NOT NULL\n// Take latest expiry per cert (cn + serial)\n| STATS expiry_epoch = MAX(traefik_tls_certs_not_after) BY attributes.cn, attributes.serial\n// Convert epoch seconds to datetime; compute days until expiry\n// Adjust 30-day threshold based on renewal lead time\n| EVAL expiry_datetime = TO_DATETIME(expiry_epoch * 1000)\n| EVAL days_until_expiry = DATE_DIFF(\"days\", NOW(), expiry_datetime)\n| WHERE days_until_expiry < 30 AND days_until_expiry >= 0\n| SORT days_until_expiry ASC" - }, - "size": 0, - "threshold": [0], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 1, - "timeWindowUnit": "h", - "groupBy": "row", - "termField": "attributes.cn", - "termSize": 20, - "excludeHitsFromPreviousRun": false + { + "id": "traefik_otel-overview" } + ], + "investigation_guide": { + "blob": "## Traefik TLS certificate expiry imminent\n\n### What fired\nTLS cert expiry less than 30 days for a (cn, serial) pair.\n\n### Why it matters\nExpired TLS certs cause handshake failures on Traefik's entrypoints. Renewal usually automated via ACME.\n\n### Triage\n1. Identify the specific cert.\n2. 
Inspect ACME / external issuer state.\n\n### Remediation\n- Force-renew the cert.\n\n### Tuning\n- 30 days lead; use shorter for short-lived certs.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-traefik.otel-*\n// Limit to TLS cert expiry timestamps (Unix epoch seconds)\n| WHERE traefik_tls_certs_not_after IS NOT NULL\n// Take latest expiry per cert (cn + serial)\n| STATS expiry_epoch = MAX(traefik_tls_certs_not_after) BY attributes.cn, attributes.serial\n// Convert epoch seconds to datetime; compute days until expiry\n// Adjust 30-day threshold based on renewal lead time\n| EVAL expiry_datetime = TO_DATETIME(expiry_epoch * 1000)\n| EVAL days_until_expiry = DATE_DIFF(\"days\", NOW(), expiry_datetime)\n| WHERE days_until_expiry < 30 AND days_until_expiry >= 0\n| SORT days_until_expiry ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 1, + "timeWindowUnit": "h", + "groupBy": "row", + "termField": "attributes.cn", + "termSize": 20, + "excludeHitsFromPreviousRun": false } + } } diff --git a/packages/traefik_otel/manifest.yml b/packages/traefik_otel/manifest.yml index 84e1c270327..afac6d1b467 100644 --- a/packages/traefik_otel/manifest.yml +++ b/packages/traefik_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: traefik_otel title: "Traefik OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "Traefik Assets for OpenTelemetry Collector" @@ -17,7 +17,7 @@ categories: - network conditions: kibana: - version: "^9.2.1" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/vsphere_otel/changelog.yml b/packages/vsphere_otel/changelog.yml index 083bedf81ec..1381e4e533b 100644 --- a/packages/vsphere_otel/changelog.yml +++ b/packages/vsphere_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and 
artifact fields to alerting rule template. + type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.0" changes: - description: Initial draft of the vSphere OpenTelemetry Assets Package diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-datastore-utilization-high.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-datastore-utilization-high.json index 1a867933c25..fc7969887d8 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-datastore-utilization-high.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-datastore-utilization-high.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-datastore-utilization-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] Datastore utilization high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-datastore-utilization-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `vcenter.datastore.disk.utilization > 80%`. Datastore full halts VMs using it.", + "name": "[vSphere OTel] Datastore utilization high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-storage" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// Datastore disk utilization (%): healthy <80%; critical >90% approaching full\n// Adjust threshold (e.g. 
80 or 90) based on provisioning lead time\n| WHERE vcenter.datastore.disk.utilization IS NOT NULL AND vcenter.datastore.disk.utilization > 80\n| STATS utilization_pct = MAX(vcenter.datastore.disk.utilization) BY vcenter.datacenter.name, vcenter.datastore.name\n| SORT utilization_pct DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.datastore.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-overview" } + ], + "investigation_guide": { + "blob": "## vSphere datastore utilization high\n\n### What fired\nDatastore utilization exceeded 80% during the window.\n\n### Why it matters\nFull datastores prevent VM snapshots, migrations, and thin-disk growth. Critical to address before hitting 100%.\n\n### Triage\n1. Identify large VMs / snapshots on the datastore.\n\n### Remediation\n- Storage vMotion VMs to a less utilized datastore.\n- Grow the datastore.\n\n### Tuning\n- 80% threshold; tighten for small datastores.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// Datastore disk utilization (%): healthy <80%; critical >90% approaching full\n// Adjust threshold (e.g. 
80 or 90) based on provisioning lead time\n| WHERE vcenter.datastore.disk.utilization IS NOT NULL AND vcenter.datastore.disk.utilization > 80\n| STATS utilization_pct = MAX(vcenter.datastore.disk.utilization) BY vcenter.datacenter.name, vcenter.datastore.name\n| SORT utilization_pct DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.datastore.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-entity-status-unhealthy.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-entity-status-unhealthy.json index 1ac83d9cdf9..aed6a817368 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-entity-status-unhealthy.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-entity-status-unhealthy.json @@ -1,43 +1,60 @@ { - "id": "vsphere_otel-entity-status-unhealthy", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] Entity status unhealthy", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "availability" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-entity-status-unhealthy", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any datacenter entity (host, VM, cluster) reports yellow or red status.", + "name": "[vSphere OTel] Entity status unhealthy", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "availability" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-overview" }, - "alertDelay": { - "active": 2 + { 
+ "id": "vsphere_otel-hosts" }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// Datacenter-level host, VM, and cluster counts by entity status\n// Yellow or red status indicates problems; gray may indicate disconnected\n// Combine host, VM, and cluster counts with unhealthy status\n| WHERE (vcenter.datacenter.host.count IS NOT NULL OR vcenter.datacenter.vm.count IS NOT NULL OR vcenter.datacenter.cluster.count IS NOT NULL)\n AND attributes.status IN (\"yellow\", \"red\")\n| EVAL entity_type = CASE(vcenter.datacenter.host.count IS NOT NULL, \"host\", vcenter.datacenter.vm.count IS NOT NULL, \"vm\", \"cluster\")\n| EVAL unhealthy_count = COALESCE(vcenter.datacenter.host.count, vcenter.datacenter.vm.count, vcenter.datacenter.cluster.count)\n| WHERE unhealthy_count > 0\n| STATS total_unhealthy = SUM(unhealthy_count) BY vcenter.datacenter.name, attributes.status, entity_type\n| SORT total_unhealthy DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.datacenter.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-vms" } + ], + "investigation_guide": { + "blob": "## vSphere entity status unhealthy\n\n### What fired\nEntities with `status IN (yellow, red)` exceeded 0.\n\n### Why it matters\nvSphere rolls up many underlying health checks into yellow/red status. Yellow is a warning; red is a failure \u2014 always worth investigating.\n\n### Triage\n1. 
Drill into vCenter for the specific alarms.\n\n### Remediation\n- Fix root cause based on alarm text.\n\n### Tuning\n- Any non-green is alerted.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// Datacenter-level host, VM, and cluster counts by entity status\n// Yellow or red status indicates problems; gray may indicate disconnected\n// Combine host, VM, and cluster counts with unhealthy status\n| WHERE (vcenter.datacenter.host.count IS NOT NULL OR vcenter.datacenter.vm.count IS NOT NULL OR vcenter.datacenter.cluster.count IS NOT NULL)\n AND attributes.status IN (\"yellow\", \"red\")\n| EVAL entity_type = CASE(vcenter.datacenter.host.count IS NOT NULL, \"host\", vcenter.datacenter.vm.count IS NOT NULL, \"vm\", \"cluster\")\n| EVAL unhealthy_count = COALESCE(vcenter.datacenter.host.count, vcenter.datacenter.vm.count, vcenter.datacenter.cluster.count)\n| WHERE unhealthy_count > 0\n| STATS total_unhealthy = SUM(unhealthy_count) BY vcenter.datacenter.name, attributes.status, entity_type\n| SORT total_unhealthy DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.datacenter.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-cpu-utilization-high.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-cpu-utilization-high.json index 26e9d185131..2f98cee41e6 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-cpu-utilization-high.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-cpu-utilization-high.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-host-cpu-utilization-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere 
OTel] Host CPU utilization high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "cpu" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-host-cpu-utilization-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when host CPU utilization exceeds 80%. Hosts above 80% cause VM performance degradation.", + "name": "[vSphere OTel] Host CPU utilization high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "cpu" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-hosts" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// Host CPU utilization: healthy <80%; unhealthy >90%\n// Adjust threshold (e.g. 80 or 90) to suit environment\n| WHERE vcenter.host.cpu.utilization IS NOT NULL AND vcenter.host.cpu.utilization > 80\n| STATS utilization_pct = MAX(vcenter.host.cpu.utilization) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name\n| SORT utilization_pct DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-overview" } + ], + "investigation_guide": { + "blob": "## vSphere host CPU utilization high\n\n### What fired\n`vcenter.host.cpu.utilization > 80%` during the window.\n\n### Why it matters\nSaturated ESXi hosts cause CPU-ready time on VMs and performance degradation. 
DRS should normally rebalance, but heavy spikes slip through.\n\n### Triage\n1. Check DRS settings and cluster capacity.\n2. Identify CPU-heavy VMs.\n\n### Remediation\n- Migrate VMs via vMotion.\n- Add hosts.\n\n### Tuning\n- 80% threshold; 90% for more tolerant envs.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// Host CPU utilization: healthy <80%; unhealthy >90%\n// Adjust threshold (e.g. 80 or 90) to suit environment\n| WHERE vcenter.host.cpu.utilization IS NOT NULL AND vcenter.host.cpu.utilization > 80\n| STATS utilization_pct = MAX(vcenter.host.cpu.utilization) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name\n| SORT utilization_pct DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-disk-latency-high.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-disk-latency-high.json index 0a461bd23f0..6283c18e56d 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-disk-latency-high.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-disk-latency-high.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-host-disk-latency-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] Host disk latency high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-host-disk-latency-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when host max disk latency exceeds 15 ms. 
Storage pressure impacts all VMs on the host.", + "name": "[vSphere OTel] Host disk latency high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-storage" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// Host disk latency max (ms): healthy <15ms; unhealthy >30ms\n// Uses max latency per object; use vcenter.host.disk.latency.avg if available for finer granularity\n// Adjust threshold to suit storage tier\n| WHERE vcenter.host.disk.latency.max IS NOT NULL AND vcenter.host.disk.latency.max > 15\n| STATS latency_ms = MAX(vcenter.host.disk.latency.max) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, attributes.object\n| SORT latency_ms DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-hosts" } + ], + "investigation_guide": { + "blob": "## vSphere host disk latency high\n\n### What fired\n`vcenter.host.disk.latency.max > 15 ms` during the window.\n\n### Why it matters\nHost-level disk latency affects every VM on the host. Usually points at storage array pressure or a bad path.\n\n### Triage\n1. Check per-LUN latency.\n2. 
Inspect path health (multipathing).\n\n### Remediation\n- Rebalance storage / fix path.\n\n### Tuning\n- 15 ms threshold; 30 ms for tolerant workloads.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// Host disk latency max (ms): healthy <15ms; unhealthy >30ms\n// Uses max latency per object; use vcenter.host.disk.latency.avg if available for finer granularity\n// Adjust threshold to suit storage tier\n| WHERE vcenter.host.disk.latency.max IS NOT NULL AND vcenter.host.disk.latency.max > 15\n| STATS latency_ms = MAX(vcenter.host.disk.latency.max) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, attributes.object\n| SORT latency_ms DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-memory-utilization-high.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-memory-utilization-high.json index 65d0d99c05a..cf688d9fec9 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-memory-utilization-high.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-memory-utilization-high.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-host-memory-utilization-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] Host memory utilization high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "memory" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-host-memory-utilization-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when host memory utilisation 
exceeds 85%. Above 85% risks ballooning and swapping on VMs.", + "name": "[vSphere OTel] Host memory utilization high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "memory" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-hosts" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// Host memory utilization: healthy <85%; unhealthy >90%\n// Adjust threshold (e.g. 85 or 90) to suit environment\n| WHERE vcenter.host.memory.utilization IS NOT NULL AND vcenter.host.memory.utilization > 85\n| STATS utilization_pct = MAX(vcenter.host.memory.utilization) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name\n| SORT utilization_pct DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.host.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-overview" } + ], + "investigation_guide": { + "blob": "## vSphere host memory utilization high\n\n### What fired\n`vcenter.host.memory.utilization > 85%` during the window.\n\n### Why it matters\nAbove ~85% memory, ESXi starts reclaiming from VMs \u2014 ballooning, compression, eventually swapping. Performance collapses at high utilization.\n\n### Triage\n1. 
Correlate with VM ballooning / swapping metrics.\n\n### Remediation\n- Migrate VMs.\n- Add hosts.\n\n### Tuning\n- 85% threshold; 90% for tolerant environments.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// Host memory utilization: healthy <85%; unhealthy >90%\n// Adjust threshold (e.g. 85 or 90) to suit environment\n| WHERE vcenter.host.memory.utilization IS NOT NULL AND vcenter.host.memory.utilization > 85\n| STATS utilization_pct = MAX(vcenter.host.memory.utilization) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name\n| SORT utilization_pct DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.host.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-power-state-unhealthy.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-power-state-unhealthy.json index 14540411e11..6f17f36e0a5 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-power-state-unhealthy.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-host-power-state-unhealthy.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-host-power-state-unhealthy", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] Host power state unhealthy", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "availability" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-host-power-state-unhealthy", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when datacenter hosts are in `off` or `standby` power state. 
Usually hardware failure or unplanned maintenance.", + "name": "[vSphere OTel] Host power state unhealthy", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "availability" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-hosts" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// Datacenter host count by power_state: off or standby indicates problems\n// May indicate hardware failure or unplanned maintenance\n// Excludes \"on\" and \"unknown\"; adjust if unknown should be included\n| WHERE vcenter.datacenter.host.count IS NOT NULL\n AND attributes.power_state IN (\"off\", \"standby\")\n AND vcenter.datacenter.host.count > 0\n| STATS host_count = MAX(vcenter.datacenter.host.count) BY vcenter.datacenter.name, attributes.power_state\n| SORT host_count DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.datacenter.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-overview" } + ], + "investigation_guide": { + "blob": "## vSphere host power state unhealthy\n\n### What fired\nHost count with `power_state in (off, standby)` exceeded 0.\n\n### Why it matters\nUnplanned host power-off reduces cluster capacity; ongoing standby indicates scheduled maintenance or DPM.\n\n### Triage\n1. Check maintenance windows.\n2. 
Inspect hardware alerts (HW sensors).\n\n### Remediation\n- Recover or replace the host.\n\n### Tuning\n- Fires on any off/standby host.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// Datacenter host count by power_state: off or standby indicates problems\n// May indicate hardware failure or unplanned maintenance\n// Excludes \"on\" and \"unknown\"; adjust if unknown should be included\n| WHERE vcenter.datacenter.host.count IS NOT NULL\n AND attributes.power_state IN (\"off\", \"standby\")\n AND vcenter.datacenter.host.count > 0\n| STATS host_count = MAX(vcenter.datacenter.host.count) BY vcenter.datacenter.name, attributes.power_state\n| SORT host_count DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.datacenter.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-disk-latency-high.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-disk-latency-high.json index 79981d5e05f..a77423fbb2e 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-disk-latency-high.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-disk-latency-high.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-vm-disk-latency-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] VM disk latency high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "storage" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-vm-disk-latency-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when VM max disk latency exceeds 20 ms. 
VM-specific storage pressure.", + "name": "[vSphere OTel] VM disk latency high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "storage" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-storage" }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// VM disk latency max (ms): healthy <20ms; unhealthy >30ms\n// Uses max latency per object; use vcenter.vm.disk.latency.avg if available for finer granularity\n// Adjust threshold to suit workload\n| WHERE vcenter.vm.disk.latency.max IS NOT NULL AND vcenter.vm.disk.latency.max > 20\n| STATS latency_ms = MAX(vcenter.vm.disk.latency.max) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, vcenter.vm.name, attributes.object\n| SORT latency_ms DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.vm.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-vms" } + ], + "investigation_guide": { + "blob": "## vSphere VM disk latency high\n\n### What fired\n`vcenter.vm.disk.latency.max > 20 ms`.\n\n### Why it matters\nVM-specific disk latency points at the particular VM's I/O profile, its VMDK placement, or a storage contention with other VMs on the same datastore.\n\n### Triage\n1. Inspect per-object latency.\n2. 
Check datastore saturation.\n\n### Remediation\n- Storage vMotion to a less busy datastore.\n\n### Tuning\n- 20 ms threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// VM disk latency max (ms): healthy <20ms; unhealthy >30ms\n// Uses max latency per object; use vcenter.vm.disk.latency.avg if available for finer granularity\n// Adjust threshold to suit workload\n| WHERE vcenter.vm.disk.latency.max IS NOT NULL AND vcenter.vm.disk.latency.max > 20\n| STATS latency_ms = MAX(vcenter.vm.disk.latency.max) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, vcenter.vm.name, attributes.object\n| SORT latency_ms DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.vm.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-ballooning.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-ballooning.json index 6c33b90c856..e5409157ed3 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-ballooning.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-ballooning.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-vm-memory-ballooning", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] VM memory ballooning", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "memory" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-vm-memory-ballooning", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `vcenter.vm.memory.ballooned > 0`. 
The hypervisor is reclaiming memory from the VM.", + "name": "[vSphere OTel] VM memory ballooning", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "memory" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-vms" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// VM memory ballooning: host reclaiming memory from VMs indicates memory pressure\n// Any non-zero value is a concern; adjust threshold if needed for noise reduction\n| WHERE vcenter.vm.memory.ballooned IS NOT NULL AND vcenter.vm.memory.ballooned > 0\n| STATS ballooned_mib = MAX(vcenter.vm.memory.ballooned) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, vcenter.vm.name\n| SORT ballooned_mib DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.vm.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-hosts" } + ], + "investigation_guide": { + "blob": "## vSphere VM memory ballooning\n\n### What fired\n`vcenter.vm.memory.ballooned > 0` during the window.\n\n### Why it matters\nBallooning is ESXi pressuring the guest to release memory. Performance within the VM degrades proportional to ballooned size.\n\n### Triage\n1. Check host memory utilisation.\n2. 
Inspect VM memory configuration.\n\n### Remediation\n- Migrate VM or grow host memory.\n\n### Tuning\n- Fires on any ballooned memory.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// VM memory ballooning: host reclaiming memory from VMs indicates memory pressure\n// Any non-zero value is a concern; adjust threshold if needed for noise reduction\n| WHERE vcenter.vm.memory.ballooned IS NOT NULL AND vcenter.vm.memory.ballooned > 0\n| STATS ballooned_mib = MAX(vcenter.vm.memory.ballooned) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, vcenter.vm.name\n| SORT ballooned_mib DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.vm.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-swapping.json b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-swapping.json index cf3f528bc1e..44f6e3bf94a 100644 --- a/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-swapping.json +++ b/packages/vsphere_otel/kibana/alerting_rule_template/vsphere_otel-vm-memory-swapping.json @@ -1,43 +1,57 @@ { - "id": "vsphere_otel-vm-memory-swapping", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[vSphere OTel] VM memory swapping", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "vsphere", - "vcenter", - "memory" - ], - "schedule": { - "interval": "5m" + "id": "vsphere_otel-vm-memory-swapping", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `vcenter.vm.memory.swapped > 0`. 
The hypervisor is swapping VM memory to disk \u2014 severe performance loss.", + "name": "[vSphere OTel] VM memory swapping", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "vsphere", + "vcenter", + "memory" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "vsphere_otel-vms" }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-vcenterreceiver.otel-*\n// VM memory swapping: hypervisor swapping to disk \u2014 severe performance degradation\n// Any non-zero value is critical; adjust threshold if needed for noise reduction\n| WHERE vcenter.vm.memory.swapped IS NOT NULL AND vcenter.vm.memory.swapped > 0\n| STATS swapped_mib = MAX(vcenter.vm.memory.swapped) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, vcenter.vm.name\n| SORT swapped_mib DESC\n| LIMIT 20" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "vcenter.vm.name", - "termSize": 20, - "excludeHitsFromPreviousRun": true + { + "id": "vsphere_otel-hosts" } + ], + "investigation_guide": { + "blob": "## vSphere VM memory swapping\n\n### What fired\n`vcenter.vm.memory.swapped > 0` during the window.\n\n### Why it matters\nHypervisor-level swapping of VM memory is the worst-case memory pressure signal \u2014 orders-of-magnitude latency hit on the VM.\n\n### Triage\n1. 
Always address immediately.\n\n### Remediation\n- vMotion the VM to a healthier host.\n- Add host memory.\n\n### Tuning\n- Always P1.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-vcenterreceiver.otel-*\n// VM memory swapping: hypervisor swapping to disk \u2014 severe performance degradation\n// Any non-zero value is critical; adjust threshold if needed for noise reduction\n| WHERE vcenter.vm.memory.swapped IS NOT NULL AND vcenter.vm.memory.swapped > 0\n| STATS swapped_mib = MAX(vcenter.vm.memory.swapped) BY vcenter.datacenter.name, vcenter.cluster.name, vcenter.host.name, vcenter.vm.name\n| SORT swapped_mib DESC\n| LIMIT 20" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "vcenter.vm.name", + "termSize": 20, + "excludeHitsFromPreviousRun": true } + } } diff --git a/packages/vsphere_otel/manifest.yml b/packages/vsphere_otel/manifest.yml index 0464c12da69..59775325026 100644 --- a/packages/vsphere_otel/manifest.yml +++ b/packages/vsphere_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: vsphere_otel title: "VMware vSphere OpenTelemetry Assets" -version: 0.1.0 +version: 0.2.0 source: license: "Elastic-2.0" description: "VMware vSphere Assets for OpenTelemetry Collector" @@ -12,7 +12,7 @@ categories: - virtualization conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: diff --git a/packages/zookeeper_otel/changelog.yml b/packages/zookeeper_otel/changelog.yml index 8cc8e6c0ee6..a69ecf0caa9 100644 --- a/packages/zookeeper_otel/changelog.yml +++ b/packages/zookeeper_otel/changelog.yml @@ -1,4 +1,9 @@ # newer versions go on top +- version: "0.2.0" + changes: + - description: Add description and artifact fields to alerting rule template. 
+ type: enhancement + link: https://github.com/elastic/integrations/pull/18506 - version: "0.1.1" changes: - description: Update supported stack version for SLO templates. diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-data-tree-size-excessive.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-data-tree-size-excessive.json index 6a8d0d67843..9eb51af4682 100644 --- a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-data-tree-size-excessive.json +++ b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-data-tree-size-excessive.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-data-tree-size-excessive", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Data tree size excessive", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.data_tree.size IS NOT NULL\n// Peak data tree size per node in the time window\n| STATS max_data_size = MAX(metrics.zookeeper.data_tree.size) BY host.name\n// Alert when the in-memory data tree exceeds 1 GB (1,073,741,824 bytes)\n// Unbounded growth causes memory pressure, GC pauses, and latency spikes\n// Adjust based on available heap memory\n| WHERE max_data_size > 1073741824" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-data-tree-size-excessive", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `zookeeper.data_tree.size > 1 GB`. 
Unbounded growth stresses heap and causes GC pauses.", + "name": "[ZooKeeper OTel] Data tree size excessive", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper data tree size excessive\n\n### What fired\nPeak data tree size exceeded 1 GB during the window.\n\n### Why it matters\nZooKeeper holds its data tree in memory. Very large trees mean long startup, heavy GC, and latency spikes.\n\n### Triage\n1. Identify which znode subtrees are largest.\n2. Check client write patterns.\n\n### Remediation\n- Clean up obsolete znodes.\n- Grow heap if legitimately needed.\n\n### Tuning\n- 1 GB threshold; tune to heap size.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.data_tree.size IS NOT NULL\n// Peak data tree size per node in the time window\n| STATS max_data_size = MAX(metrics.zookeeper.data_tree.size) BY host.name\n// Alert when the in-memory data tree exceeds 1 GB (1,073,741,824 bytes)\n// Unbounded growth causes memory pressure, GC pauses, and latency spikes\n// Adjust based on available heap memory\n| WHERE max_data_size > 1073741824" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-watch-count.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-watch-count.json index 06d7db4844b..35c1f381848 100644 --- 
a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-watch-count.json +++ b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-watch-count.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-excessive-watch-count", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Excessive watch count", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.watch.count IS NOT NULL\n// Peak watch count per node in the time window\n| STATS max_watches = MAX(metrics.zookeeper.watch.count) BY host.name\n// Official docs recommend alerting above 10,000 watches\n// High watch counts risk notification storms during topology changes\n// Adjust based on your client workload and Kafka topology size\n| WHERE max_watches > 10000" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-excessive-watch-count", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when watch count exceeds 10,000. 
High watch counts risk notification storms.", + "name": "[ZooKeeper OTel] Excessive watch count", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper excessive watch count\n\n### What fired\nWatch count exceeded 10,000 during the window.\n\n### Why it matters\nHigh watch counts amplify any topology change into a storm of callbacks, potentially overloading the ensemble.\n\n### Triage\n1. Inspect clients with the most watches.\n\n### Remediation\n- Redesign clients to use fewer targeted watches.\n\n### Tuning\n- 10k threshold matches the documented guidance.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.watch.count IS NOT NULL\n// Peak watch count per node in the time window\n| STATS max_watches = MAX(metrics.zookeeper.watch.count) BY host.name\n// Official docs recommend alerting above 10,000 watches\n// High watch counts risk notification storms during topology changes\n// Adjust based on your client workload and Kafka topology size\n| WHERE max_watches > 10000" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-znode-count.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-znode-count.json index 8f67c4f119e..40c5d157aa6 100644 --- a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-znode-count.json +++ 
b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-excessive-znode-count.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-excessive-znode-count", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Excessive znode count", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.znode.count IS NOT NULL\n// Peak znode count per node in the time window\n| STATS max_znodes = MAX(metrics.zookeeper.znode.count) BY host.name\n// Official ZooKeeper docs recommend alerting above 1,000,000 znodes\n// Excessive znodes cause memory pressure and slow startup/recovery\n// Adjust based on your expected data model size\n| WHERE max_znodes > 1000000" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 15, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-excessive-znode-count", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when znode count exceeds 1,000,000. 
Large counts slow startup and recovery.", + "name": "[ZooKeeper OTel] Excessive znode count", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper excessive znode count\n\n### What fired\nZnode count exceeded 1,000,000 during the window.\n\n### Why it matters\nMillion-plus znodes stretch ZooKeeper beyond its design sweet spot \u2014 startup, snapshots, and replication all slow.\n\n### Triage\n1. Identify the clients producing many znodes.\n\n### Remediation\n- Redesign the data model.\n\n### Tuning\n- 1M threshold matches the documented guidance.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.znode.count IS NOT NULL\n// Peak znode count per node in the time window\n| STATS max_znodes = MAX(metrics.zookeeper.znode.count) BY host.name\n// Official ZooKeeper docs recommend alerting above 1,000,000 znodes\n// Excessive znodes cause memory pressure and slow startup/recovery\n// Adjust based on your expected data model size\n| WHERE max_znodes > 1000000" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-file-descriptor-utilization-high.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-file-descriptor-utilization-high.json index 4aa7095d30f..b08e6096e73 100644 --- a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-file-descriptor-utilization-high.json +++ 
b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-file-descriptor-utilization-high.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-file-descriptor-utilization-high", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] File descriptor utilization high", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "5m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.file_descriptor.open IS NOT NULL\n AND metrics.zookeeper.file_descriptor.limit IS NOT NULL\n// Compute FD utilization as a percentage\n| EVAL fd_pct = metrics.zookeeper.file_descriptor.open * 100.0\n / metrics.zookeeper.file_descriptor.limit\n// Take the peak utilization per node in the time window\n| STATS max_fd_pct = MAX(fd_pct) BY host.name\n// 80% threshold — approaching the limit causes connection refusals\n// Each client connection uses one FD plus FDs for data/transaction logs\n// Adjust based on your configured FD limits\n| WHERE max_fd_pct > 80" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-file-descriptor-utilization-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when FD utilization exceeds 80%. 
FD exhaustion rejects client connections.", + "name": "[ZooKeeper OTel] File descriptor utilization high", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper file descriptor utilization high\n\n### What fired\nPeak FD utilization exceeded 80% during the window.\n\n### Why it matters\nEvery client connection uses one FD plus FDs for snapshot/transaction logs. Exhaustion blocks new connections.\n\n### Triage\n1. Inspect client connection churn.\n\n### Remediation\n- Raise `ulimit -n`.\n\n### Tuning\n- 80% threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.file_descriptor.open IS NOT NULL\n AND metrics.zookeeper.file_descriptor.limit IS NOT NULL\n// Compute FD utilization as a percentage\n| EVAL fd_pct = metrics.zookeeper.file_descriptor.open * 100.0\n / metrics.zookeeper.file_descriptor.limit\n// Take the peak utilization per node in the time window\n| STATS max_fd_pct = MAX(fd_pct) BY host.name\n// 80% threshold \u2014 approaching the limit causes connection refusals\n// Each client connection uses one FD plus FDs for data/transaction logs\n// Adjust based on your configured FD limits\n| WHERE max_fd_pct > 80" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-high-average-latency.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-high-average-latency.json index 
7e6d036fc8a..d7060742c55 100644 --- a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-high-average-latency.json +++ b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-high-average-latency.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-high-average-latency", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] High request latency", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.latency.max IS NOT NULL\n// Average the peak-latency gauge readings per node across the time window\n// latency.max reports the highest request latency since the last collection\n| STATS avg_peak_latency = AVG(metrics.zookeeper.latency.max) BY host.name\n// Threshold: sustained peak latency above 500 ms signals degradation\n// Sustained high latency risks Kafka broker session timeouts\n// Adjust based on your ensemble's baseline and SLA requirements\n| WHERE avg_peak_latency > 500" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-high-average-latency", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when average peak latency exceeds 500 ms. 
High latency risks Kafka broker session timeouts.", + "name": "[ZooKeeper OTel] High request latency", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper high request latency\n\n### What fired\nAverage peak latency exceeded 500 ms during the window.\n\n### Why it matters\nSustained high latency in ZooKeeper cascades into session timeouts for Kafka / HBase / Curator clients \u2014 often causing downstream rebalances.\n\n### Triage\n1. Check disk latency for txn log fsyncs.\n2. Inspect CPU and GC on ensemble members.\n\n### Remediation\n- Move ZK to dedicated fast storage.\n\n### Tuning\n- 500 ms threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.latency.max IS NOT NULL\n// Average the peak-latency gauge readings per node across the time window\n// latency.max reports the highest request latency since the last collection\n| STATS avg_peak_latency = AVG(metrics.zookeeper.latency.max) BY host.name\n// Threshold: sustained peak latency above 500 ms signals degradation\n// Sustained high latency risks Kafka broker session timeouts\n// Adjust based on your ensemble's baseline and SLA requirements\n| WHERE avg_peak_latency > 500" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-node-health-check-failed.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-node-health-check-failed.json index 
e91ada1c8d1..d04db9095b2 100644 --- a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-node-health-check-failed.json +++ b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-node-health-check-failed.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-node-health-check-failed", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Node health check failed", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.ruok IS NOT NULL\n// Aggregate per node; MIN captures any unhealthy reading in the window\n| STATS min_ruok = MIN(metrics.zookeeper.ruok) BY host.name\n// ruok = 0 means the node failed its health check and is not serving requests\n// This is the most direct signal of ZooKeeper node availability\n| WHERE min_ruok == 0" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-node-health-check-failed", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when `zookeeper.ruok == 0`. 
Node failed its four-letter health check \u2014 not serving requests.", + "name": "[ZooKeeper OTel] Node health check failed", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper node health check failed\n\n### What fired\n`zookeeper.ruok == 0` during the window.\n\n### Why it matters\n`ruok` is ZooKeeper's built-in liveness signal. A failed ruok means the node is unable to respond \u2014 loss of ensemble member.\n\n### Triage\n1. Check process state and logs.\n\n### Remediation\n- Restart the node after root-causing.\n\n### Tuning\n- Always P1.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.ruok IS NOT NULL\n// Aggregate per node; MIN captures any unhealthy reading in the window\n| STATS min_ruok = MIN(metrics.zookeeper.ruok) BY host.name\n// ruok = 0 means the node failed its health check and is not serving requests\n// This is the most direct signal of ZooKeeper node availability\n| WHERE min_ruok == 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-outstanding-requests-saturation.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-outstanding-requests-saturation.json index efdcfcaecee..e0825dfc792 100644 --- a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-outstanding-requests-saturation.json +++ 
b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-outstanding-requests-saturation.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-outstanding-requests-saturation", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Outstanding requests saturation", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.request.active IS NOT NULL\n// Average outstanding requests per node over the time window\n| STATS avg_outstanding = AVG(metrics.zookeeper.request.active) BY host.name\n// A healthy ensemble should have near-zero outstanding requests\n// Sustained values above this threshold indicate processing saturation\n// Adjust based on your workload; lower values catch issues sooner\n| WHERE avg_outstanding > 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-outstanding-requests-saturation", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when average outstanding requests exceed 10. 
Processing saturation on ZooKeeper.", + "name": "[ZooKeeper OTel] Outstanding requests saturation", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper outstanding requests saturation\n\n### What fired\nAverage `zookeeper.request.active > 10` during the window.\n\n### Why it matters\nA healthy ZK has near-zero outstanding requests. Sustained values signal saturation \u2014 client timeouts follow.\n\n### Triage\n1. Check disk fsync latency.\n2. Inspect CPU / GC pauses.\n\n### Remediation\n- Move ZK to faster storage or less-contended hosts.\n\n### Tuning\n- 10 threshold; lower for stricter monitoring.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.request.active IS NOT NULL\n// Average outstanding requests per node over the time window\n| STATS avg_outstanding = AVG(metrics.zookeeper.request.active) BY host.name\n// A healthy ensemble should have near-zero outstanding requests\n// Sustained values above this threshold indicate processing saturation\n// Adjust based on your workload; lower values catch issues sooner\n| WHERE avg_outstanding > 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-pending-syncs-buildup.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-pending-syncs-buildup.json index 34c22959c0a..96176d40324 100644 --- 
a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-pending-syncs-buildup.json +++ b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-pending-syncs-buildup.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-pending-syncs-buildup", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Pending syncs buildup on leader", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 3 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.sync.pending IS NOT NULL\n AND resource.attributes.server.state == \"leader\"\n// Average pending syncs on the leader over the time window\n| STATS avg_pending = AVG(metrics.zookeeper.sync.pending) BY host.name\n// A healthy leader should have zero pending syncs\n// Sustained buildup means the ZAB protocol is struggling to replicate\n// This is an early warning before followers become unsynced\n// Adjust threshold based on ensemble size and write volume\n| WHERE avg_pending > 10" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-pending-syncs-buildup", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the leader has more than 10 pending syncs. 
Replication backlog precedes unsynced followers.", + "name": "[ZooKeeper OTel] Pending syncs buildup on leader", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper pending syncs buildup on leader\n\n### What fired\nLeader's average `sync.pending > 10` during the window.\n\n### Why it matters\nThe ZAB protocol relies on quick syncs to followers. Buildup means followers are slow or the network is degraded.\n\n### Triage\n1. Check network between leader and followers.\n2. Inspect follower CPU/disk.\n\n### Remediation\n- Fix slow followers or network.\n\n### Tuning\n- 10 pending threshold.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.sync.pending IS NOT NULL\n AND resource.attributes.server.state == \"leader\"\n// Average pending syncs on the leader over the time window\n| STATS avg_pending = AVG(metrics.zookeeper.sync.pending) BY host.name\n// A healthy leader should have zero pending syncs\n// Sustained buildup means the ZAB protocol is struggling to replicate\n// This is an early warning before followers become unsynced\n// Adjust threshold based on ensemble size and write volume\n| WHERE avg_pending > 10" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-unsynced-followers.json b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-unsynced-followers.json index 02435a4b366..c517ae453ff 100644 --- 
a/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-unsynced-followers.json +++ b/packages/zookeeper_otel/kibana/alerting_rule_template/zookeeper_otel-unsynced-followers.json @@ -1,40 +1,51 @@ { - "id": "zookeeper_otel-unsynced-followers", - "type": "alerting_rule_template", - "managed": true, - "attributes": { - "name": "[ZooKeeper OTel] Unsynced followers detected", - "ruleTypeId": ".es-query", - "tags": [ - "observability", - "zookeeper" - ], - "schedule": { - "interval": "1m" - }, - "alertDelay": { - "active": 2 - }, - "flapping": { - "lookBackWindow": 10, - "statusChangeThreshold": 4 - }, - "params": { - "searchType": "esqlQuery", - "esqlQuery": { - "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.follower.count IS NOT NULL\n AND resource.attributes.server.state == \"leader\"\n AND attributes.state == \"unsynced\"\n// Peak unsynced follower count reported by the leader\n| STATS max_unsynced = MAX(metrics.zookeeper.follower.count) BY host.name\n// Any unsynced followers risk data loss during failover\n// and indicate replication lag or network issues between ensemble members\n| WHERE max_unsynced > 0" - }, - "size": 0, - "threshold": [ - 0 - ], - "thresholdComparator": ">", - "timeField": "@timestamp", - "timeWindowSize": 5, - "timeWindowUnit": "m", - "groupBy": "row", - "termField": "host.name", - "termSize": 10 + "id": "zookeeper_otel-unsynced-followers", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the leader reports any unsynced followers. 
Risks data loss during failover.", + "name": "[ZooKeeper OTel] Unsynced followers detected", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "zookeeper" + ], + "schedule": { + "interval": "1m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "dashboards": [ + { + "id": "zookeeper_otel-overview" } + ], + "investigation_guide": { + "blob": "## ZooKeeper unsynced followers detected\n\n### What fired\n`follower.count` with `state=unsynced` exceeded 0.\n\n### Why it matters\nUnsynced followers cannot promote to leader. If the leader fails while followers are unsynced, the ensemble loses quorum.\n\n### Triage\n1. Check follower logs for sync errors.\n2. Inspect replication pipeline.\n\n### Remediation\n- Restart or repair the unsynced follower.\n\n### Tuning\n- Fires on any unsynced follower.\n" + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-generic.otel-default\n| WHERE metrics.zookeeper.follower.count IS NOT NULL\n AND resource.attributes.server.state == \"leader\"\n AND attributes.state == \"unsynced\"\n// Peak unsynced follower count reported by the leader\n| STATS max_unsynced = MAX(metrics.zookeeper.follower.count) BY host.name\n// Any unsynced followers risk data loss during failover\n// and indicate replication lag or network issues between ensemble members\n| WHERE max_unsynced > 0" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "host.name", + "termSize": 10 } + } } diff --git a/packages/zookeeper_otel/manifest.yml b/packages/zookeeper_otel/manifest.yml index 7ef441ee671..8b8eaad8cc3 100644 --- a/packages/zookeeper_otel/manifest.yml +++ b/packages/zookeeper_otel/manifest.yml @@ -1,7 +1,7 @@ format_version: 3.5.7 name: zookeeper_otel title: "Zookeeper OTel Assets" -version: 0.1.1 
+version: 0.2.0 source: license: "Elastic-2.0" description: "Zookeeper OTel Assets" @@ -11,7 +11,7 @@ categories: - opentelemetry conditions: kibana: - version: "^9.3.0" + version: "^9.4.0" elastic: subscription: "basic" discovery: