-
Notifications
You must be signed in to change notification settings - Fork 173
feat(BA-5806): force capacity sentinel for unbounded kernel live_stat metrics #11535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
a5494d2
feat(BA-5806): synthesize capacity sentinel for kernel live_stat
seedspirit d7dbe1b
changelog: add news fragment for PR #11535
seedspirit 3713c11
fix: Align news fragment file name
seedspirit ff41673
fix(BA-5806): force sentinel capacity over current-as-fallback artifacts
seedspirit e7790d3
style: More intuitive var name
seedspirit File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Inject capacity sentinel into kernel live_stat for metrics without a Prometheus capacity series | ||
|
seedspirit marked this conversation as resolved.
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,9 @@ | ||
| from typing import Final | ||
| from collections.abc import Mapping | ||
| from dataclasses import dataclass | ||
| from typing import Final, Self | ||
|
|
||
| from ai.backend.common.clients.prometheus.types import MetricValue, ValueType | ||
| from ai.backend.common.types import KernelId | ||
|
|
||
| UNDEFINED: Final[str] = "undefined" | ||
|
|
||
|
|
@@ -9,3 +14,60 @@ | |
| CONTAINER_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "container_metric_name" | ||
| DEVICE_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "device_metric_name" | ||
| PROCESS_UTILIZATION_METRIC_LABEL_NAME: Final[str] = "process_metric_name" | ||
|
|
||
| # Stand-in capacity for metrics whose Prometheus capacity series does not exist. | ||
| CAPACITY_SENTINEL: Final[str] = "9223372036854775807" # 2**63 - 1 | ||
|
|
||
| CAPACITY_SENTINEL_METRICS: Final[frozenset[str]] = frozenset({ | ||
| "cpu_used", | ||
| "net_rx", | ||
| "net_tx", | ||
| "io_read", | ||
| "io_write", | ||
| }) | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class KernelLiveStatValues: | ||
| values_by_kernel: Mapping[KernelId, list[MetricValue]] | ||
|
|
||
| @classmethod | ||
| def with_capacity_sentinels( | ||
| cls, | ||
| values_by_kernel: Mapping[KernelId, list[MetricValue]], | ||
| ) -> Self: | ||
| """For metrics in `CAPACITY_SENTINEL_METRICS` that are live (have a | ||
| CURRENT sample), force the CAPACITY sample to `CAPACITY_SENTINEL`. | ||
|
|
||
| These metrics have no meaningful capacity (cumulative counters / rates), | ||
| so any CAPACITY value present in the Prometheus response is a stale | ||
| current-as-fallback artifact and must be overwritten rather than | ||
| respected. | ||
| """ | ||
| new_values: dict[KernelId, list[MetricValue]] = { | ||
| kid: list(vs) for kid, vs in values_by_kernel.items() | ||
| } | ||
| for kid, vs in new_values.items(): | ||
| reported_currents: set[str] = { | ||
| v.metric_name for v in vs if v.value_type is ValueType.CURRENT | ||
| } | ||
| sentinel_targets = reported_currents & CAPACITY_SENTINEL_METRICS | ||
| if not sentinel_targets: | ||
| continue | ||
|
Comment on lines
+51
to
+56
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The implementation feels a bit forced. |
||
| rewritten: list[MetricValue] = [] | ||
| samples_to_keep = [ | ||
| v | ||
| for v in vs | ||
| if not (v.value_type is ValueType.CAPACITY and v.metric_name in sentinel_targets) | ||
| ] | ||
| rewritten.extend(samples_to_keep) | ||
| for metric_name in sentinel_targets: | ||
| rewritten.append( | ||
| MetricValue( | ||
| metric_name=metric_name, | ||
| value_type=ValueType.CAPACITY, | ||
| value=CAPACITY_SENTINEL, | ||
| ) | ||
| ) | ||
| new_values[kid] = rewritten | ||
| return cls(values_by_kernel=new_values) | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| from uuid import UUID | ||
|
|
||
| import pytest | ||
|
|
||
| from ai.backend.common.clients.prometheus.types import MetricValue, ValueType | ||
| from ai.backend.common.metrics.types import ( | ||
| CAPACITY_SENTINEL, | ||
| KernelLiveStatValues, | ||
| ) | ||
| from ai.backend.common.types import KernelId | ||
|
|
||
|
|
||
| def _capacities_of(result: KernelLiveStatValues, kernel_id: KernelId) -> dict[str, str]: | ||
| """Return `{metric_name: value}` for every CAPACITY sample of `kernel_id`.""" | ||
| return { | ||
| v.metric_name: v.value | ||
| for v in result.values_by_kernel[kernel_id] | ||
| if v.value_type is ValueType.CAPACITY | ||
| } | ||
|
|
||
|
|
||
| class TestWithCapacitySentinels: | ||
| """Tests for `KernelLiveStatValues.with_capacity_sentinels`.""" | ||
|
|
||
| @pytest.fixture() | ||
| def kernel_id(self) -> KernelId: | ||
| return KernelId(UUID("12345678-1234-5678-1234-567812345678")) | ||
|
|
||
| @pytest.fixture() | ||
| def other_kernel_id(self) -> KernelId: | ||
| return KernelId(UUID("87654321-4321-8765-4321-876543218765")) | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "metric_name", | ||
| ["cpu_used", "net_rx", "net_tx", "io_read", "io_write"], | ||
| ) | ||
| async def test_appends_sentinel_capacity_for_whitelisted_live_metric( | ||
| self, kernel_id: KernelId, metric_name: str | ||
| ) -> None: | ||
| """Whitelisted metrics with a CURRENT sample but no CAPACITY sample | ||
| receive a synthetic capacity carrying `CAPACITY_SENTINEL`. | ||
| """ | ||
| result = KernelLiveStatValues.with_capacity_sentinels({ | ||
| kernel_id: [MetricValue(metric_name, ValueType.CURRENT, "42")], | ||
| }) | ||
| assert _capacities_of(result, kernel_id) == {metric_name: CAPACITY_SENTINEL} | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "metric_name", | ||
| ["cpu_used", "net_rx", "net_tx", "io_read", "io_write"], | ||
| ) | ||
| async def test_existing_capacity_is_overwritten( | ||
| self, kernel_id: KernelId, metric_name: str | ||
| ) -> None: | ||
| """Whitelisted metrics never have a meaningful capacity, so any | ||
| capacity already present (e.g. a current-as-fallback artifact) must | ||
| be overwritten by the sentinel. | ||
| """ | ||
| result = KernelLiveStatValues.with_capacity_sentinels({ | ||
| kernel_id: [ | ||
| MetricValue(metric_name, ValueType.CURRENT, "10"), | ||
| MetricValue(metric_name, ValueType.CAPACITY, "999"), | ||
| ], | ||
| }) | ||
| assert _capacities_of(result, kernel_id) == {metric_name: CAPACITY_SENTINEL} | ||
|
|
||
| async def test_skips_metric_without_current_sample(self, kernel_id: KernelId) -> None: | ||
| """No phantom capacity is added when the metric has no CURRENT sample.""" | ||
| result = KernelLiveStatValues.with_capacity_sentinels({kernel_id: []}) | ||
| assert _capacities_of(result, kernel_id) == {} | ||
|
|
||
| @pytest.mark.parametrize("metric_name", ["mem", "io_scratch_size", "cpu_util"]) | ||
| async def test_metric_outside_whitelist_is_untouched( | ||
| self, kernel_id: KernelId, metric_name: str | ||
| ) -> None: | ||
| """Metrics that have a real Prometheus capacity series are left alone.""" | ||
| result = KernelLiveStatValues.with_capacity_sentinels({ | ||
| kernel_id: [MetricValue(metric_name, ValueType.CURRENT, "1")], | ||
| }) | ||
| assert _capacities_of(result, kernel_id) == {} | ||
|
|
||
| async def test_isolates_per_kernel( | ||
| self, kernel_id: KernelId, other_kernel_id: KernelId | ||
| ) -> None: | ||
| """Sentinel injection on one kernel does not leak into another.""" | ||
| result = KernelLiveStatValues.with_capacity_sentinels({ | ||
| kernel_id: [MetricValue("net_rx", ValueType.CURRENT, "1")], | ||
| other_kernel_id: [MetricValue("mem", ValueType.CURRENT, "2")], | ||
| }) | ||
| assert _capacities_of(result, kernel_id) == {"net_rx": CAPACITY_SENTINEL} | ||
| assert _capacities_of(result, other_kernel_id) == {} | ||
|
|
||
| async def test_input_is_not_mutated(self, kernel_id: KernelId) -> None: | ||
| """The caller's input list is left untouched.""" | ||
| original = [MetricValue("net_rx", ValueType.CURRENT, "1")] | ||
| KernelLiveStatValues.with_capacity_sentinels({kernel_id: original}) | ||
| assert original == [MetricValue("net_rx", ValueType.CURRENT, "1")] |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.