diff --git a/README.md b/README.md index 4d6b7dda9..7932df84e 100644 --- a/README.md +++ b/README.md @@ -58,3 +58,20 @@ To run the server, see the [server installation documentation](https://uktrade.g ## Development See our full development guide and coding standards on our [contribution guide](https://uktrade.github.io/matchbox/contributing/). + +## Local development with Datadog + +When iterating the Datadog configuration, environment variables can be set in several ways: + +1. **Datadog configuration**: Create a `.datadog.env` file with your Datadog API key and other agent settings +2. **Compose override**: Use `docker-compose.override.yml` for local-specific variable overrides + +Variables in `.datadog.env` will override any defaults set in the compose file. + +Example `.datadog.env`: + +``` +DD_API_KEY=your_api_key_here +``` + +The Docker Compose file will automatically set `DD_ENV=local-{username}` for local development isolation. diff --git a/environments/development.env b/environments/development.env index 16d9760c2..349e0eeef 100644 --- a/environments/development.env +++ b/environments/development.env @@ -2,7 +2,7 @@ MB__DEV__API_PORT=8000 MB__DEV__DATASTORE_CONSOLE_PORT=9003 MB__DEV__DATASTORE_PORT=9002 MB__DEV__WAREHOUSE_PORT=7654 -MB__DEV__POSTGRES_BACKEND_PORT=5432 # Change to 9876 here and server.env to avoid conflict with other services +MB__DEV__POSTGRES_BACKEND_PORT=9876 # Change to 9876 here and server.env to avoid conflict with other services MB__SERVER__API_KEY=matchbox-api-key MB__SERVER__BACKEND_TYPE=postgres @@ -17,7 +17,7 @@ MB__SERVER__DATASTORE__DEFAULT_REGION=eu-west-2 MB__SERVER__DATASTORE__CACHE_BUCKET_NAME=cache MB__SERVER__POSTGRES__HOST=localhost -MB__SERVER__POSTGRES__PORT=5432 # Change to 9876 here and server.env to avoid conflict with other services +MB__SERVER__POSTGRES__PORT=9876 # Change to 9876 here and server.env to avoid conflict with other services MB__SERVER__POSTGRES__USER=matchbox_user MB__SERVER__POSTGRES__PASSWORD=matchbox_password MB__SERVER__POSTGRES__DATABASE=matchbox diff --git a/environments/server.env b/environments/server.env index bb669db81..3650e852f 100644 --- a/environments/server.env +++ b/environments/server.env @@ -2,7 +2,7 @@ MB__DEV__API_PORT=8000 MB__DEV__DATASTORE_CONSOLE_PORT=9003 MB__DEV__DATASTORE_PORT=9002 MB__DEV__WAREHOUSE_PORT=7654 -MB__DEV__POSTGRES_BACKEND_PORT=5432 # Change to 9876 here and development.env to avoid conflict with other services +MB__DEV__POSTGRES_BACKEND_PORT=9876 # Change to 9876 here and development.env to avoid conflict with other services MB__SERVER__API_KEY=matchbox-api-key MB__SERVER__BACKEND_TYPE=postgres diff --git a/src/matchbox/client/_handler.py b/src/matchbox/client/_handler.py index 6562d5de4..a378e9bf1 100644 --- a/src/matchbox/client/_handler.py +++ b/src/matchbox/client/_handler.py @@ -158,21 +158,21 @@ def login(user_name: str) -> int: def query( - source: SourceResolutionName, + sources: list[SourceResolutionName], return_leaf_id: bool, resolution: ResolutionName | None = None, threshold: int | None = None, limit: int | None = None, ) -> Table: - """Query a source in Matchbox.""" - log_prefix = f"Query {source}" + """Query multiple sources in Matchbox.""" + log_prefix = f"Query {', '.join(sources)}" logger.debug(f"Using {resolution}", prefix=log_prefix) res = CLIENT.get( "/query", params=url_params( { - "source": source, + "sources": sources, "resolution": resolution, "return_leaf_id": return_leaf_id, "threshold": threshold, @@ -438,7 +438,8 @@ def sample_for_eval(n: int, resolution: ModelResolutionName, user_id: int) -> Ta params=url_params({"n": n, "resolution": resolution, "user_id": user_id}), ) - return read_table(BytesIO(res.content)) + buffer = BytesIO(res.content) + return read_table(buffer) def compare_models(resolutions: list[ModelResolutionName]) -> ModelComparison: diff --git a/src/matchbox/client/extract.py b/src/matchbox/client/extract.py index e666209a8..2a847082b 100644 --- a/src/matchbox/client/extract.py +++ b/src/matchbox/client/extract.py @@ -34,25 +34,44 @@ def key_field_map( source_mb_ids: list[ArrowTable] = [] source_to_key_field: dict[str, str] = {} + # Store source names and key field mappings + source_names = [s.name for s in sources] for s in sources: - # Get Matchbox IDs from backend + source_to_key_field[s.name] = s.key_field.name + + if len(sources) == 1: + # Single source - make individual call source_mb_ids.append( _handler.query( - source=s.name, + sources=[sources[0].name], resolution=resolution, return_leaf_id=False, ) ) + else: + # Multiple sources - make single multi-source call + combined_result = _handler.query( + sources=source_names, + resolution=resolution, + return_leaf_id=False, + ) - source_to_key_field[s.name] = s.key_field.name + # Split the combined result by source + import polars as pl + + combined_df = pl.from_arrow(combined_result) + for source_name in source_names: + source_data = combined_df.filter(pl.col("source") == source_name).to_arrow() + source_mb_ids.append(source_data) # Join Matchbox IDs to form mapping table - mapping = source_mb_ids[0] + mapping = source_mb_ids[0].select(["id", "key"]) mapping = mapping.rename_columns({"key": sources[0].qualified_key}) if len(sources) > 1: for s, mb_ids in zip(sources[1:], source_mb_ids[1:], strict=True): + mb_ids_selected = mb_ids.select(["id", "key"]) mapping = mapping.join( - right_table=mb_ids, keys="id", join_type="full outer" + right_table=mb_ids_selected, keys="id", join_type="full outer" ) mapping = mapping.rename_columns({"key": s.qualified_key}) diff --git a/src/matchbox/client/helpers/selector.py b/src/matchbox/client/helpers/selector.py index 6ec755173..08101d3fc 100644 --- a/src/matchbox/client/helpers/selector.py +++ b/src/matchbox/client/helpers/selector.py @@ -198,16 +198,27 @@ def _process_selectors( For batched queries, yield from it. """ - selector_results: list[PolarsDataFrame] = [] - for selector in selectors: - mb_ids = pl.from_arrow( - _handler.query( - source=selector.source.name, - resolution=resolution, - threshold=threshold, - return_leaf_id=return_leaf_id, - ) + # Group selectors by resolution to make efficient multi-source queries + if not selectors: + return + + # Make single multi-source query with all selectors + source_names = [selector.source.name for selector in selectors] + + # Make single multi-source API call + mb_ids = pl.from_arrow( + _handler.query( + sources=source_names, + resolution=resolution, + threshold=threshold, + return_leaf_id=return_leaf_id, ) + ) + + # Process each selector with the multi-source result + for selector in selectors: + # Filter the multi-source results to this selector's source + source_filtered_ids = mb_ids.filter(pl.col("source") == selector.source.name) raw_batches = selector.source.query( qualify_names=True, @@ -218,15 +229,17 @@ def _process_selectors( processed_batches = [ _process_query_result( data=b, + mb_ids=source_filtered_ids, selector=selector, - mb_ids=mb_ids, return_leaf_id=return_leaf_id, ) for b in raw_batches ] - selector_results.append(pl.concat(processed_batches, how="vertical")) - return selector_results + # Concatenate all batches for this selector and yield + if processed_batches: + selector_result = pl.concat(processed_batches, how="vertical") + yield selector_result def query( diff --git a/src/matchbox/common/arrow.py b/src/matchbox/common/arrow.py index 2c151f645..0302bb9d0 100644 --- a/src/matchbox/common/arrow.py +++ b/src/matchbox/common/arrow.py @@ -5,18 +5,21 @@ from typing import Final import pyarrow as pa -import pyarrow.parquet as pq from pyarrow import Schema from matchbox.common.exceptions import MatchboxArrowSchemaMismatch SCHEMA_QUERY: Final[pa.Schema] = pa.schema( - [("id", pa.int64()), ("key", pa.large_string())] + [ + ("id", pa.int64()), + ("key", pa.large_string()), + ("source", pa.dictionary(pa.int32(), pa.string())), + ] ) -"""Data transfer schema for root cluster IDs keyed to primary keys.""" +"""Data transfer schema for root cluster IDs keyed to primary keys with source ID.""" SCHEMA_QUERY_WITH_LEAVES = SCHEMA_QUERY.append(pa.field("leaf_id", pa.int64())) -"""Data transfer schema for root cluster IDs keyed to primary keys and leaf IDs.""" +"""Data transfer schema for cluster IDs with primary keys, source ID, and leaf IDs.""" SCHEMA_INDEX: Final[pa.Schema] = pa.schema( @@ -70,9 +73,15 @@ class JudgementsZipFilenames(StrEnum): def table_to_buffer(table: pa.Table) -> BytesIO: - """Converts an Arrow table to a BytesIO buffer.""" + """Converts an Arrow table to a BytesIO buffer using Arrow IPC format. + + Uses Arrow IPC format instead of parquet to preserve exact schema fidelity, + including uint32 dictionary indices and large_string values. + """ sink = BytesIO() - pq.write_table(table, sink) + writer = pa.ipc.new_file(sink, table.schema) + writer.write_table(table) + writer.close() sink.seek(0) return sink diff --git a/src/matchbox/common/eval.py b/src/matchbox/common/eval.py index ce195b0ef..fd33c93c7 100644 --- a/src/matchbox/common/eval.py +++ b/src/matchbox/common/eval.py @@ -177,7 +177,11 @@ def process_judgements( # if missing expansion, assume we're dealing with singleton leaves .with_columns( pl.when(pl.col("endorsed_leaves").is_null()) - .then(pl.col("endorsed").map_elements(lambda x: [x])) + .then( + pl.col("endorsed").map_elements( + lambda x: [x], return_dtype=pl.List(pl.UInt64) + ) + ) .otherwise(pl.col("endorsed_leaves")) .alias("endorsed_leaves") ) diff --git a/src/matchbox/common/factories/entities.py b/src/matchbox/common/factories/entities.py index 9b759946f..4eb64de9f 100644 --- a/src/matchbox/common/factories/entities.py +++ b/src/matchbox/common/factories/entities.py @@ -291,12 +291,12 @@ def get_values( raise ValueError(f"SourceConfig not found: {source_name}") # Get rows for this entity in this source - df = source.data.to_pandas() - entity_rows = df[df["key"].isin(keys)] + df = pl.from_arrow(source.data) + entity_rows = df.filter(pl.col("key").is_in(keys)) # Get unique values for each feature in this source values[source_name] = { - feature.name: sorted(entity_rows[feature.name].unique()) + feature.name: sorted(entity_rows[feature.name].unique().to_list()) for feature in source.features } @@ -473,33 +473,43 @@ def query_to_cluster_entities( Returns: A set of ClusterEntity objects """ - # Convert polars to pandas for compatibility with existing logic - if isinstance(query, pl.DataFrame): - query = query.to_pandas() - elif isinstance(query, pa.Table): - query = query.to_pandas() + # Convert to polars for efficient processing (avoids pandas uint32 issues) + if isinstance(query, pa.Table): + query_df = pl.from_arrow(query) + elif isinstance(query, pd.DataFrame): + query_df = pl.from_pandas(query) + else: + query_df = query must_have_fields = set(["id"] + list(keys.values())) - if not must_have_fields.issubset(query.columns): + if not must_have_fields.issubset(query_df.columns): raise ValueError( - f"Fields {must_have_fields.difference(query.columns)} must be included " + f"Fields {must_have_fields.difference(query_df.columns)} must be included " "in the query and are missing." ) - def _create_cluster_entity(group: pd.DataFrame) -> ClusterEntity: - entity_refs = { - source: frozenset(group[key_field].dropna().values) - for source, key_field in keys.items() - if not group[key_field].dropna().empty - } + def _create_cluster_entity(group_df: pl.DataFrame) -> ClusterEntity: + # Get the cluster ID (should be the same for all rows in the group) + cluster_id = group_df["id"][0] + + entity_refs = {} + for source, key_field in keys.items(): + # Get non-null values for this key field + values = group_df.filter(pl.col(key_field).is_not_null())[key_field] + if len(values) > 0: + entity_refs[source] = frozenset(values.to_list()) return ClusterEntity( - id=group.name, + id=cluster_id, keys=EntityReference(entity_refs), ) - result = query.groupby("id").apply(_create_cluster_entity, include_groups=False) - return set(result.tolist()) + # Group by cluster ID and create ClusterEntity for each group + result = [] + for _cluster_id, group_df in query_df.group_by("id"): + result.append(_create_cluster_entity(group_df)) + + return set(result) @cache diff --git a/src/matchbox/server/api/main.py b/src/matchbox/server/api/main.py index e97981602..72abf3ff3 100644 --- a/src/matchbox/server/api/main.py +++ b/src/matchbox/server/api/main.py @@ -235,16 +235,16 @@ async def get_upload_status( ) def query( backend: BackendDependency, - source: SourceResolutionName, + sources: Annotated[list[SourceResolutionName], Query()], return_leaf_id: bool, resolution: ResolutionName | None = None, threshold: int | None = None, limit: int | None = None, ) -> ParquetResponse: - """Query Matchbox for matches based on a source resolution name.""" + """Query Matchbox for matches based on multiple source resolution names.""" try: res = backend.query( - source=source, + sources=sources, resolution=resolution, threshold=threshold, return_leaf_id=return_leaf_id, diff --git a/src/matchbox/server/base.py b/src/matchbox/server/base.py index 94d0997bd..492cae5f7 100644 --- a/src/matchbox/server/base.py +++ b/src/matchbox/server/base.py @@ -228,7 +228,7 @@ class MatchboxDBAdapter(ABC): @abstractmethod def query( self, - source: SourceResolutionName, + sources: list[SourceResolutionName], resolution: ResolutionName | None = None, threshold: int | None = None, return_leaf_id: bool = False, @@ -237,9 +237,9 @@ def query( """Queries the database from an optional point of truth. Args: - source: the `SourceResolutionName` string identifying the source to query + sources: list of `SourceResolutionName` strings identifying sources to query resolution (optional): the resolution to use for filtering results - If not specified, will use the source resolution for the queried source + If not specified, will use the source resolution for the first source threshold (optional): the threshold to use for creating clusters If None, uses the models' default threshold If an integer, uses that threshold for the specified model, and the diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 2f07dc527..6ea8c133e 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -193,14 +193,14 @@ def __init__(self, settings: MatchboxPostgresSettings): def query( # noqa: D102 self, - source: SourceResolutionName, + sources: list[SourceResolutionName], resolution: ResolutionName | None = None, threshold: int | None = None, return_leaf_id: bool = False, limit: int | None = None, ) -> ArrowTable: return query( - source=source, + sources=sources, resolution=resolution, threshold=threshold, return_leaf_id=return_leaf_id, diff --git a/src/matchbox/server/postgresql/utils/query.py b/src/matchbox/server/postgresql/utils/query.py index 6469e355c..3f7d42ae6 100644 --- a/src/matchbox/server/postgresql/utils/query.py +++ b/src/matchbox/server/postgresql/utils/query.py @@ -2,6 +2,7 @@ from typing import Literal, TypeVar +import polars as pl import pyarrow as pa from sqlalchemy import ( CTE, @@ -467,17 +468,17 @@ def _build_match_query( def query( - source: SourceResolutionName, + sources: list[SourceResolutionName], resolution: ResolutionName | None = None, threshold: int | None = None, return_leaf_id: bool = False, limit: int = None, ) -> pa.Table: - """Queries Matchbox to retrieve linked data for a source. + """Queries Matchbox to retrieve linked data for multiple sources. - Retrieves all linked data for a given source, resolving through hierarchy if needed. + Retrieves all linked data for given sources, resolving through hierarchy if needed. - * Simple case: If querying the same resolution as the source, just select cluster + * Simple case: If querying the same resolution as the sources, just select cluster IDs and keys directly from ClusterSourceKey * Hierarchy case: Uses the unified query builder to traverse up the resolution hierarchy, applying COALESCE priority logic to determine which parent cluster @@ -487,11 +488,19 @@ def query( Returns all records with their final resolved cluster IDs. """ + if not sources: + raise ValueError("At least one source must be provided") + with MBDB.get_session() as session: - source_config: SourceConfigs = get_source_config(source, session) - source_resolution: Resolutions = session.get( - Resolutions, source_config.resolution_id - ) + # Get all source configs and validate they exist + source_configs: list[SourceConfigs] = [] + source_resolutions: list[Resolutions] = [] + + for source in sources: + source_config = get_source_config(source, session) + source_configs.append(source_config) + source_resolution = session.get(Resolutions, source_config.resolution_id) + source_resolutions.append(source_resolution) if resolution: truth_resolution: Resolutions = ( @@ -502,14 +511,16 @@ def query( if truth_resolution is None: raise MatchboxResolutionNotFoundError(name=resolution) else: - truth_resolution: Resolutions = source_resolution + # Use the first source's resolution as the truth resolution + truth_resolution: Resolutions = source_resolutions[0] + # Add source_config_id to the query level selection id_query: Select = build_unified_query( resolution=truth_resolution, - sources=[source_config], + sources=source_configs, threshold=threshold, level="key", - ) + ).add_columns(ClusterSourceKey.source_config_id) if limit: id_query = id_query.limit(limit) @@ -518,14 +529,36 @@ def query( stmt: str = compile_sql(id_query) logger.debug(f"Query SQL: \n {stmt}") id_results = sql_to_df( - stmt=stmt, connection=conn.dbapi_connection, return_type="arrow" - ).rename_columns({"root_id": "id"}) + stmt=stmt, connection=conn.dbapi_connection, return_type="polars" + ).rename({"root_id": "id"}) + + # Add source identification column using Polars join (faster than map_elements) + # Create a mapping DataFrame for efficient joining + source_mapping_df = pl.DataFrame( + { + "source_config_id": [ + config.source_config_id for config in source_configs + ], + "source": [sources[i] for i in range(len(source_configs))], + } + ) - selection = ["id", "key"] + # Join to add source names and cast to categorical for efficient encoding + id_results = id_results.join( + source_mapping_df, on="source_config_id", how="left" + ).with_columns(pl.col("source").cast(pl.Categorical)) + + # Select final columns + selection = ["id", "key", "source"] if return_leaf_id: selection.append("leaf_id") - return id_results.select(selection) + final_df = id_results.select(selection) + + # Convert to Arrow - Polars creates dictionary + # which is compatible with parquet format for efficient file sizes. + # Note: parquet would downgrade uint32/large_string anyway for compatibility. + return final_df.to_arrow() def get_parent_clusters_and_leaves( diff --git a/test/client/helpers/test_query_helper.py b/test/client/helpers/test_query_helper.py index f005e7ced..d4e4c0d7f 100644 --- a/test/client/helpers/test_query_helper.py +++ b/test/client/helpers/test_query_helper.py @@ -1,4 +1,7 @@ +from io import BytesIO + import pyarrow as pa +import pyarrow.parquet as pq import pytest from httpx import Response from numpy import ndarray @@ -7,7 +10,7 @@ from matchbox import query from matchbox.client.helpers import select -from matchbox.common.arrow import SCHEMA_QUERY, table_to_buffer +from matchbox.common.arrow import SCHEMA_QUERY from matchbox.common.dtos import BackendResourceType, NotFoundError from matchbox.common.exceptions import MatchboxResolutionNotFoundError from matchbox.common.factories.sources import source_factory, source_from_tuple @@ -32,20 +35,29 @@ def test_query_no_resolution_ok_various_params( return_value=Response(200, json=testkit.source_config.model_dump(mode="json")) ) + # Create mock table that matches parquet schema (int32/string dictionary) + id_array = pa.array([1, 2], type=pa.int64()) + key_array = pa.array(["0", "1"], type=pa.large_string()) + + # Create dictionary with int32 indices and string values (parquet compatible) + source_indices = pa.array([0, 0], type=pa.int32()) + source_dict = pa.array(["foo"], type=pa.string()) + source_array = pa.DictionaryArray.from_arrays(source_indices, source_dict) + + mock_table = pa.table([id_array, key_array, source_array], schema=SCHEMA_QUERY) + + # Create parquet buffer instead of Arrow IPC + parquet_buffer = BytesIO() + pq.write_table(mock_table, parquet_buffer) + parquet_buffer.seek(0) + query_route = matchbox_api.get("/query").mock( return_value=Response( 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - {"key": "0", "id": 1}, - {"key": "1", "id": 2}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), + content=parquet_buffer.read(), ) - ) + ) # The query route mock above will handle both calls automatically + # since respx allows multiple calls to the same route selectors = select({"foo": ["a", "b"]}, client=sqlite_warehouse) @@ -54,8 +66,10 @@ def test_query_no_resolution_ok_various_params( assert len(results) == 2 assert {"foo_a", "foo_b", "id"} == set(results.columns) - assert dict(query_route.calls.last.request.url.params) == { - "source": testkit.source_config.name, + # Check first call (without threshold) + first_call_params = dict(query_route.calls[0].request.url.params) + assert first_call_params == { + "sources": testkit.source_config.name, "return_leaf_id": "False", } @@ -66,8 +80,10 @@ def test_query_no_resolution_ok_various_params( assert len(results) == 2 assert {"foo_a", "foo_b", "id"} == set(results.columns) - assert dict(query_route.calls.last.request.url.params) == { - "source": testkit.source_config.name, + # Check second call (with threshold) + second_call_params = dict(query_route.calls[1].request.url.params) + assert second_call_params == { + "sources": testkit.source_config.name, "threshold": "50", "return_leaf_id": "False", } @@ -101,34 +117,31 @@ def test_query_multiple_sources(matchbox_api: MockRouter, sqlite_warehouse: Engi return_value=Response(200, json=testkit2.source_config.model_dump(mode="json")) ) + # Create combined mock table for both sources using parquet-compatible schema + id_array = pa.array([1, 2, 1, 2], type=pa.int64()) + key_array = pa.array(["0", "1", "2", "3"], type=pa.large_string()) + + # Create categorical source array with parquet-compatible types + source_indices = pa.array([0, 0, 1, 1], type=pa.int32()) + source_dict = pa.array( + [testkit1.source_config.name, testkit2.source_config.name], type=pa.string() + ) + source_array = pa.DictionaryArray.from_arrays(source_indices, source_dict) + + combined_mock_table = pa.table( + [id_array, key_array, source_array], schema=SCHEMA_QUERY + ) + + # Mock for multi-source query - use general matching + combined_parquet_buffer = BytesIO() + pq.write_table(combined_mock_table, combined_parquet_buffer) + combined_parquet_buffer.seek(0) + query_route = matchbox_api.get("/query").mock( - side_effect=[ - Response( - 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - {"key": "0", "id": 1}, - {"key": "1", "id": 2}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), - ), - Response( - 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - {"key": "2", "id": 1}, - {"key": "3", "id": 2}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), - ), - ] - * 2 # 2 calls to `query()` in this test, each querying server twice + return_value=Response( + 200, + content=combined_parquet_buffer.read(), + ) ) sels = select("foo", {"foo2": ["c"]}, client=sqlite_warehouse) @@ -146,18 +159,21 @@ def test_query_multiple_sources(matchbox_api: MockRouter, sqlite_warehouse: Engi "id", } == set(results.columns) - assert dict(query_route.calls[-2].request.url.params) == { - "source": testkit1.source_config.name, - "resolution": DEFAULT_RESOLUTION, - "return_leaf_id": "False", - } - assert dict(query_route.calls[-1].request.url.params) == { - "source": testkit2.source_config.name, - "resolution": DEFAULT_RESOLUTION, - "return_leaf_id": "False", + # Check that the multi-source query was made correctly + from urllib.parse import parse_qs, urlparse + + last_request_url = str(query_route.calls.last.request.url) + parsed_url = urlparse(last_request_url) + url_params = parse_qs(parsed_url.query) + + assert url_params == { + "sources": [testkit1.source_config.name, testkit2.source_config.name], + "resolution": [DEFAULT_RESOLUTION], + "return_leaf_id": ["False"], } # It also works with the selectors specified separately + # But with the optimization, this also makes a single multi-source call query([sels[0]], [sels[1]], return_leaf_id=False) @@ -195,41 +211,34 @@ def test_query_combine_type( return_value=Response(200, json=testkit2.source_config.model_dump(mode="json")) ) + # Create combined mock table for multi-source query using parquet-compatible schema + id_array = pa.array([1, 1, 2, 2, 2, 3], type=pa.int64()) + key_array = pa.array(["0", "1", "2", "3", "3", "4"], type=pa.large_string()) + + # Create categorical source array with parquet-compatible types + source_indices = pa.array([0, 0, 0, 1, 1, 1], type=pa.int32()) + source_dict = pa.array( + [testkit1.source_config.name, testkit2.source_config.name], type=pa.string() + ) + source_array = pa.DictionaryArray.from_arrays(source_indices, source_dict) + + combined_mock_table = pa.table( + [id_array, key_array, source_array], schema=SCHEMA_QUERY + ) + + # Create parquet buffer for mock response + combined_parquet_buffer2 = BytesIO() + pq.write_table(combined_mock_table, combined_parquet_buffer2) + combined_parquet_buffer2.seek(0) + matchbox_api.get("/query").mock( - side_effect=[ - Response( - 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - {"key": "0", "id": 1}, - {"key": "1", "id": 1}, - {"key": "2", "id": 2}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), - ), - Response( - 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - # Creating a duplicate value for the same Matchbox ID - {"key": "3", "id": 2}, - {"key": "3", "id": 2}, - {"key": "4", "id": 3}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), - ), - ] # two sources to query + return_value=Response( + 200, + content=combined_parquet_buffer2.read(), + ) ) sels = select("foo", "bar", client=sqlite_warehouse) - - # Validate results results = query(sels, combine_type=combine_type, return_leaf_id=False) if combine_type == "set_agg": diff --git a/test/client/test_eval.py b/test/client/test_eval.py index 6d33b2f20..60e50a444 100644 --- a/test/client/test_eval.py +++ b/test/client/test_eval.py @@ -163,7 +163,13 @@ def test_get_samples( content=table_to_buffer(just_baz_samples).read(), ) ) - no_accessible_samples = get_samples(n=10, resolution="resolution", user_id=user_id) + # Suppress expected warning about incompatible client for 'baz' source + with pytest.warns( + UserWarning, match="Skipping baz, incompatible with given client" + ): + no_accessible_samples = get_samples( + n=10, resolution="resolution", user_id=user_id + ) assert no_accessible_samples == {} # Using default client as fallback diff --git a/test/client/test_extract.py b/test/client/test_extract.py index d2caae7ae..836d8ab37 100644 --- a/test/client/test_extract.py +++ b/test/client/test_extract.py @@ -61,35 +61,48 @@ def test_key_field_map( ) ) - matchbox_api.get("/query", params={"source": "foo"}).mock( + # Create mock table for foo source (for single-source queries) + indices_foo = pa.array([0, 0, 0], type=pa.uint32()) + dictionary_foo = pa.array(["foo"], type=pa.large_string()) + source_dict_foo = pa.DictionaryArray.from_arrays(indices_foo, dictionary_foo) + + foo_table = pa.Table.from_arrays( + [ + pa.array([1, 2, 3], type=pa.int64()), + pa.array(["1", "2", "3"], type=pa.large_string()), + source_dict_foo, + ], + schema=SCHEMA_QUERY, + ) + + matchbox_api.get("/query", params={"sources": ["foo"]}).mock( return_value=Response( 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - {"id": 1, "key": "1"}, - {"id": 2, "key": "2"}, - {"id": 3, "key": "3"}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), + content=table_to_buffer(foo_table).read(), ) ) - matchbox_api.get("/query", params={"source": "bar"}).mock( + # Create combined mock table for multi-source queries + combined_indices = pa.array([0, 0, 0, 1, 1, 1], type=pa.uint32()) + combined_dictionary = pa.array(["foo", "bar"], type=pa.large_string()) + combined_source_dict = pa.DictionaryArray.from_arrays( + combined_indices, combined_dictionary + ) + + combined_table = pa.Table.from_arrays( + [ + pa.array([1, 2, 3, 1, 3, 3], type=pa.int64()), + pa.array(["1", "2", "3", "a", "b", "c"], type=pa.large_string()), + combined_source_dict, + ], + schema=SCHEMA_QUERY, + ) + + # Mock for multi-source query (no filter or both sources) + matchbox_api.get("/query", params={"sources": ["foo", "bar"]}).mock( return_value=Response( 200, - content=table_to_buffer( - pa.Table.from_pylist( - [ - {"id": 1, "key": "a"}, - {"id": 3, "key": "b"}, - {"id": 3, "key": "c"}, - ], - schema=SCHEMA_QUERY, - ) - ).read(), + content=table_to_buffer(combined_table).read(), ) ) @@ -112,7 +125,7 @@ def test_key_field_map( ) # With source filter - foo_mapping = key_field_map(resolution="companies", source_filter="foo") + foo_mapping = key_field_map(resolution="companies", source_filter=["foo"]) assert_frame_equal( pl.from_arrow(foo_mapping), @@ -124,7 +137,7 @@ def test_key_field_map( # With both filters foo_mapping = key_field_map( resolution="companies", - source_filter="foo", + source_filter=["foo"], location_names="sqlite", ) diff --git a/test/fixtures/db.py b/test/fixtures/db.py index ff8a1029d..034b3cc5b 100644 --- a/test/fixtures/db.py +++ b/test/fixtures/db.py @@ -146,7 +146,7 @@ def create_dedupe_scenario( name = f"naive_test.{source.name}" # Query the raw data - source_query = backend.query(source=source.name) + source_query = backend.query(sources=[source.name]) # Build model testkit using query data model_testkit = query_to_model_factory( @@ -189,7 +189,7 @@ def create_probabilistic_dedupe_scenario( name = f"probabilistic_test.{source.name}" # Query the raw data - source_query = backend.query(source=source.name) + source_query = backend.query(sources=[source.name]) # Build model testkit using query data model_testkit = query_to_model_factory( @@ -234,9 +234,9 @@ def create_link_scenario( cdms_model = dag.models["naive_test.cdms"] # Query data for each resolution - crn_query = backend.query(source="crn", resolution=crn_model.name) - duns_query = backend.query(source="duns", resolution=duns_model.name) - cdms_query = backend.query(source="cdms", resolution=cdms_model.name) + crn_query = backend.query(sources=["crn"], resolution=crn_model.name) + duns_query = backend.query(sources=["duns"], resolution=duns_model.name) + cdms_query = backend.query(sources=["cdms"], resolution=cdms_model.name) # Create CRN-DUNS link crn_duns_name = "deterministic_naive_test.crn_naive_test.duns" @@ -282,17 +282,17 @@ def create_link_scenario( # Create final join # Query the previous link's results crn_cdms_query_crn_only = backend.query( - source="crn", resolution=crn_cdms_name - ).rename_columns(["id", "keys_crn"]) + sources=["crn"], resolution=crn_cdms_name + ).rename_columns(["id", "keys_crn", "source"]) crn_cdms_query_cdms_only = backend.query( - source="cdms", resolution=crn_cdms_name - ).rename_columns(["id", "keys_cdms"]) + sources=["cdms"], resolution=crn_cdms_name + ).rename_columns(["id", "keys_cdms", "source"]) crn_cdms_query = pa.concat_tables( [crn_cdms_query_crn_only, crn_cdms_query_cdms_only], promote_options="default", ).combine_chunks() - duns_query_linked = backend.query(source="duns", resolution=duns_model.name) + duns_query_linked = backend.query(sources=["duns"], resolution=duns_model.name) final_join_name = "final_join" final_join_model = query_to_model_factory( @@ -361,7 +361,7 @@ def create_alt_dedupe_scenario( model_name2 = f"dedupe2.{source.name}" # Query the raw data - source_query = backend.query(source=source.name) + source_query = backend.query(sources=[source.name]) # Build model testkit using query data model_testkit1 = query_to_model_factory( @@ -457,7 +457,7 @@ def create_convergent_scenario( name = f"naive_test.{source.name}" # Query the raw data - source_query = backend.query(source=source.name) + source_query = backend.query(sources=[source.name]) # Build model testkit using query data model_testkit = query_to_model_factory( diff --git a/test/server/api/routes/test_routes_main.py b/test/server/api/routes/test_routes_main.py index 7d1fe78bf..946071e7e 100644 --- a/test/server/api/routes/test_routes_main.py +++ b/test/server/api/routes/test_routes_main.py @@ -309,23 +309,31 @@ def test_process_upload_deletes_file_on_failure(s3: S3Client): def test_query(test_client: TestClient): # Mock backend mock_backend = Mock() - mock_backend.query = Mock( - return_value=pa.Table.from_pylist( - [ - {"keys": "a", "id": 1}, - {"keys": "b", "id": 2}, - ], - schema=SCHEMA_QUERY, - ) + + # Create test data with proper schema including categorical source + # Create dictionary array directly with large_string values + indices = pa.array([0, 0], type=pa.uint32()) + dictionary = pa.array(["foo"], type=pa.large_string()) + source_dict = pa.DictionaryArray.from_arrays(indices, dictionary) + + mock_table = pa.Table.from_arrays( + [ + pa.array([1, 2], type=pa.int64()), + pa.array(["a", "b"], type=pa.large_string()), + source_dict, + ], + schema=SCHEMA_QUERY, ) + mock_backend.query = Mock(return_value=mock_table) + # Override app dependencies with mocks app.dependency_overrides[backend] = lambda: mock_backend - # Hit endpoint + # Hit endpoint with sources parameter (now a list) response = test_client.get( "/query", - params={"source": "foo", "return_leaf_id": False}, + params={"sources": ["foo"], "return_leaf_id": False}, ) # Process response @@ -334,7 +342,14 @@ def test_query(test_client: TestClient): # Check response assert response.status_code == 200 - assert table.schema.equals(SCHEMA_QUERY) + # Check that we have the right columns + assert len(table.schema) == 3 + assert table.schema.names == ["id", "key", "source"] + assert table.schema.field("id").type == pa.int64() + assert table.schema.field("key").type == pa.large_string() + assert table.schema.field("source").type.equals( + pa.dictionary(pa.uint32(), pa.large_string()) + ) def test_query_404_resolution(test_client: TestClient): @@ -347,7 +362,7 @@ def test_query_404_resolution(test_client: TestClient): # Hit endpoint response = test_client.get( "/query", - params={"source": "foo", "resolution": "bar", "return_leaf_id": True}, + params={"sources": ["foo"], "resolution": "bar", "return_leaf_id": True}, ) # Check response @@ -364,7 +379,7 @@ def test_query_404_source(test_client: TestClient): # Hit endpoint response = test_client.get( "/query", - params={"source": "foo", "return_leaf_id": True}, + params={"sources": ["foo"], "return_leaf_id": True}, ) # Check response diff --git a/test/server/postgresql/test_pg_sql.py b/test/server/postgresql/test_pg_sql.py index 5f6b7b77a..ae28dae26 100644 --- a/test/server/postgresql/test_pg_sql.py +++ b/test/server/postgresql/test_pg_sql.py @@ -1180,7 +1180,9 @@ class TestQueryFunction: def test_query_source_only(self, populated_postgres_db: MatchboxPostgres): """Should query source data without resolution.""" - result = query("source_a", resolution=None, threshold=None, limit=None) + result = query( + sources=["source_a"], resolution=None, threshold=None, limit=None + ) # Should return all keys from source_a with their cluster assignments assert result.shape[0] == 6 @@ -1206,7 +1208,9 @@ def test_query_source_only(self, populated_postgres_db: MatchboxPostgres): def test_query_source_b_only(self, populated_postgres_db: MatchboxPostgres): """Should query source_b which has one key per cluster.""" - result = query("source_b", resolution=None, threshold=None, limit=None) + result = query( + sources=["source_b"], resolution=None, threshold=None, limit=None + ) # Should return all keys from source_b assert result.shape[0] == 5 @@ -1225,7 +1229,9 @@ def test_query_source_b_only(self, populated_postgres_db: MatchboxPostgres): def test_query_through_deduper(self, populated_postgres_db: MatchboxPostgres): """Should query source through its deduper resolution.""" - result = query("source_a", resolution="dedupe_a", threshold=None, limit=None) + result = query( + sources=["source_a"], resolution="dedupe_a", threshold=None, limit=None + ) # Should return all 6 keys, but some mapped to dedupe clusters assert result.shape[0] == 6 @@ -1246,7 +1252,9 @@ def test_query_through_deduper_with_threshold( ): """Should query source through deduper with threshold override.""" # Test with threshold=90 (higher than dedupe_a's clusters) - result = query("source_a", resolution="dedupe_a", threshold=90, limit=None) + result = query( + sources=["source_a"], resolution="dedupe_a", threshold=90, limit=None + ) # Should return all 6 keys, but no dedupe clusters qualify assert result.shape[0] == 6 @@ -1261,7 +1269,9 @@ def test_query_through_deduper_with_threshold( def test_query_through_linker(self, populated_postgres_db: MatchboxPostgres): """Should query source through complex linker resolution.""" - result = query("source_a", resolution="linker_ab", threshold=None, limit=None) + result = query( + sources=["source_a"], resolution="linker_ab", threshold=None, limit=None + ) # Should return all 6 keys with linker cluster assignments assert result.shape[0] == 6 @@ -1294,8 +1304,12 @@ def test_query_both_sources_through_linker( self, populated_postgres_db: MatchboxPostgres ): """Should query both sources through linker with consistent results.""" - result_a = query("source_a", resolution="linker_ab", threshold=80, limit=None) - result_b = query("source_b", resolution="linker_ab", threshold=80, limit=None) + result_a = query( + sources=["source_a"], resolution="linker_ab", threshold=80, limit=None + ) + result_b = query( + sources=["source_b"], resolution="linker_ab", threshold=80, limit=None + ) # Both should return their respective key counts assert result_a.shape[0] == 6 # source_a keys @@ -1338,7 +1352,7 @@ def test_query_both_sources_through_linker( def test_query_with_limit(self, populated_postgres_db: MatchboxPostgres): """Should respect limit parameter.""" - result = query("source_a", resolution=None, threshold=None, limit=3) + result = query(sources=["source_a"], resolution=None, threshold=None, limit=3) # Should return only 3 rows assert result.shape[0] == 3 @@ -1350,7 +1364,9 @@ def test_query_multiple_keys_per_cluster_scenario( ): """Should handle case where multiple keys belong to same cluster.""" # This tests the scenario causing your test failure - result = query("source_a", resolution="dedupe_a", threshold=80, limit=None) + result = query( + sources=["source_a"], resolution="dedupe_a", threshold=80, limit=None + ) # source_a has 6 keys but some share clusters: # - keys 1,2 both in cluster 101 → both map to C301 diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py index 9c5de0530..cffadabd3 100644 --- a/test/server/test_adapter.py +++ b/test/server/test_adapter.py @@ -75,7 +75,7 @@ def test_validate_ids(self): crn_testkit = dag.sources.get("crn") df_crn = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution="naive_test.crn", ) @@ -92,7 +92,7 @@ def test_validate_hashes(self): crn_testkit = dag.sources.get("crn") df_crn = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution="naive_test.crn", ) @@ -112,7 +112,7 @@ def test_cluster_id_to_hash(self): crn_testkit = dag.sources.get("crn") df_crn = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution="naive_test.crn", ) @@ -294,7 +294,7 @@ def test_model_results_basic(self): # Query returns the same results as the testkit, showing # that processing was performed accurately res = self.backend.query( - source=dag.sources["crn"].source_config.name, + sources=[dag.sources["crn"].source_config.name], resolution="naive_test.crn", ) res_clusters = query_to_cluster_entities( @@ -350,7 +350,7 @@ def test_model_results_probabilistic(self): # Query returns the same results as the testkit, showing # that processing was performed accurately res = self.backend.query( - source=dag.sources["crn"].source_config.name, + sources=[dag.sources["crn"].source_config.name], resolution="probabilistic_test.crn", ) res_clusters = query_to_cluster_entities( @@ -553,14 +553,14 @@ def test_query_only_source(self): crn_testkit = dag.sources.get("crn") df_crn_sample = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], limit=10, ) assert isinstance(df_crn_sample, pa.Table) assert df_crn_sample.num_rows == 10 - df_crn_full = self.backend.query(source=crn_testkit.source_config.name) + df_crn_full = self.backend.query(sources=[crn_testkit.source_config.name]) assert df_crn_full.num_rows == crn_testkit.query.num_rows assert df_crn_full.schema.equals(SCHEMA_QUERY) @@ -571,7 +571,7 @@ def test_query_return_leaf_ids(self): crn_testkit = dag.sources.get("crn") df_crn_full = self.backend.query( - source=crn_testkit.source_config.name, return_leaf_id=True + sources=[crn_testkit.source_config.name], return_leaf_id=True ) assert df_crn_full.num_rows == crn_testkit.query.num_rows @@ -583,7 +583,7 @@ def test_query_with_dedupe_model(self): crn_testkit = dag.sources.get("crn") df_crn = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution="naive_test.crn", ) @@ -607,7 +607,7 @@ def test_query_with_link_model(self): duns_testkit = dag.sources.get("duns") df_crn = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution=linker_name, ) @@ -616,7 +616,7 @@ def test_query_with_link_model(self): assert df_crn.schema.equals(SCHEMA_QUERY) df_duns = self.backend.query( - source=duns_testkit.source_config.name, + sources=[duns_testkit.source_config.name], resolution=linker_name, ) @@ -644,7 +644,7 @@ def test_threshold_query_with_link_model(self): cdms_testkit = dag.sources.get("cdms") df_crn = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution=linker_name, ) @@ -653,7 +653,7 @@ def test_threshold_query_with_link_model(self): assert df_crn.schema.equals(SCHEMA_QUERY) df_cdms = self.backend.query( - source=cdms_testkit.source_config.name, + sources=[cdms_testkit.source_config.name], resolution=linker_name, ) @@ -667,12 +667,12 @@ def test_threshold_query_with_link_model(self): # Test query with threshold df_crn_threshold = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution=linker_name, threshold=100, ) df_cdms_threshold = self.backend.query( - source=cdms_testkit.source_config.name, + sources=[cdms_testkit.source_config.name], resolution=linker_name, threshold=100, ) @@ -889,7 +889,7 @@ def get_counts(): # Get some specific IDs to verify they're restored properly df_crn_before = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution="naive_test.crn", ) sample_ids_before = df_crn_before["id"].to_pylist()[:5] # Take first 5 IDs @@ -909,7 +909,7 @@ def get_counts(): # Verify specific data was restored correctly df_crn_after = self.backend.query( - source=crn_testkit.source_config.name, + sources=[crn_testkit.source_config.name], resolution="naive_test.crn", ) sample_ids_after = df_crn_after["id"].to_pylist()[:5] # Take first 5 IDs @@ -939,10 +939,10 @@ def test_insert_and_get_judgement(self): # Do some queries to find real source cluster IDs deduped_query = pl.from_arrow( - self.backend.query(source="crn", resolution="naive_test.crn") + self.backend.query(sources=["crn"], resolution="naive_test.crn") ) unique_ids = deduped_query["id"].unique() - all_leaves = pl.from_arrow(self.backend.query(source="crn")) + all_leaves = pl.from_arrow(self.backend.query(sources=["crn"])) def get_leaf_ids(cluster_id: int) -> list[int]: return ( @@ -1098,9 +1098,9 @@ def test_sample_for_eval(self): # We now look at more interesting cases # Query backend to form expectations resolution_clusters = pl.from_arrow( - self.backend.query(source="crn", resolution="naive_test.crn") + self.backend.query(sources=["crn"], resolution="naive_test.crn") ) - source_clusters = pl.from_arrow(self.backend.query(source="crn")) + source_clusters = pl.from_arrow(self.backend.query(sources=["crn"])) # We can request more than available assert len(resolution_clusters["id"].unique()) < 99 @@ -1111,8 +1111,11 @@ def test_sample_for_eval(self): assert samples_99.schema.equals(SCHEMA_EVAL_SAMPLES) # We can reconstruct the expected sample from resolution and source queries + # Select only needed columns from source_clusters to avoid column collision expected_sample = ( - resolution_clusters.join(source_clusters, on="key", suffix="_source") + resolution_clusters.join( + source_clusters.select(["id", "key"]), on="key", suffix="_source" + ) .rename({"id": "root", "id_source": "leaf"}) .with_columns(pl.lit("crn").alias("source")) ) diff --git a/uv.lock b/uv.lock index f25396101..d1f21b453 100644 --- a/uv.lock +++ b/uv.lock @@ -2756,27 +2756,28 @@ wheels = [ [[package]] name = "ruff" -version = "0.12.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/81/0bd3594fa0f690466e41bd033bdcdf86cba8288345ac77ad4afbe5ec743a/ruff-0.12.7.tar.gz", hash = "sha256:1fc3193f238bc2d7968772c82831a4ff69252f673be371fb49663f0068b7ec71", size = 5197814 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/d2/6cb35e9c85e7a91e8d22ab32ae07ac39cc34a71f1009a6f9e4a2a019e602/ruff-0.12.7-py3-none-linux_armv6l.whl", hash = "sha256:76e4f31529899b8c434c3c1dede98c4483b89590e15fb49f2d46183801565303", size = 11852189 }, - { url = "https://files.pythonhosted.org/packages/63/5b/a4136b9921aa84638f1a6be7fb086f8cad0fde538ba76bda3682f2599a2f/ruff-0.12.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:789b7a03e72507c54fb3ba6209e4bb36517b90f1a3569ea17084e3fd295500fb", size = 12519389 }, - { url = "https://files.pythonhosted.org/packages/a8/c9/3e24a8472484269b6b1821794141f879c54645a111ded4b6f58f9ab0705f/ruff-0.12.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2e1c2a3b8626339bb6369116e7030a4cf194ea48f49b64bb505732a7fce4f4e3", size = 11743384 }, - { url = "https://files.pythonhosted.org/packages/26/7c/458dd25deeb3452c43eaee853c0b17a1e84169f8021a26d500ead77964fd/ruff-0.12.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32dec41817623d388e645612ec70d5757a6d9c035f3744a52c7b195a57e03860", size = 11943759 }, - { url = "https://files.pythonhosted.org/packages/7f/8b/658798472ef260ca050e400ab96ef7e85c366c39cf3dfbef4d0a46a528b6/ruff-0.12.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47ef751f722053a5df5fa48d412dbb54d41ab9b17875c6840a58ec63ff0c247c", size = 11654028 }, - { url = "https://files.pythonhosted.org/packages/a8/86/9c2336f13b2a3326d06d39178fd3448dcc7025f82514d1b15816fe42bfe8/ruff-0.12.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a828a5fc25a3efd3e1ff7b241fd392686c9386f20e5ac90aa9234a5faa12c423", size = 13225209 }, - { url = "https://files.pythonhosted.org/packages/76/69/df73f65f53d6c463b19b6b312fd2391dc36425d926ec237a7ed028a90fc1/ruff-0.12.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5726f59b171111fa6a69d82aef48f00b56598b03a22f0f4170664ff4d8298efb", size = 14182353 }, - { url = "https://files.pythonhosted.org/packages/58/1e/de6cda406d99fea84b66811c189b5ea139814b98125b052424b55d28a41c/ruff-0.12.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:74e6f5c04c4dd4aba223f4fe6e7104f79e0eebf7d307e4f9b18c18362124bccd", size = 13631555 }, - { url = "https://files.pythonhosted.org/packages/6f/ae/625d46d5164a6cc9261945a5e89df24457dc8262539ace3ac36c40f0b51e/ruff-0.12.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0bfe4e77fba61bf2ccadf8cf005d6133e3ce08793bbe870dd1c734f2699a3e", size = 12667556 }, - { url = "https://files.pythonhosted.org/packages/55/bf/9cb1ea5e3066779e42ade8d0cd3d3b0582a5720a814ae1586f85014656b6/ruff-0.12.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06bfb01e1623bf7f59ea749a841da56f8f653d641bfd046edee32ede7ff6c606", size = 12939784 }, - { url = "https://files.pythonhosted.org/packages/55/7f/7ead2663be5627c04be83754c4f3096603bf5e99ed856c7cd29618c691bd/ruff-0.12.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e41df94a957d50083fd09b916d6e89e497246698c3f3d5c681c8b3e7b9bb4ac8", size = 11771356 }, - { url = "https://files.pythonhosted.org/packages/17/40/a95352ea16edf78cd3a938085dccc55df692a4d8ba1b3af7accbe2c806b0/ruff-0.12.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:4000623300563c709458d0ce170c3d0d788c23a058912f28bbadc6f905d67afa", size = 11612124 }, - { url = "https://files.pythonhosted.org/packages/4d/74/633b04871c669e23b8917877e812376827c06df866e1677f15abfadc95cb/ruff-0.12.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:69ffe0e5f9b2cf2b8e289a3f8945b402a1b19eff24ec389f45f23c42a3dd6fb5", size = 12479945 }, - { url = "https://files.pythonhosted.org/packages/be/34/c3ef2d7799c9778b835a76189c6f53c179d3bdebc8c65288c29032e03613/ruff-0.12.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:a07a5c8ffa2611a52732bdc67bf88e243abd84fe2d7f6daef3826b59abbfeda4", size = 12998677 }, - { url = "https://files.pythonhosted.org/packages/77/ab/aca2e756ad7b09b3d662a41773f3edcbd262872a4fc81f920dc1ffa44541/ruff-0.12.7-py3-none-win32.whl", hash = "sha256:c928f1b2ec59fb77dfdf70e0419408898b63998789cc98197e15f560b9e77f77", size = 11756687 }, - { url = "https://files.pythonhosted.org/packages/b4/71/26d45a5042bc71db22ddd8252ca9d01e9ca454f230e2996bb04f16d72799/ruff-0.12.7-py3-none-win_amd64.whl", hash = "sha256:9c18f3d707ee9edf89da76131956aba1270c6348bfee8f6c647de841eac7194f", size = 12912365 }, - { url = "https://files.pythonhosted.org/packages/4c/9b/0b8aa09817b63e78d94b4977f18b1fcaead3165a5ee49251c5d5c245bb2d/ruff-0.12.7-py3-none-win_arm64.whl", hash = "sha256:dfce05101dbd11833a0776716d5d1578641b7fddb537fe7fa956ab85d1769b69", size = 11982083 }, +version = "0.12.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/45/2e403fa7007816b5fbb324cb4f8ed3c7402a927a0a0cb2b6279879a8bfdc/ruff-0.12.9.tar.gz", hash = "sha256:fbd94b2e3c623f659962934e52c2bea6fc6da11f667a427a368adaf3af2c866a", size = 5254702 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/20/53bf098537adb7b6a97d98fcdebf6e916fcd11b2e21d15f8c171507909cc/ruff-0.12.9-py3-none-linux_armv6l.whl", hash = "sha256:fcebc6c79fcae3f220d05585229463621f5dbf24d79fdc4936d9302e177cfa3e", size = 11759705 }, + { url = "https://files.pythonhosted.org/packages/20/4d/c764ee423002aac1ec66b9d541285dd29d2c0640a8086c87de59ebbe80d5/ruff-0.12.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:aed9d15f8c5755c0e74467731a007fcad41f19bcce41cd75f768bbd687f8535f", size = 12527042 }, + { url = "https://files.pythonhosted.org/packages/8b/45/cfcdf6d3eb5fc78a5b419e7e616d6ccba0013dc5b180522920af2897e1be/ruff-0.12.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5b15ea354c6ff0d7423814ba6d44be2807644d0c05e9ed60caca87e963e93f70", size = 11724457 }, + { url = "https://files.pythonhosted.org/packages/72/e6/44615c754b55662200c48bebb02196dbb14111b6e266ab071b7e7297b4ec/ruff-0.12.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d596c2d0393c2502eaabfef723bd74ca35348a8dac4267d18a94910087807c53", size = 11949446 }, + { url = "https://files.pythonhosted.org/packages/fd/d1/9b7d46625d617c7df520d40d5ac6cdcdf20cbccb88fad4b5ecd476a6bb8d/ruff-0.12.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1b15599931a1a7a03c388b9c5df1bfa62be7ede6eb7ef753b272381f39c3d0ff", size = 11566350 }, + { url = "https://files.pythonhosted.org/packages/59/20/b73132f66f2856bc29d2d263c6ca457f8476b0bbbe064dac3ac3337a270f/ruff-0.12.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3d02faa2977fb6f3f32ddb7828e212b7dd499c59eb896ae6c03ea5c303575756", size = 13270430 }, + { url = "https://files.pythonhosted.org/packages/a2/21/eaf3806f0a3d4c6be0a69d435646fba775b65f3f2097d54898b0fd4bb12e/ruff-0.12.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:17d5b6b0b3a25259b69ebcba87908496e6830e03acfb929ef9fd4c58675fa2ea", size = 14264717 }, + { url = "https://files.pythonhosted.org/packages/d2/82/1d0c53bd37dcb582b2c521d352fbf4876b1e28bc0d8894344198f6c9950d/ruff-0.12.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:72db7521860e246adbb43f6ef464dd2a532ef2ef1f5dd0d470455b8d9f1773e0", size = 13684331 }, + { url = "https://files.pythonhosted.org/packages/3b/2f/1c5cf6d8f656306d42a686f1e207f71d7cebdcbe7b2aa18e4e8a0cb74da3/ruff-0.12.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a03242c1522b4e0885af63320ad754d53983c9599157ee33e77d748363c561ce", size = 12739151 }, + { url = "https://files.pythonhosted.org/packages/47/09/25033198bff89b24d734e6479e39b1968e4c992e82262d61cdccaf11afb9/ruff-0.12.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fc83e4e9751e6c13b5046d7162f205d0a7bac5840183c5beebf824b08a27340", size = 12954992 }, + { url = "https://files.pythonhosted.org/packages/52/8e/d0dbf2f9dca66c2d7131feefc386523404014968cd6d22f057763935ab32/ruff-0.12.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:881465ed56ba4dd26a691954650de6ad389a2d1fdb130fe51ff18a25639fe4bb", size = 12899569 }, + { url = "https://files.pythonhosted.org/packages/a0/bd/b614d7c08515b1428ed4d3f1d4e3d687deffb2479703b90237682586fa66/ruff-0.12.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:43f07a3ccfc62cdb4d3a3348bf0588358a66da756aa113e071b8ca8c3b9826af", size = 11751983 }, + { url = "https://files.pythonhosted.org/packages/58/d6/383e9f818a2441b1a0ed898d7875f11273f10882f997388b2b51cb2ae8b5/ruff-0.12.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:07adb221c54b6bba24387911e5734357f042e5669fa5718920ee728aba3cbadc", size = 11538635 }, + { url = "https://files.pythonhosted.org/packages/20/9c/56f869d314edaa9fc1f491706d1d8a47747b9d714130368fbd69ce9024e9/ruff-0.12.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f5cd34fabfdea3933ab85d72359f118035882a01bff15bd1d2b15261d85d5f66", size = 12534346 }, + { url = "https://files.pythonhosted.org/packages/bd/4b/d8b95c6795a6c93b439bc913ee7a94fda42bb30a79285d47b80074003ee7/ruff-0.12.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f6be1d2ca0686c54564da8e7ee9e25f93bdd6868263805f8c0b8fc6a449db6d7", size = 13017021 }, + { url = "https://files.pythonhosted.org/packages/c7/c1/5f9a839a697ce1acd7af44836f7c2181cdae5accd17a5cb85fcbd694075e/ruff-0.12.9-py3-none-win32.whl", hash = "sha256:cc7a37bd2509974379d0115cc5608a1a4a6c4bff1b452ea69db83c8855d53f93", size = 11734785 }, + { url = "https://files.pythonhosted.org/packages/fa/66/cdddc2d1d9a9f677520b7cfc490d234336f523d4b429c1298de359a3be08/ruff-0.12.9-py3-none-win_amd64.whl", hash = "sha256:6fb15b1977309741d7d098c8a3cb7a30bc112760a00fb6efb7abc85f00ba5908", size = 12840654 }, + { url = "https://files.pythonhosted.org/packages/ac/fd/669816bc6b5b93b9586f3c1d87cd6bc05028470b3ecfebb5938252c47a35/ruff-0.12.9-py3-none-win_arm64.whl", hash = "sha256:63c8c819739d86b96d500cce885956a1a48ab056bbcbc61b747ad494b2485089", size = 11949623 }, ] [[package]]