From eb567cf7883f369e95ead447079014ea77ac7c7b Mon Sep 17 00:00:00 2001
From: codehex083
Date: Sun, 7 Jun 2026 04:46:14 +0900
Subject: [PATCH 1/2] Fix: Fix llama-stack startup as non-root user

Mount the seeded RAG kvstore read-only at /opt/app-root/rag-seed and have
the container entrypoints copy its *.db files into
/opt/app-root/src/.llama/storage/rag, so the runtime database is owned by
the container user and is writable regardless of the host UID.

Library mode gets a new wrapper, scripts/lightspeed-stack-entrypoint.sh,
which seeds the kvstore and then execs lightspeed-stack with the venv
interpreter. Server mode performs the same seeding at the top of
scripts/llama-stack-entrypoint.sh.

Seeding is skipped (with a log line) when the seed directory contains no
*.db files: an unmatched glob would otherwise be passed to cp literally,
cp would fail, and `set -e` would abort container startup.

---
Note: the blob ids on the `index` lines below were not recomputed after the
empty-seed-directory guard was added to the two entrypoint scripts.

 docker-compose-library.yaml            |  9 ++++++-
 docker-compose.yaml                    |  5 +++-
 scripts/lightspeed-stack-entrypoint.sh | 36 ++++++++++++++++++++++++++
 scripts/llama-stack-entrypoint.sh      | 26 +++++++++++++++++++
 4 files changed, 74 insertions(+), 2 deletions(-)
 create mode 100755 scripts/lightspeed-stack-entrypoint.sh

diff --git a/docker-compose-library.yaml b/docker-compose-library.yaml
index e268e4aef..98c38459e 100755
--- a/docker-compose-library.yaml
+++ b/docker-compose-library.yaml
@@ -6,6 +6,8 @@ services:
       dockerfile: deploy/lightspeed-stack/Containerfile
     platform: linux/amd64
     container_name: lightspeed-stack
+    # Wrapper seeds the RAG kvstore into a writable path before starting the app.
+    entrypoint: ["/bin/bash", "/app-root/lightspeed-stack-entrypoint.sh"]
     ports:
       - "8080:8080"
     depends_on:
@@ -18,7 +20,12 @@ services:
       - ./lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z
       - ./run.yaml:/app-root/run.yaml:Z
       - ${GCP_KEYS_PATH:-./tmp/.gcp-keys-dummy}:/opt/app-root/.gcp-keys:ro
-      - ./tests/e2e/rag:/opt/app-root/src/.llama/storage/rag:Z
+      # Deliver the seeded RAG kvstore read-only; the entrypoint copies it into
+      # a writable location so the embedded llama-stack can write to it as a
+      # non-root user (the registry shares this kvstore and must be writable).
+      - ./tests/e2e/rag:/opt/app-root/rag-seed:ro,Z
+      # Host copy so `docker compose up` picks up script changes without rebuilding
+      - ./scripts/lightspeed-stack-entrypoint.sh:/app-root/lightspeed-stack-entrypoint.sh:ro,z
       - ./tests/e2e/secrets/mcp-token:/tmp/mcp-token:ro,z
       - ./tests/e2e/secrets/invalid-mcp-token:/tmp/invalid-mcp-token:ro,z
     environment:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index d65f7158c..6197640ce 100755
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -19,7 +19,10 @@ services:
       - ${GCP_KEYS_PATH:-./tmp/.gcp-keys-dummy}:/opt/app-root/.gcp-keys:ro
       - ./lightspeed-stack.yaml:/opt/app-root/lightspeed-stack.yaml:ro,z
       - llama-storage:/opt/app-root/src/.llama/storage
-      - ./tests/e2e/rag:/opt/app-root/src/.llama/storage/rag:z
+      # Deliver the seeded RAG kvstore read-only; the entrypoint copies it into
+      # the writable storage volume so llama-stack can write to it as a non-root
+      # user (the registry shares this kvstore and must be writable at startup).
+      - ./tests/e2e/rag:/opt/app-root/rag-seed:ro,z
       - mock-tls-certs:/certs:ro
     environment:
       - BRAVE_SEARCH_API_KEY=${BRAVE_SEARCH_API_KEY:-}
diff --git a/scripts/lightspeed-stack-entrypoint.sh b/scripts/lightspeed-stack-entrypoint.sh
new file mode 100755
index 000000000..56228acc7
--- /dev/null
+++ b/scripts/lightspeed-stack-entrypoint.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Entrypoint for the library-mode lightspeed-stack container.
+# Seeds the RAG kvstore into a writable location, then starts lightspeed-stack.
+
+set -e
+
+# Seed the RAG kvstore into the writable storage volume.
+#
+# The seed db is mounted read-only from the host (owned by the host user), but
+# the embedded llama-stack runs as a non-root user and must write to this
+# kvstore at startup (the resource registry shares it). Copying it into the
+# storage tree makes the runtime db owned by the container user, so it is
+# writable regardless of the host UID. See run.yaml -> storage.backends.kv_default.
+RAG_SEED_DIR="${RAG_SEED_DIR:-/opt/app-root/rag-seed}"
+STORAGE_RAG_DIR="${STORAGE_RAG_DIR:-/opt/app-root/src/.llama/storage/rag}"
+if [ -d "$RAG_SEED_DIR" ]; then
+    # Expand the glob into an array with nullglob so that a seed directory
+    # without any *.db file yields zero entries instead of a literal "*.db",
+    # which would make cp fail and, under `set -e`, abort startup.
+    shopt -s nullglob
+    seed_dbs=("$RAG_SEED_DIR"/*.db)
+    shopt -u nullglob
+    if [ "${#seed_dbs[@]}" -gt 0 ]; then
+        echo "Seeding RAG kvstore from $RAG_SEED_DIR into $STORAGE_RAG_DIR..."
+        mkdir -p "$STORAGE_RAG_DIR"
+        cp -f "${seed_dbs[@]}" "$STORAGE_RAG_DIR"/
+    else
+        echo "No *.db files found in $RAG_SEED_DIR; skipping RAG kvstore seeding."
+    fi
+fi
+
+# Use the venv interpreter explicitly: overriding the image entrypoint changes
+# PATH ordering, so a bare `python3.12` may resolve to the system interpreter
+# (without the app's dependencies) instead of the venv at /app-root/.venv.
+exec /app-root/.venv/bin/python3.12 src/lightspeed_stack.py "$@"
diff --git a/scripts/llama-stack-entrypoint.sh b/scripts/llama-stack-entrypoint.sh
index e3360c3b6..feadc82b6 100755
--- a/scripts/llama-stack-entrypoint.sh
+++ b/scripts/llama-stack-entrypoint.sh
@@ -8,6 +8,32 @@ INPUT_CONFIG="${LLAMA_STACK_CONFIG:-/opt/app-root/run.yaml}"
 ENRICHED_CONFIG="/tmp/enriched-run.yaml"
 LIGHTSPEED_CONFIG="${LIGHTSPEED_CONFIG:-/opt/app-root/lightspeed-stack.yaml}"
 
+# Seed the RAG kvstore into the writable storage volume.
+#
+# The seed db is mounted read-only from the host (owned by the host user), but
+# llama-stack runs as a non-root user and must write to this kvstore at startup
+# (the resource registry shares it). Copying it into the storage volume makes
+# the runtime db owned by the container user, so it is writable regardless of
+# the host UID. See run.yaml -> storage.backends.kv_default.
+RAG_SEED_DIR="${RAG_SEED_DIR:-/opt/app-root/rag-seed}"
+STORAGE_RAG_DIR="${STORAGE_RAG_DIR:-/opt/app-root/src/.llama/storage/rag}"
+if [ -d "$RAG_SEED_DIR" ]; then
+    # Expand the glob into an array with nullglob so that a seed directory
+    # without any *.db file yields zero entries instead of a literal "*.db",
+    # which would make cp fail and, under `set -e`, abort startup.
+    shopt -s nullglob
+    seed_dbs=("$RAG_SEED_DIR"/*.db)
+    shopt -u nullglob
+    if [ "${#seed_dbs[@]}" -gt 0 ]; then
+        echo "Seeding RAG kvstore from $RAG_SEED_DIR into $STORAGE_RAG_DIR..."
+        mkdir -p "$STORAGE_RAG_DIR"
+        cp -f "${seed_dbs[@]}" "$STORAGE_RAG_DIR"/
+    else
+        echo "No *.db files found in $RAG_SEED_DIR; skipping RAG kvstore seeding."
+    fi
+fi
+
 # Enrich config if lightspeed config exists
 if [ -f "$LIGHTSPEED_CONFIG" ]; then
     echo "Enriching llama-stack config..."
From bf7294008e4cdac6fb04e3c49118a45b6c8525fb Mon Sep 17 00:00:00 2001
From: codehex083
Date: Wed, 10 Jun 2026 10:34:24 +0900
Subject: [PATCH 2/2] LCORE-1859: Prevent topic summary failure from aborting streamed response

generate_response() awaits get_topic_summary() after the answer has
already been streamed, so an exception raised there can no longer be
turned into a clean HTTP error response. Wrap the call in
try/except Exception and report the failure, together with the request
id, through logger.exception instead of letting it propagate.

Add a unit test in which get_topic_summary raises HTTPException; it
asserts that the stream still yields items and that store_query_results
is called exactly once with topic_summary=None.

---
 src/app/endpoints/streaming_query.py          | 22 ++++--
 .../app/endpoints/test_streaming_query.py     | 67 +++++++++++++++++++
 2 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py
index c88fb03dd..2a3a3f7db 100644
--- a/src/app/endpoints/streaming_query.py
+++ b/src/app/endpoints/streaming_query.py
@@ -644,11 +644,23 @@ async def generate_response(
         should_generate = context.query_request.generate_topic_summary
         if should_generate:
             logger.debug("Generating topic summary for new conversation")
-            topic_summary = await get_topic_summary(
-                context.query_request.query,
-                context.client,
-                responses_params.model,
-            )
+            # The stream has already started here, so a raised exception cannot
+            # be turned into a clean HTTP error response ("response already
+            # started"). Topic summaries are non-essential metadata, so failures
+            # (e.g. context-length overflow on a very large query) are caught and
+            # logged rather than aborting the already-streamed answer.
+            try:
+                topic_summary = await get_topic_summary(
+                    context.query_request.query,
+                    context.client,
+                    responses_params.model,
+                )
+            except Exception:  # pylint: disable=broad-except
+                logger.exception(
+                    "Failed to generate topic summary for new conversation, "
+                    "request %s",
+                    context.request_id,
+                )
 
     # Consume tokens
     logger.info("Consuming tokens")
diff --git a/tests/unit/app/endpoints/test_streaming_query.py b/tests/unit/app/endpoints/test_streaming_query.py
index 19176d57f..465ad90ad 100644
--- a/tests/unit/app/endpoints/test_streaming_query.py
+++ b/tests/unit/app/endpoints/test_streaming_query.py
@@ -1216,6 +1216,73 @@ async def mock_generator() -> AsyncIterator[str]:
 
         assert len(result) > 0
 
+    @pytest.mark.asyncio
+    async def test_generate_response_topic_summary_failure_does_not_abort_stream(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test stream completes when post-stream topic summary generation fails.
+
+        The topic summary is generated after the stream has already started, so a
+        raised exception must be caught and logged rather than propagated (which
+        would otherwise trigger "response already started").
+        """
+
+        async def mock_generator() -> AsyncIterator[str]:
+            yield "data: token\n\n"
+
+        mock_context = mocker.Mock(spec=ResponseGeneratorContext)
+        mock_context.conversation_id = "conv_123"
+        mock_context.user_id = "user_123"
+        mock_context.vector_store_ids = []
+        mock_context.rag_id_mapping = {}
+        mock_context.inline_rag_context = RAGContext()
+        mock_context.query_request = QueryRequest(
+            query="test", generate_topic_summary=True
+        )  # pyright: ignore[reportCallIssue]
+        mock_context.started_at = "2024-01-01T00:00:00Z"
+        mock_context.skip_userid_check = False
+        mock_context.request_id = "123e4567-e89b-12d3-a456-426614174000"
+        mock_context.client = mocker.AsyncMock(spec=AsyncLlamaStackClient)
+
+        mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
+        mock_responses_params.model = "provider1/model1"
+
+        mock_turn_summary = TurnSummary()
+        mock_turn_summary.token_usage = TokenCounter(input_tokens=10, output_tokens=5)
+
+        mock_config = mocker.Mock()
+        mock_config.quota_limiters = []
+        mocker.patch("app.endpoints.streaming_query.configuration", mock_config)
+        mocker.patch("app.endpoints.streaming_query.consume_query_tokens")
+        mocker.patch(
+            "app.endpoints.streaming_query.get_available_quotas", return_value={}
+        )
+        mocker.patch(
+            "app.endpoints.streaming_query.get_topic_summary",
+            new=mocker.AsyncMock(
+                side_effect=HTTPException(status_code=500, detail="context length")
+            ),
+        )
+        store_results_mock = mocker.patch(
+            "app.endpoints.streaming_query.store_query_results"
+        )
+
+        result = []
+        async for item in generate_response(
+            mock_generator(),
+            mock_context,
+            mock_responses_params,
+            mock_turn_summary,
+        ):
+            result.append(item)
+
+        # The stream still completes (post-stream side effects run) and the
+        # topic summary failure does not propagate.
+        assert len(result) > 0
+        store_results_mock.assert_called_once()
+        call_kwargs = store_results_mock.call_args.kwargs
+        assert call_kwargs["topic_summary"] is None
+
     @pytest.mark.asyncio
     async def test_generate_response_connection_error(
         self, mocker: MockerFixture