lightspeed-core · CodeHex083 · Jun 6, 2026 · Jun 10, 2026 · coderabbitai · Jun 6, 2026
diff --git a/docker-compose-library.yaml b/docker-compose-library.yaml
@@ -6,6 +6,8 @@ services:
       dockerfile: deploy/lightspeed-stack/Containerfile
     platform: linux/amd64
     container_name: lightspeed-stack
+    # Wrapper seeds the RAG kvstore into a writable path before starting the app.
+    entrypoint: ["/bin/bash", "/app-root/lightspeed-stack-entrypoint.sh"]
     ports:
       - "8080:8080"
     depends_on:
@@ -18,7 +20,12 @@ services:
       - ./lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z
       - ./run.yaml:/app-root/run.yaml:Z
       - ${GCP_KEYS_PATH:-./tmp/.gcp-keys-dummy}:/opt/app-root/.gcp-keys:ro
-      - ./tests/e2e/rag:/opt/app-root/src/.llama/storage/rag:Z
+      # Deliver the seeded RAG kvstore read-only; the entrypoint copies it into
+      # a writable location so the embedded llama-stack can write to it as a
+      # non-root user (the registry shares this kvstore and must be writable).
+      - ./tests/e2e/rag:/opt/app-root/rag-seed:ro,Z
+      # Host copy so `docker compose up` picks up script changes without rebuilding
+      - ./scripts/lightspeed-stack-entrypoint.sh:/app-root/lightspeed-stack-entrypoint.sh:ro,z
       - ./tests/e2e/secrets/mcp-token:/tmp/mcp-token:ro,z
       - ./tests/e2e/secrets/invalid-mcp-token:/tmp/invalid-mcp-token:ro,z
     environment:

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -19,7 +19,10 @@ services:
       - ${GCP_KEYS_PATH:-./tmp/.gcp-keys-dummy}:/opt/app-root/.gcp-keys:ro
       - ./lightspeed-stack.yaml:/opt/app-root/lightspeed-stack.yaml:ro,z
       - llama-storage:/opt/app-root/src/.llama/storage
-      - ./tests/e2e/rag:/opt/app-root/src/.llama/storage/rag:z
+      # Deliver the seeded RAG kvstore read-only; the entrypoint copies it into
+      # the writable storage volume so llama-stack can write to it as a non-root
+      # user (the registry shares this kvstore and must be writable at startup).
+      - ./tests/e2e/rag:/opt/app-root/rag-seed:ro,z
       - mock-tls-certs:/certs:ro
     environment:
       - BRAVE_SEARCH_API_KEY=${BRAVE_SEARCH_API_KEY:-}

diff --git a/scripts/lightspeed-stack-entrypoint.sh b/scripts/lightspeed-stack-entrypoint.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Entrypoint for the library-mode lightspeed-stack container.
+# Seeds the RAG kvstore into a writable location, then starts lightspeed-stack.
+
+set -e
+
+# Seed the RAG kvstore into the writable storage volume.
+#
+# The seed db is mounted read-only from the host (owned by the host user), but
+# the embedded llama-stack runs as a non-root user and must write to this
+# kvstore at startup (the resource registry shares it). Copying it into the
+# storage tree makes the runtime db owned by the container user, so it is
+# writable regardless of the host UID. See run.yaml -> storage.backends.kv_default.
+RAG_SEED_DIR="${RAG_SEED_DIR:-/opt/app-root/rag-seed}"
+STORAGE_RAG_DIR="${STORAGE_RAG_DIR:-/opt/app-root/src/.llama/storage/rag}"
+if [ -d "$RAG_SEED_DIR" ]; then
+    echo "Seeding RAG kvstore from $RAG_SEED_DIR into $STORAGE_RAG_DIR..."
+    mkdir -p "$STORAGE_RAG_DIR"
+    # An unmatched glob is passed to cp literally; fail early with a clear message.
+    if ! compgen -G "$RAG_SEED_DIR/*.db" > /dev/null; then
+        echo "ERROR: No .db files found in $RAG_SEED_DIR" >&2
+        exit 1
+    fi
+    cp -f "$RAG_SEED_DIR"/*.db "$STORAGE_RAG_DIR"/
+fi
+
+# Use the venv interpreter explicitly: overriding the image entrypoint changes
+# PATH ordering, so a bare `python3.12` may resolve to the system interpreter
+# (without the app's dependencies) instead of the venv at /app-root/.venv.
+exec /app-root/.venv/bin/python3.12 src/lightspeed_stack.py "$@"
diff --git a/scripts/llama-stack-entrypoint.sh b/scripts/llama-stack-entrypoint.sh
@@ -8,6 +8,27 @@ INPUT_CONFIG="${LLAMA_STACK_CONFIG:-/opt/app-root/run.yaml}"
 ENRICHED_CONFIG="/tmp/enriched-run.yaml"
 LIGHTSPEED_CONFIG="${LIGHTSPEED_CONFIG:-/opt/app-root/lightspeed-stack.yaml}"
 
+# Seed the RAG kvstore into the writable storage volume.
+#
+# The seed db is mounted read-only from the host (owned by the host user), but
+# llama-stack runs as a non-root user and must write to this kvstore at startup
+# (the resource registry shares it). Copying it into the storage volume makes
+# the runtime db owned by the container user, so it is writable regardless of
+# the host UID. See run.yaml -> storage.backends.kv_default.
+RAG_SEED_DIR="${RAG_SEED_DIR:-/opt/app-root/rag-seed}"
+STORAGE_RAG_DIR="${STORAGE_RAG_DIR:-/opt/app-root/src/.llama/storage/rag}"
+if [ -d "$RAG_SEED_DIR" ]; then
+    echo "Seeding RAG kvstore from $RAG_SEED_DIR into $STORAGE_RAG_DIR..."
+    mkdir -p "$STORAGE_RAG_DIR"
+    # Fail early with a clear message when the seed dir holds no .db files.
+    # NOTE(review): compgen is a bash builtin - confirm this script runs under bash.
+    if ! compgen -G "$RAG_SEED_DIR/*.db" > /dev/null; then
+        echo "ERROR: No .db files found in $RAG_SEED_DIR" >&2
+        exit 1
+    fi
+    cp -f "$RAG_SEED_DIR"/*.db "$STORAGE_RAG_DIR"/
+fi
+
 # Enrich config if lightspeed config exists
 if [ -f "$LIGHTSPEED_CONFIG" ]; then
     echo "Enriching llama-stack config..."

diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py
@@ -644,11 +644,23 @@ async def generate_response(
         should_generate = context.query_request.generate_topic_summary
         if should_generate:
             logger.debug("Generating topic summary for new conversation")
-            topic_summary = await get_topic_summary(
-                context.query_request.query,
-                context.client,
-                responses_params.model,
-            )
+            # The stream has already started here, so a raised exception cannot
+            # be turned into a clean HTTP error response ("response already
+            # started"). Topic summaries are non-essential metadata, so failures
+            # (e.g. context-length overflow on a very large query) are caught and
+            # logged rather than aborting the already-streamed answer.
+            try:
+                topic_summary = await get_topic_summary(
+                    context.query_request.query,
+                    context.client,
+                    responses_params.model,
+                )
+            except Exception:  # pylint: disable=broad-except
+                logger.exception(
+                    "Failed to generate topic summary for new conversation, "
+                    "request %s",
+                    context.request_id,
+                )
 
     # Consume tokens
     logger.info("Consuming tokens")

diff --git a/tests/unit/app/endpoints/test_streaming_query.py b/tests/unit/app/endpoints/test_streaming_query.py
@@ -1216,6 +1216,73 @@ async def mock_generator() -> AsyncIterator[str]:
 
         assert len(result) > 0
 
+    @pytest.mark.asyncio
+    async def test_generate_response_topic_summary_failure_does_not_abort_stream(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test stream completes when post-stream topic summary generation fails.
+
+        The topic summary is generated after the stream has already started, so a
+        raised exception must be caught and logged rather than propagated (which
+        would otherwise trigger "response already started").
+        """
+
+        async def mock_generator() -> AsyncIterator[str]:
+            yield "data: token\n\n"
+
+        mock_context = mocker.Mock(spec=ResponseGeneratorContext)
+        mock_context.conversation_id = "conv_123"
+        mock_context.user_id = "user_123"
+        mock_context.vector_store_ids = []
+        mock_context.rag_id_mapping = {}
+        mock_context.inline_rag_context = RAGContext()
+        mock_context.query_request = QueryRequest(
+            query="test", generate_topic_summary=True
+        )  # pyright: ignore[reportCallIssue]
+        mock_context.started_at = "2024-01-01T00:00:00Z"
+        mock_context.skip_userid_check = False
+        mock_context.request_id = "123e4567-e89b-12d3-a456-426614174000"
+        mock_context.client = mocker.AsyncMock(spec=AsyncLlamaStackClient)
+
+        mock_responses_params = mocker.Mock(spec=ResponsesApiParams)
+        mock_responses_params.model = "provider1/model1"
+
+        mock_turn_summary = TurnSummary()
+        mock_turn_summary.token_usage = TokenCounter(input_tokens=10, output_tokens=5)
+
+        mock_config = mocker.Mock()
+        mock_config.quota_limiters = []
+        mocker.patch("app.endpoints.streaming_query.configuration", mock_config)
+        mocker.patch("app.endpoints.streaming_query.consume_query_tokens")
+        mocker.patch(
+            "app.endpoints.streaming_query.get_available_quotas", return_value={}
+        )
+        mocker.patch(
+            "app.endpoints.streaming_query.get_topic_summary",
+            new=mocker.AsyncMock(
+                side_effect=HTTPException(status_code=500, detail="context length")
+            ),
+        )
+        store_results_mock = mocker.patch(
+            "app.endpoints.streaming_query.store_query_results"
+        )
+
+        result = []
+        async for item in generate_response(
+            mock_generator(),
+            mock_context,
+            mock_responses_params,
+            mock_turn_summary,
+        ):
+            result.append(item)
+
+        # The stream still completes (post-stream side effects run) and the
+        # topic summary failure does not propagate.
+        assert len(result) > 0
+        store_results_mock.assert_called_once()
+        call_kwargs = store_results_mock.call_args.kwargs
+        assert call_kwargs["topic_summary"] is None
+
     @pytest.mark.asyncio
     async def test_generate_response_connection_error(
         self, mocker: MockerFixture