Merged

Changes from 18 commits (23 commits total)
- 906f32b feat: add vibecoded but working convert_json_jsonld.py (rmfranken, Nov 11, 2025)
- 187aa55 fix: support reverse mapping orgs and people too (rmfranken, Nov 12, 2025)
- 0923241 Merge branch 'develop' into json-to-rdf (rmfranken, Nov 12, 2025)
- 7bf27a7 feat: markdown only as context (caviri, Nov 13, 2025)
- 0145065 feat: add uploading and conversion scripting + readme. Update convers… (rmfranken, Nov 13, 2025)
- aa7b79b feat: large repositories (caviri, Nov 13, 2025)
- e369a30 feat: pre-commit + gemini on devcontainer (caviri, Nov 13, 2025)
- 21cedb0 feat: cache storage based on workspace (caviri, Nov 13, 2025)
- 760f63e feat: encrypted hash & gemini conf (caviri, Nov 13, 2025)
- a96eb5e feat: EPFL RCP and simplified agents workflow (caviri, Nov 14, 2025)
- ba6cf2a feat: simplify pydantic conversion (caviri, Nov 14, 2025)
- 803512a feat: simple data model & affiliation object (caviri, Nov 17, 2025)
- 4559123 feat: source in Person and Organization (caviri, Nov 17, 2025)
- 979bf9d feat: repository pipeline simplified (caviri, Nov 17, 2025)
- 74fff65 docs: rules updated (caviri, Nov 17, 2025)
- c8e5155 feat: Improved token estimation (caviri, Nov 17, 2025)
- 44a5578 feat: organization endpoint refactor (caviri, Nov 17, 2025)
- 55a7c5b Merge branch 'feat--optimizing-repo-files' into json-to-rdf (caviri, Nov 18, 2025)
- de3b501 feat: update data directory path and enhance JSON-LD conversion handling (rmfranken, Nov 19, 2025)
- 0df57dc bug: fix for users id & devcontainer (caviri, Nov 24, 2025)
- 380f25f fix: deleting unused env var. (caviri, Feb 16, 2026)
- 6a5c186 feat: add agent operating guide and ORCID normalization utilities (caviri, Feb 16, 2026)
- 2a8dcc8 chore: remove Python 3.9 classifier from pyproject.toml (caviri, Feb 16, 2026)
127 changes: 87 additions & 40 deletions .cursor/rules/academic-catalog-enrichment.mdc
@@ -16,7 +16,7 @@ The Academic Catalog Enrichment system provides integration with academic reposi
┌─────────────────────────────────────────────────────────────────┐
- AcademicCatalogEnrichmentResult
+ linkedEntitiesEnrichmentResult
│ ┌────────────────┬──────────────────┬──────────────────────┐ │
│ │ repository_ │ author_relations │ organization_ │ │
│ │ relations │ Dict[str, List] │ relations │ │
@@ -36,13 +36,13 @@ The Academic Catalog Enrichment system provides integration with academic reposi
## Data Models

### Location
- - **Path**: `src/data_models/academic_catalog.py`
+ - **Path**: `src/data_models/linked_entities.py`

### Key Models

- #### 1. AcademicCatalogRelation
+ #### 1. linkedEntitiesRelation
```python
- class AcademicCatalogRelation(BaseModel):
+ class linkedEntitiesRelation(BaseModel):
"""A single relation to an academic catalog entity."""

catalogType: CatalogType # "infoscience", "openalex", "epfl_graph"
@@ -54,19 +54,19 @@ class AcademicCatalogRelation(BaseModel):
# Note: externalId field has been removed
```

- #### 2. AcademicCatalogEnrichmentResult (Structured Output)
+ #### 2. linkedEntitiesEnrichmentResult (Structured Output)
```python
- class AcademicCatalogEnrichmentResult(BaseModel):
+ class linkedEntitiesEnrichmentResult(BaseModel):
"""Organized results by what was searched for."""

# Publications about the repository/project itself
- repository_relations: List[AcademicCatalogRelation] = []
+ repository_relations: List[linkedEntitiesRelation] = []

# Keyed by exact author name provided
- author_relations: Dict[str, List[AcademicCatalogRelation]] = {}
+ author_relations: Dict[str, List[linkedEntitiesRelation]] = {}

# Keyed by exact organization name provided
- organization_relations: Dict[str, List[AcademicCatalogRelation]] = {}
+ organization_relations: Dict[str, List[linkedEntitiesRelation]] = {}

# Metadata
searchStrategy: Optional[str] = None
@@ -102,7 +102,7 @@ result = agent.run(prompt, authors=["Alexander Mathis", ...])
}
}
# Direct assignment:
- author.academicCatalogRelations = result.author_relations[author.name]
+ author.linkedEntities = result.author_relations[author.name]
```

### Agent Responsibilities
@@ -124,8 +124,8 @@ Python code is responsible for:

### 1. Agent Call
```python
- # src/agents/academic_catalog_enrichment.py
- async def enrich_repository_academic_catalog(
+ # src/agents/linked_entities_enrichment.py
+ async def enrich_repository_linked_entities(
repository_url: str,
repository_name: str,
description: str,
```

@@ -161,9 +161,9 @@ async def enrich_repository_academic_catalog(
### 3. Direct Assignment
```python
# src/analysis/repositories.py
- async def run_academic_catalog_enrichment(self):
+ async def run_linked_entities_enrichment(self):
# Call agent
- result = await enrich_repository_academic_catalog(
+ result = await enrich_repository_linked_entities(
repository_url=self.full_path,
repository_name=repository_name,
authors=author_names, # ["Alexander Mathis", ...]
@@ -173,21 +173,21 @@ async def run_academic_catalog_enrichment(self):
enrichment_data = result.get("data")

# 1. Repository-level
- self.data.academicCatalogRelations = enrichment_data.repository_relations
+ self.data.linkedEntities = enrichment_data.repository_relations

# 2. Author-level (direct lookup by name)
for author in self.data.author:
if author.name in enrichment_data.author_relations:
- author.academicCatalogRelations = enrichment_data.author_relations[author.name]
+ author.linkedEntities = enrichment_data.author_relations[author.name]
else:
- author.academicCatalogRelations = []
+ author.linkedEntities = []

# 3. Organization-level (direct lookup by name)
for org in self.data.author:
if org.legalName in enrichment_data.organization_relations:
- org.academicCatalogRelations = enrichment_data.organization_relations[org.legalName]
+ org.linkedEntities = enrichment_data.organization_relations[org.legalName]
else:
- org.academicCatalogRelations = []
+ org.linkedEntities = []
```

**Key Points**:
@@ -309,10 +309,10 @@ return InfoscienceAuthor(

4. **Agent extracts from markdown**:
- Agent prompt explicitly instructs: "Extract UUID from '*UUID:* <uuid>' in markdown"
- - Agent populates `AcademicCatalogRelation.uuid` field
+ - Agent populates `linkedEntitiesRelation.uuid` field
- Agent populates `entity.uuid` in the full entity object

- **Chain of custody**: API → Parser → Pydantic Model → Markdown → Agent → AcademicCatalogRelation
+ **Chain of custody**: API → Parser → Pydantic Model → Markdown → Agent → linkedEntitiesRelation

#### 3. Markdown as Transport Layer
Since tools return markdown (not structured data), markdown must include ALL critical fields:
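As an illustration of why every critical field must survive the markdown round-trip, here is a minimal sketch of a tool rendering a hit and the UUID being recovered downstream. The helper names and the exact markdown layout (beyond the `*UUID:* <uuid>` convention quoted above) are assumptions, not the project's actual tool code.

```python
import re
from typing import Optional

def render_publication_md(title: str, uuid: str, url: str) -> str:
    # Hypothetical renderer: every critical field must survive the
    # round-trip through markdown, or the agent cannot populate it.
    return f"### {title}\n*UUID:* {uuid}\n*URL:* {url}\n"

def extract_uuid(markdown: str) -> Optional[str]:
    # Mirrors the agent instruction: extract the UUID from '*UUID:* <uuid>'.
    match = re.search(r"\*UUID:\*\s*(\S+)", markdown)
    return match.group(1) if match else None
```

If a field is dropped at render time, it comes back as `null` in the structured output, which is exactly the failure mode described under Common Issues.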
@@ -466,42 +466,69 @@ ENV_VAR_MAPPINGS = {
| LLM Analysis | `run_llm_analysis` | o4-mini | Main repository analysis |
| User Enrichment | `run_user_enrichment` | o4-mini | Author enrichment with ORCID |
| Org Enrichment | `run_organization_enrichment` | o4-mini | ROR matching |
- | Academic Catalog | `run_academic_catalog_enrichment` | o4-mini | Infoscience searches (tool-heavy) |
+ | Academic Catalog | `run_linked_entities_searcher` | o4-mini | Infoscience searches (tool-heavy, repository-level only) |
| EPFL Assessment | `run_epfl_assessment` | o4-mini | Final holistic assessment |
| Repository Classifier | `run_repository_classifier` | o4-mini | Repository type and discipline classification |
| Organization Identifier | `run_organization_identifier` | o4-mini | Organization identification |

## Integration into Analysis Pipeline

### Repository Analysis Flow

```python
# src/analysis/repositories.py
- async def run_analysis(self):
+ async def run_analysis(self, run_author_linked_entities: bool = False):
# 1. Extract metadata with GIMIE
await self.run_gimie()

- # 2. LLM analysis (main agent)
- await self.run_llm_analysis()
+ # 2. Atomic LLM pipeline (stages 1-5)
+ await self.run_atomic_llm_pipeline()
+ # Stage 1: Context compiler
+ # Stage 2: Structured output
+ # Stage 3: Repository classifier
+ # Stage 4: Organization identifier
+ # Stage 5: Linked entities searcher (repository-level only)

# 3. ORCID enrichment (no LLM)
self.run_authors_enrichment()

- # 4. Organization enrichment (ROR agent)
+ # 4. User enrichment (optional)
+ await self.run_user_enrichment()

+ # 5. Organization enrichment (optional)
await self.run_organization_enrichment()

- # 5. User enrichment (author agent)
- await self.run_user_enrichment()
+ # 6. Academic catalog enrichment (repository-level - runs in atomic pipeline)
+ # Already completed in Stage 5 of atomic pipeline

- # 6. Academic catalog enrichment (NEW!)
- await self.run_academic_catalog_enrichment()
+ # 7. Optional: Author-level linked entities enrichment
+ if run_author_linked_entities:
+ await self.run_author_linked_entities_enrichment()

- # 7. Final EPFL assessment (holistic)
+ # 8. Final EPFL assessment (holistic)
await self.run_epfl_final_assessment()
```

**Order matters**:
- - Academic catalog enrichment runs AFTER user/org enrichment (needs author names)
+ - Academic catalog enrichment (repository-level) runs in Stage 5 of atomic pipeline
+ - Author-level linked entities enrichment is optional and runs separately
- EPFL assessment runs LAST (reviews all collected data)

### Linked Entities Enrichment Scope

**Repository-Level (Default)**:
- Runs automatically in Stage 5 of atomic pipeline
- Searches Infoscience for publications about the repository/tool name
- Stores results in `repository.linkedEntities`
- Uses `search_infoscience_publications_tool` with repository name as query

**Author-Level (Optional)**:
- Controlled by `run_author_linked_entities` parameter
- Separate method: `run_author_linked_entities_enrichment()`
- Searches Infoscience for each author individually
- Assigns results to `author.linkedEntities` for each Person
- Only runs when explicitly requested (default: `False`)
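The split between the always-on repository pass and the opt-in author pass described above can be sketched as follows; the callables stand in for the real agent calls and are illustrative assumptions, not the project's API.

```python
import asyncio
from typing import Awaitable, Callable, Dict, List

async def run_linked_entities(
    repo_search: Callable[[], Awaitable[List[str]]],
    author_search: Callable[[], Awaitable[Dict[str, List[str]]]],
    run_author_linked_entities: bool = False,
) -> dict:
    # Repository-level search always runs (Stage 5 of the atomic pipeline);
    # the per-author pass only runs when explicitly requested.
    results = {"repository": await repo_search(), "authors": {}}
    if run_author_linked_entities:
        results["authors"] = await author_search()
    return results

async def fake_repo_search() -> List[str]:
    return ["infoscience-record-1"]

async def fake_author_search() -> Dict[str, List[str]]:
    return {"Alexander Mathis": ["infoscience-record-2"]}

default_run = asyncio.run(run_linked_entities(fake_repo_search, fake_author_search))
```

With the default flag, `default_run["authors"]` stays empty; passing `run_author_linked_entities=True` fills it from the per-author search.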

### Estimated Token Accumulation

**EVERY agent must accumulate estimated tokens**:
@@ -517,7 +544,7 @@ if usage and "estimated_input_tokens" in usage:
- ✅ `run_llm_analysis()`
- ✅ `run_organization_enrichment()`
- ✅ `run_user_enrichment()`
- - ✅ `run_academic_catalog_enrichment()`
+ - ✅ `run_linked_entities_enrichment()`
- ✅ `run_epfl_final_assessment()`
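A minimal sketch of the accumulation rule, assuming a usage dict keyed by `estimated_input_tokens` as in the snippet above; the function name is illustrative, not the project's.

```python
from typing import Optional

def accumulate_estimated_tokens(total: dict, usage: Optional[dict]) -> dict:
    # Each agent's usage is folded into a running total; agents that
    # report nothing (usage is None) are simply skipped.
    if usage and "estimated_input_tokens" in usage:
        total["estimated_input_tokens"] = (
            total.get("estimated_input_tokens", 0) + usage["estimated_input_tokens"]
        )
    return total
```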

## Testing Guidelines
@@ -539,8 +566,8 @@ curl "http://0.0.0.0:1234/v1/extract/json/https://github.com/DeepLabCut/DeepLabC

### Verification Checklist

- - [ ] Repository `academicCatalogRelations` populated
- - [ ] Each author has `academicCatalogRelations` (may be empty)
+ - [ ] Repository `linkedEntities` populated
+ - [ ] Each author has `linkedEntities` (may be empty)
- [ ] Relations include full entity objects (not just UUIDs)
- [ ] **UUIDs are populated** (not null) for all matched entities
- [ ] **URLs/profile_urls are populated** for all matched entities
@@ -552,7 +579,7 @@ curl "http://0.0.0.0:1234/v1/extract/json/https://github.com/DeepLabCut/DeepLabC

## Common Issues & Solutions

- ### Issue: UUID is null in academicCatalogRelations
+ ### Issue: UUID is null in linkedEntities
**Cause**: Field name mismatch in parser (e.g., `url=` instead of `profile_url=`)
**Symptoms**: `null` UUID values in the serialized relations (the JSON example is elided in this diff view)
@@ -604,6 +631,26 @@ curl "http://0.0.0.0:1234/v1/extract/json/https://github.com/DeepLabCut/DeepLabC
**Cause**: Parser passing wrong field name to Pydantic model
**Solution**: Pydantic silently ignores unknown fields - verify field names match model definition

### Issue: Validation errors for union fields (entityInfosciencePublication, entityInfoscienceAuthor, entityInfoscienceLab)
**Cause**: LLM populating all three union fields with the same data, or wrong entity type in wrong field
**Symptoms**:
```json
{
"entityType": "publication",
"entityInfosciencePublication": {...}, // ✅ Correct
"entityInfoscienceAuthor": {...}, // ❌ Should be None/omitted
"entityInfoscienceLab": {...} // ❌ Should be None/omitted
}
```

**Solution**:
1. **System prompt**: Explicitly instruct LLM to populate ONLY the field matching `entityType`
2. **Reconciliation method**: `_reconcile_entity_union()` in `repositories.py`:
- Checks `entityType` to select correct union variant
- Removes other two fields
- Converts `None` to empty lists for list fields (`subjects`, `authors`, `keywords`)
3. **List field handling**: Convert `None` to `[]` for list fields before validation
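The reconciliation steps above can be sketched on plain dicts; `_reconcile_entity_union()` itself lives in `repositories.py` and may differ in detail, so treat the field lists here as assumptions drawn from the symptom shown above.

```python
from typing import Any, Dict

# Union field per entityType, as described in the symptom above.
UNION_FIELDS = {
    "publication": "entityInfosciencePublication",
    "author": "entityInfoscienceAuthor",
    "lab": "entityInfoscienceLab",
}
# List fields the LLM sometimes emits as null (assumed names).
LIST_FIELDS = ("subjects", "authors", "keywords")

def reconcile_entity_union(relation: Dict[str, Any]) -> Dict[str, Any]:
    keep = UNION_FIELDS.get(relation.get("entityType"))
    # Drop the two union variants that do not match entityType.
    for field in UNION_FIELDS.values():
        if field != keep:
            relation.pop(field, None)
    # Coerce None -> [] for list fields before Pydantic validation.
    entity = relation.get(keep) if keep else None
    if isinstance(entity, dict):
        for list_field in LIST_FIELDS:
            if list_field in entity and entity[list_field] is None:
                entity[list_field] = []
    return relation
```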

## Future Extensions

### Adding New Catalogs
@@ -642,7 +689,7 @@ Future enhancement: Match same entities across catalogs using:

```python
# Example future feature
- def deduplicate_across_catalogs(relations: List[AcademicCatalogRelation]):
+ def deduplicate_across_catalogs(relations: List[linkedEntitiesRelation]):
"""Merge same entities from different catalogs."""
# Group by DOI, ORCID, or other stable identifiers
# Provide unified view across catalogs
```

@@ -660,8 +707,8 @@ def deduplicate_across_catalogs(relations: List[AcademicCatalogRelation]):

## References

- - Implementation: `src/agents/academic_catalog_enrichment.py`
- - Data Models: `src/data_models/academic_catalog.py`
+ - Implementation: `src/agents/linked_entities_enrichment.py`
+ - Data Models: `src/data_models/linked_entities.py`
- Infoscience Client: `src/context/infoscience.py`
- Integration: `src/analysis/repositories.py`
- - Documentation: `ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md` (if exists)
+ - Documentation: `linked_entities_OPTION_B_IMPLEMENTATION.md` (if exists)