diff --git a/.github/workflows/docs_pages.yml b/.github/workflows/docs_pages.yml new file mode 100644 index 0000000..27a2d3f --- /dev/null +++ b/.github/workflows/docs_pages.yml @@ -0,0 +1,51 @@ +name: Publish Documentation + +on: + push: + branches: + - main + tags: + - "v*" + workflow_dispatch: + +permissions: + contents: write + +jobs: + deploy-docs: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install documentation dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e ".[docs]" + + - name: Validate documentation build + run: mkdocs build --strict + + - name: Configure git identity for mike + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Deploy dev and latest aliases from main + if: github.ref == 'refs/heads/main' + run: mike deploy --push --branch gh-pages --update-aliases dev latest + + - name: Deploy release version from tag and update stable alias + if: startsWith(github.ref, 'refs/tags/v') + run: | + VERSION="${GITHUB_REF_NAME#v}" + mike deploy --push --branch gh-pages --update-aliases "${VERSION}" stable + mike set-default --push --branch gh-pages stable diff --git a/CHANGELOG.md b/CHANGELOG.md index 126fcab..3688617 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,17 @@ All notable changes to this project will be documented in this file. +## [2.0.1] - 2026-02-16 + +### Added +- Versioned documentation site and a CI workflow that publishes it to GitHub Pages. + +### Changed +- Bumped project version to `2.0.1`. +- Updated API version metadata and root welcome message to `v2.0.1`. 
+ + + ## [2.0.0] - 2025-10-07 ### Added diff --git a/README.md b/README.md index 437ad36..5bd714c 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,48 @@ python src/main.py --url https://github.com/qchapp/lungs-segmentation --output_p If no arguments are provided, it will use the default repository and output path. +## Versioned documentation (GitHub Pages) + +The repository includes a versioned documentation site under `docs/` powered by MkDocs Material + Mike. + +Install docs dependencies: + +```bash +uv pip install -e ".[docs]" +``` + +Local docs preview: + +```bash +just docs-serve +``` + +Strict docs build: + +```bash +just docs-build +``` + +Manual publish commands: + +```bash +# Publish dev/latest from current branch +just docs-deploy-dev + +# Publish a release version and update stable alias +just docs-deploy-release 2.0.1 + +# Set default version in selector +just docs-set-default stable +``` + +Automation: + +- `.github/workflows/docs_pages.yml` publishes docs on: + - Pushes to `main` (`dev` + `latest`) + - Pushes of tags matching `v*` (release version + `stable`) +- Configure GitHub Pages to serve from the `gh-pages` branch root. + ## How to run the tool using Docker? 1. You need to build the image. diff --git a/docs/ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md b/docs/ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md index f6262de..8fff1e0 100644 --- a/docs/ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md +++ b/docs/ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md @@ -1,318 +1,46 @@ -# Academic Catalog Enrichment - Option B Implementation +# Academic Catalog Enrichment - Assignment Strategy (Current) -## Date: 2025-11-02 +This page documents how linked-entity results are assigned to output models in the current code. -## Overview +## Current assignment behavior -Implemented **Option B**: Academic catalog agent searches for repository, authors, and organizations **individually** and returns **organized results** keyed by who was searched for. 
No complex name matching needed! +1. Repository analysis (`src/analysis/repositories.py`) +- Runs a two-stage linked-entity pipeline: + - `search_academic_catalogs` + - `structure_linked_entities` +- Assigns only `repository_relations` to `SoftwareSourceCode.linkedEntities`. +- Author-level linked entities are not yet assigned: the follow-up method `run_author_linked_entities_enrichment` is scaffolded, but per-author relation parsing/assignment is not fully implemented. -## Architecture +2. User analysis (`src/analysis/user.py`) +- Uses `enrich_user_linked_entities`. +- Assigns resulting relations directly to `GitHubUser.linkedEntities`. -### Before (Complex Name Matching) -``` -1. Agent searches everything → returns flat list -2. Try to match "Mathis, Alexander" with "Alexander Mathis" ❌ -3. Complex regex/fuzzy matching logic -4. Fragile, error-prone -``` - -### After (Direct Assignment) -``` -1. Agent searches: - - Repository name → repository_relations - - Each author individually → author_relations["Alexander Mathis"] - - Each org individually → organization_relations["DeepLabCut"] -2. Direct dictionary lookup by exact name ✅ -3. Simple, explicit, reliable -``` - -## Data Model Changes - -### `linkedEntitiesEnrichmentResult` - -**New structured fields:** - -```python -class linkedEntitiesEnrichmentResult(BaseModel): - repository_relations: List[linkedEntitiesRelation] = [] - # Publications about the repository/project itself +3. Organization analysis (`src/analysis/organization.py`) +- Uses `enrich_organization_linked_entities`. +- Assigns resulting relations directly to `GitHubOrganization.linkedEntities`. - author_relations: Dict[str, List[linkedEntitiesRelation]] = {} - # Keyed by author name as provided: {"Alexander Mathis": [...relations...]} +## Data structure reference - organization_relations: Dict[str, List[linkedEntitiesRelation]] = {} - # Keyed by org name as provided: {"DeepLabCut": [...relations...]} +`linkedEntitiesEnrichmentResult` contains: - # Metadata fields... 
- searchStrategy: Optional[str] = None - catalogsSearched: List[CatalogType] = [] - totalSearches: int = 0 -``` - -**Backward compatibility:** - -```python -@property -def relations(self) -> List[linkedEntitiesRelation]: - """Combines all relations for backward compatibility.""" - return ( - list(repository_relations) + - flatten(author_relations.values()) + - flatten(organization_relations.values()) - ) -``` - -## Agent Behavior +- `repository_relations: list[linkedEntitiesRelation]` +- `author_relations: dict[str, list[linkedEntitiesRelation]]` +- `organization_relations: dict[str, list[linkedEntitiesRelation]]` -### Repository Enrichment +## Assignment flow -**Input:** -```python -enrich_repository_linked_entities( - repository_url="https://github.com/DeepLabCut/DeepLabCut", - repository_name="DeepLabCut", - description="...", - readme_excerpt="...", - authors=["Alexander Mathis", "Mackenzie Weygandt Mathis"], - organizations=["DeepLabCut"] -) +```mermaid +flowchart TD + A[linkedEntitiesEnrichmentResult] --> B{Resource type} + B -- Repository --> C[use repository_relations] + B -- User --> D[use relations from user enrichment] + B -- Organization --> E[use relations from org enrichment] + C --> F[model.linkedEntities] + D --> F + E --> F ``` -**Agent searches:** - -1. **Repository-level:** - - `search_infoscience_publications_tool("DeepLabCut")` - - Finds publications **about DeepLabCut** - - → Adds to `repository_relations` - -2. **For each author:** - - `search_infoscience_authors_tool("Alexander Mathis")` - - Finds person profile (even if stored as "Mathis, Alexander") - - `get_author_publications_tool("Alexander Mathis")` - - Finds their publications - - → Adds ALL to `author_relations["Alexander Mathis"]` - - - `search_infoscience_authors_tool("Mackenzie Weygandt Mathis")` - - → Adds to `author_relations["Mackenzie Weygandt Mathis"]` - -3. 
**For each organization:** - - `search_infoscience_labs_tool("DeepLabCut")` - - Finds orgunit profiles - - → Adds to `organization_relations["DeepLabCut"]` - -**Output structure:** -```json -{ - "repository_relations": [ - { - "entityType": "publication", - "entity": {"title": "DeepLabCut: markerless pose estimation..."}, - "confidence": 0.95 - } - ], - "author_relations": { - "Alexander Mathis": [ - {"entityType": "person", "entity": {...}, "confidence": 0.95}, - {"entityType": "publication", "entity": {...}, "confidence": 0.9} - ], - "Mackenzie Weygandt Mathis": [ - {"entityType": "person", "entity": {...}, "confidence": 0.95} - ] - }, - "organization_relations": { - "DeepLabCut": [ - {"entityType": "orgunit", "entity": {...}, "confidence": 0.8} - ] - } -} -``` - -## Assignment Logic - -### In `Repository.run_linked_entities_enrichment()`: - -```python -# 1. Repository-level relations -self.data.linkedEntities = enrichment_data.repository_relations - -# 2. Author-level relations (direct lookup by name) -for author in self.data.author: - if author.name in enrichment_data.author_relations: - author.linkedEntities = enrichment_data.author_relations[author.name] - else: - author.linkedEntities = [] - -# 3. Organization-level relations (direct lookup by name) -for org in self.data.author: # Orgs can be in author list - if org.legalName in enrichment_data.organization_relations: - org.linkedEntities = enrichment_data.organization_relations[org.legalName] - else: - org.linkedEntities = [] -``` - -**No name matching needed!** The agent uses the exact names we provide as dictionary keys. - -## Benefits - -### 1. **Explicit and Clear** -- Each author is searched **individually** by the exact name we provide -- No guessing about "does 'Alexander Mathis' match 'Mathis, Alexander'?" -- The agent decides what matches during search time - -### 2. 
**Simple Assignment** -- Direct dictionary lookup: `author_relations["Alexander Mathis"]` -- No complex regex, no fuzzy matching, no subset logic -- Either the key exists or it doesn't - -### 3. **Debuggable** -- Log shows: "Searching for author: Alexander Mathis" -- Log shows: "Found 2 relations for: Alexander Mathis" -- Log shows: "Assigned 2 relations to author: Alexander Mathis" -- Clear 1:1 relationship - -### 4. **Agent Responsibility** -- The **agent** handles name variations (Infoscience stores "Mathis, Alexander") -- The agent's search tools are smart enough to find "Mathis, Alexander" when searching for "Alexander Mathis" -- We don't need to replicate that logic in Python - -### 5. **Extensible** -- Easy to add more catalogs (OpenAlex, EPFL Graph) -- Easy to add more entity types -- Each search is independent and cacheable - -## Example Flow: DeepLabCut - -### Input to Agent: -``` -Repository: DeepLabCut -Authors: ["Alexander Mathis", "Mackenzie Weygandt Mathis"] -Organizations: ["DeepLabCut"] -``` - -### Agent Executes: -``` -1. search_infoscience_publications_tool("DeepLabCut") - → Found 4 publications about DeepLabCut - → Add to repository_relations - -2. search_infoscience_authors_tool("Alexander Mathis") - → Found person profile (UUID: xxx, name: "Mathis, Alexander") - → Add to author_relations["Alexander Mathis"] - -3. get_author_publications_tool("Alexander Mathis") - → Found 10 publications - → Add to author_relations["Alexander Mathis"] - -4. search_infoscience_authors_tool("Mackenzie Weygandt Mathis") - → Found person profile (UUID: yyy, name: "Mathis, Mackenzie") - → Add to author_relations["Mackenzie Weygandt Mathis"] - -5. 
search_infoscience_labs_tool("DeepLabCut") - → Found 0 orgunits (DeepLabCut is not an EPFL org) - → author_relations["DeepLabCut"] = [] -``` - -### Python Assigns: -```python -# Repository -repository.linkedEntities = [4 publications about DeepLabCut] - -# Author: Alexander Mathis -author1.linkedEntities = author_relations["Alexander Mathis"] -# = [person profile + 10 publications] - -# Author: Mackenzie Weygandt Mathis -author2.linkedEntities = author_relations["Mackenzie Weygandt Mathis"] -# = [person profile] - -# Org: DeepLabCut -org.linkedEntities = organization_relations["DeepLabCut"] -# = [] (no EPFL orgunit found) -``` - -### Output: -```json -{ - "repository": { - "linkedEntities": [ - "4 publications about DeepLabCut" - ] - }, - "authors": [ - { - "name": "Alexander Mathis", - "linkedEntities": [ - "person profile", - "10 publications" - ] - }, - { - "name": "Mackenzie Weygandt Mathis", - "linkedEntities": [ - "person profile" - ] - } - ] -} -``` - -## Migration Notes - -### Old Code (if any): -```python -# Old: Flat list, required name matching -relations = enrichment_data.relations -for author in authors: - # Complex matching logic... - if _names_match(author.name, relation.entity.name): - ... 
-``` - -### New Code: -```python -# New: Organized dict, direct lookup -if author.name in enrichment_data.author_relations: - author.linkedEntities = enrichment_data.author_relations[author.name] -``` - -## Testing - -### Test Case: DeepLabCut - -```bash -curl "http://0.0.0.0:1234/v1/extract/json/https://github.com/DeepLabCut/DeepLabCut?force_refresh=true&enrich_orgs=true&enrich_users=true" -``` - -**Expected:** -- ✅ Repository-level: Publications about DeepLabCut -- ✅ Alexander Mathis: Person profile + publications -- ✅ Mackenzie Weygandt Mathis: Person profile + publications -- ✅ Direct assignment without name matching errors - -## Files Modified - -### Data Models: -- `src/data_models/linked_entities.py` - Added structured fields - -### Agent: -- `src/agents/linked_entities_prompts.py` - Updated output format instructions - -### Analysis: -- `src/analysis/repositories.py` - Simplified assignment logic - -### Documentation: -- `linked_entities_OPTION_B_IMPLEMENTATION.md` (this file) - -## Conclusion - -✅ **Option B is implemented!** - -The academic catalog enrichment now: -1. Searches repository publications by repository name -2. Searches each author individually by exact name provided -3. Searches each organization individually by exact name provided -4. Returns organized results in dictionaries -5. Python code does direct dictionary lookup for assignment -6. No complex name matching needed! +## Why this strategy -**Result:** Clean, explicit, debuggable, and reliable academic catalog enrichment! 🎉 +- Keeps assignment deterministic and avoids ambiguous name matching in the hot path. +- Preserves room for richer per-author assignment logic without destabilizing repository baseline responses. 
diff --git a/docs/ACADEMIC_CATALOG_REFACTOR_SUMMARY.md b/docs/ACADEMIC_CATALOG_REFACTOR_SUMMARY.md index d3adee5..ca09212 100644 --- a/docs/ACADEMIC_CATALOG_REFACTOR_SUMMARY.md +++ b/docs/ACADEMIC_CATALOG_REFACTOR_SUMMARY.md @@ -1,308 +1,45 @@ -# Academic Catalog Refactor - Implementation Summary +# Academic Catalog Refactor Summary -## Date: 2025-11-02 +This summary reflects the current state of academic-catalog enrichment in the codebase. -## Overview +## What is in production path -Successfully refactored the Infoscience-specific integration into a broader academic catalog system that supports multiple catalogs (Infoscience, OpenAlex, EPFL Graph, etc.) with a dedicated enrichment agent. +- Canonical relation model: + - `linkedEntitiesRelation` + - `linkedEntitiesEnrichmentResult` + - Defined in `src/data_models/linked_entities.py` +- Shared search tooling: + - Infoscience tool wrappers in `src/context/infoscience.py` +- Agent implementations: + - `src/agents/linked_entities_enrichment.py` + - `src/agents/atomic_agents/linked_entities_searcher.py` -## What Was Implemented +## Pipeline usage by resource type -### ✅ 1. 
API Investigation (INFOSCIENCE_API_FINDINGS.md) +```mermaid +flowchart LR + R[Repository] --> R1[search_academic_catalogs] + R1 --> R2[structure_linked_entities] + R2 --> R3[Assign repository_relations to data.linkedEntities] -**Key Findings:** -- `/eperson/profiles/search/byName` endpoint doesn't exist (404 error) -- `dsoType=community/collection` parameters return empty results (not used at EPFL) -- General search without dsoType works well -- Direct UUID access via `/core/items/{uuid}` works perfectly -- Publications search is very effective + U[User] --> U1[enrich_user_linked_entities] + U1 --> U2[Assign relations to user.linkedEntities] -**Actions Taken:** -- Fixed author search to use publication-based fallback -- Updated lab search to extract from publication metadata -- Added `get_entity_by_uuid()` function for direct UUID access -- Documented all findings - -### ✅ 2. New Data Models (src/data_models/linked_entities.py) - -**Created:** -- `CatalogType` enum: infoscience, openalex, epfl_graph -- `EntityType` enum: publication, person, orgunit -- `linkedEntitiesRelation`: Unified relation model with: - - `catalogType`: Which catalog (Infoscience, OpenAlex, etc.) - - `entityType`: Type of entity (publication, person, orgunit) - - `entity`: Full entity details embedded (InfosciencePublication, InfoscienceAuthor, InfoscienceLab, or Dict) - - `confidence`: Confidence score (0.0-1.0) - - `justification`: Explanation of the match - - `externalId`, `matchedOn`: Optional matching metadata - - Helper methods: `get_display_name()`, `get_url()`, `to_markdown()` - -- `linkedEntitiesEnrichmentResult`: Agent output model with: - - `relations`: List of catalog relations found - - `searchStrategy`: Description of search approach - - `catalogsSearched`: List of catalogs searched - - `totalSearches`: Number of searches performed - - Token usage tracking fields - - Helper methods: `get_by_catalog()`, `get_by_entity_type()`, `get_publications()`, etc. - -### ✅ 3. 
Updated Core Models - -**Replaced `infoscienceEntity`/`infoscienceEntities` with `linkedEntities` in:** -- `Person` (src/data_models/models.py) -- `Organization` (src/data_models/models.py) -- `SoftwareSourceCode` (src/data_models/repository.py) -- `EnrichedAuthor` (src/data_models/user.py) -- `GitHubUser` (src/data_models/user.py) -- `GitHubOrganization` (src/data_models/organization.py) - -**Field Structure:** -```python -linkedEntities: Optional[List["linkedEntitiesRelation"]] = Field( - description="Relations to entities in academic catalogs (Infoscience, OpenAlex, EPFL Graph, etc.)", - default_factory=list, -) + O[Organization] --> O1[enrich_organization_linked_entities] + O1 --> O2[Assign relations to organization.linkedEntities] ``` -**Forward References:** -- Added proper TYPE_CHECKING imports -- Implemented model_rebuild() in `__init__.py` for all affected models -- Deprecated but kept `InfoscienceEntity` for backward compatibility - -### ✅ 4. Fixed Infoscience API (src/context/infoscience.py) - -**Updated Functions:** -- `search_authors()`: Removed broken profile endpoint, uses publication-based search -- `search_labs()`: Removed dsoType approach, extracts labs from publication metadata -- Added `get_entity_by_uuid()`: Direct entity access by UUID - -**Improvements:** -- Better error handling -- Clearer documentation -- More resilient to API limitations -- Supports direct UUID-based access - -### ✅ 5. 
Academic Catalog Enrichment Agent - -**New Files:** -- `src/agents/linked_entities_enrichment.py`: Agent implementation -- `src/agents/linked_entities_prompts.py`: System and contextual prompts - -**Agent Features:** -- **Three specialized enrichment functions:** - - `enrich_repository_linked_entities()`: For repositories - - `enrich_user_linked_entities()`: For users - - `enrich_organization_linked_entities()`: For organizations - -- **Tools available:** - - `search_infoscience_publications_tool` - - `search_infoscience_authors_tool` - - `search_infoscience_labs_tool` - - `get_author_publications_tool` - -- **Strategic Search Guidelines:** - - Start with most specific information - - ONE search per subject (cached automatically) - - Maximum 2 attempts per subject - - Accept when not found - - Be selective and efficient - -- **Output:** Returns `linkedEntitiesEnrichmentResult` with structured relations - -### ✅ 6. Pipeline Integration - -**Integrated into analysis classes:** - -**Repository (src/analysis/repositories.py):** -- Added `run_linked_entities_enrichment()` method -- Runs after organization enrichment, before EPFL assessment -- Extracts repository name, description, README excerpt -- Stores relations in `data.linkedEntities` -- Tracks token usage - -**User (src/analysis/user.py):** -- Added `run_linked_entities_enrichment()` method -- Runs after user enrichment, before EPFL assessment -- Extracts username, full name, bio, organizations -- Stores relations in `data.linkedEntities` -- Tracks token usage - -**Organization (src/analysis/organization.py):** -- Added `run_linked_entities_enrichment()` method -- Runs after organization enrichment, before EPFL assessment -- Extracts org name, description, website, members -- Stores relations in `data.linkedEntities` -- Tracks token usage - -**All integrations:** -- ✅ Properly wrapped in try-except (don't fail entire analysis) -- ✅ Token usage tracked and accumulated -- ✅ Logging at INFO level -- ✅ Called 
automatically in run_analysis() pipeline - -### ✅ 7. Exports and Dependencies - -**Updated src/data_models/__init__.py:** -- Added academic catalog model exports -- Proper model_rebuild() for all models with forward references -- Maintained backward compatibility - -**No changes needed to agent prompts:** -- Checked all agent files - no references to old `infoscienceEntity` field - -## Testing - -### Expected Test Case: DeepLabCut Repository - -**URL:** `https://github.com/DeepLabCut/DeepLabCut` - -**Expected Relations:** - -**Publications:** -- UUID: `492614b1-7dc9-4d24-81f7-648f1223de71` -- UUID: `f97b60da-bcab-4f2e-ba12-0ee0c4d0d6eb` - -**Persons:** -- UUID: `2e985179-c5f5-41b2-aa2d-367f2564acca` (Mackenzie Mathis) -- UUID: `01654480-b4ac-4bb0-bb0a-20f6eef92316` - -**Organizational Units:** -- UUID: `4935f194-314a-44ef-b0ac-a6b2197df007` -- UUID: `dc9cc862-b234-4886-83b0-7fd422e50f24` - -### How to Test - -```bash -# Test with force_refresh and enrichments enabled -curl "http://0.0.0.0:1234/v1/extract/json/https://github.com/DeepLabCut/DeepLabCut?force_refresh=true&enrich_orgs=true&enrich_users=true" -``` - -**What to verify:** -1. `linkedEntities` field exists in output -2. Relations have `catalogType: "infoscience"` -3. Relations have correct `entityType` (publication, person, orgunit) -4. Entity objects are fully populated with UUIDs and URLs -5. Confidence scores are meaningful (0.0-1.0) -6. Justifications explain how entities were found - -## Architecture Benefits - -### 1. Extensibility -- Easy to add new catalogs (OpenAlex, EPFL Graph) -- Standardized relation structure -- Catalog-agnostic API - -### 2. Separation of Concerns -- Dedicated agent for academic catalog enrichment -- Clear separation from EPFL assessment -- Runs independently of other enrichments - -### 3. Maintainability -- Single source of truth for catalog relations -- Centralized Infoscience API handling -- Clear documentation and error handling - -### 4. 
Future-Proof -- Designed for multiple catalogs -- Entity type extensibility -- Confidence and justification tracking - -## Future Extensions - -### Easy Additions: -1. **OpenAlex Integration** - - Add `CatalogType.OPENALEX` - - Create OpenAlex search functions - - Add tools to academic catalog agent - -2. **EPFL Graph Integration** - - Add `CatalogType.EPFL_GRAPH` - - Create EPFL Graph API client - - Add tools to academic catalog agent - -3. **Cross-Catalog Matching** - - Match same entities across catalogs - - Deduplicate based on DOI, ORCID, etc. - - Provide unified entity views - -4. **Entity Resolution** - - Confidence scoring across catalogs - - Conflict resolution strategies - - Canonical entity selection - -## Files Created - -### New Files: -- `src/data_models/linked_entities.py` -- `src/agents/linked_entities_enrichment.py` -- `src/agents/linked_entities_prompts.py` -- `INFOSCIENCE_API_FINDINGS.md` -- `linked_entities_REFACTOR_SUMMARY.md` (this file) - -### Modified Files: -- `src/data_models/models.py` -- `src/data_models/repository.py` -- `src/data_models/user.py` -- `src/data_models/organization.py` -- `src/data_models/__init__.py` -- `src/context/infoscience.py` -- `src/analysis/repositories.py` -- `src/analysis/user.py` -- `src/analysis/organization.py` - -### Deleted Files: -- `test_infoscience_api.py` (temporary investigation script) -- `test_infoscience_simple.py` (temporary test script) - -## Breaking Changes - -### ⚠️ API Changes: -- **Removed field:** `infoscienceEntity` (singular) from `Person`, `Organization` -- **Removed field:** `infoscienceEntities` (plural) from `SoftwareSourceCode`, `GitHubUser`, `GitHubOrganization` -- **Added field:** `linkedEntities` (always plural) to all above models - -### Migration Path: -Old code accessing `infoscienceEntity`: -```python -# OLD -if person.infoscienceEntity: - print(person.infoscienceEntity.name) -``` - -New code: -```python -# NEW -if person.linkedEntities: - for relation in person.linkedEntities: 
- if relation.catalogType == CatalogType.INFOSCIENCE: - print(relation.entity.name) -``` - -Helper methods: -```python -# Get Infoscience publications -catalog_result = enrichment_result # linkedEntitiesEnrichmentResult -infoscience_relations = catalog_result.get_by_catalog(CatalogType.INFOSCIENCE) -publications = catalog_result.get_publications() -persons = catalog_result.get_persons() -orgunits = catalog_result.get_orgunits() -``` +## Key design outcomes -## Conclusion +- Unified relation shape across catalog entity types (`publication`, `person`, `orgunit`). +- Explicit catalog namespace via `CatalogType` enum. +- Model-level utilities for filtering by catalog and entity type. -Successfully completed a comprehensive refactoring of the Infoscience integration into a broader, extensible academic catalog system. The implementation: +## Known gap -✅ Fixes all API issues -✅ Provides better data models -✅ Introduces dedicated enrichment agent -✅ Maintains backward compatibility where possible -✅ Sets foundation for multi-catalog support -✅ Follows all project patterns and conventions -✅ Includes comprehensive documentation +- Repository `run_author_linked_entities_enrichment` is currently scaffolded and logs search attempts, but relation parsing/assignment for each author is not fully implemented yet. -The system is now ready to: -1. Find and link academic catalog entities -2. Support multiple catalogs -3. Provide rich relation metadata -4. Scale to future requirements +## Validation notes -**Status:** All TODOs completed. Ready for testing with DeepLabCut repository. +- The conversion layer in `src/data_models/conversion.py` includes JSON-LD mapping for linked entities. +- Response-level usage metrics are reported through `APIStats` in API responses. 
diff --git a/docs/AFFILIATION_CHANGES.md b/docs/AFFILIATION_CHANGES.md index acf863f..9ea7b79 100644 --- a/docs/AFFILIATION_CHANGES.md +++ b/docs/AFFILIATION_CHANGES.md @@ -1,251 +1,43 @@ -# Enhanced Affiliation Tracking - Implementation Summary +# Affiliation Model Notes -## Overview -Replaced simple string-based `affiliations: List[str]` with structured `affiliations: List[Affiliation]` throughout the codebase to track organization identifiers and data provenance. +This document reflects the current affiliation representation used in the v2 codebase. -## Breaking Changes ⚠️ +## Canonical affiliation object -This is a **breaking change**. API responses and cached data have changed format: +`src/data_models/models.py` defines: -### Before (Old Format) -```json -{ - "affiliations": ["EPFL", "Swiss Data Science Center", "Hackuarium"] -} -``` +- `Affiliation.name` +- `Affiliation.organizationId` +- `Affiliation.source` -### After (New Format) -```json -{ - "affiliations": [ - { - "name": "EPFL", - "organizationId": "https://ror.org/02s376052", - "source": "orcid" - }, - { - "name": "Swiss Data Science Center", - "organizationId": "SwissDataScienceCenter", - "source": "github_profile" - }, - { - "name": "Hackuarium", - "organizationId": null, - "source": "agent_user_enrichment" - } - ] -} -``` +`Person.affiliations` is a list of these objects. 
-## New Data Model +## Source provenance values in use -### Affiliation Model -Location: `src/data_models/models.py` +Typical values include: -```python -class Affiliation(BaseModel): - """Structured affiliation with provenance tracking""" +- `gimie` +- `orcid` +- `agent_org_enrichment` +- `agent_user_enrichment` +- `github_profile` +- `email_domain` - name: str = Field( - description="Organization name (e.g., 'Swiss Data Science Center', 'EPFL')" - ) - organizationId: Optional[str] = Field( - default=None, - description="Organization identifier: ROR ID, GitHub handle, or internal ID" - ) - source: str = Field( - description="Data source: 'gimie', 'orcid', 'agent_org_enrichment', 'agent_user_enrichment', 'github_profile', 'email_domain'" - ) -``` +## Where affiliations are populated -### Source Types -- `orcid` - From ORCID employment records -- `github_profile` - From GitHub organization memberships -- `email_domain` - Inferred from email domains (@epfl.ch, etc.) -- `agent_user_enrichment` - From user enrichment AI agent -- `agent_org_enrichment` - From organization enrichment AI agent -- `gimie` - From GIMIE repository metadata +- Repository flow: + - ORCID enrichment in `Repository.run_authors_enrichment` + - optional user and organization enrichment stages +- User flow: + - LLM + enrichment paths in `src/analysis/user.py` +- Organization flow: + - organization enrichment in `src/analysis/organization.py` -### Organization ID Types -- **ROR ID**: Full URL format (e.g., `https://ror.org/02s376052`) -- **GitHub Handle**: Organization handle (e.g., `SwissDataScienceCenter`) -- **Internal ID**: Any internal identifier from source systems -- **null**: When no identifier is available +## Data quality and privacy notes -## Files Modified +- ORCID values are normalized/validated in models. +- Person and GitAuthor emails are anonymized by model validators/utilities before output persistence. -### 1. 
Core Data Models -- ✅ `src/data_models/models.py` - Added Affiliation model, updated Person.affiliations -- ✅ `src/data_models/user.py` - Updated EnrichedAuthor.affiliations -- ✅ `src/data_models/__init__.py` - Exported Affiliation model +## Migration note -### 2. Utilities -- ✅ `src/utils/utils.py` - - Updated `get_orcid_affiliations()` to return `List[Affiliation]` - - Updated `enrich_author_with_orcid()` to handle Affiliation objects - - Merging now uses name-based deduplication - -### 3. Repository Analysis -- ✅ `src/analysis/repositories.py` - - Updated GIMIE affiliation extraction to create Affiliation objects - - Updated affiliation merging logic in `_convert_simplified_to_full()` - - Handles dict and Affiliation object formats - -### 4. Agent Prompts -- ✅ `src/agents/user_prompts.py` - - Updated system prompt to explain Affiliation structure - - Formatted affiliation display in prompts as structured objects -- ✅ `src/agents/organization_prompts.py` - - Updated affiliation display for ORCID authors (2 locations) - - Shows name, organizationId, and source in prompts -- ✅ `src/agents/organization_enrichment.py` - - Updated `_pre_search_ror_for_organizations()` to handle Affiliation objects - - Handles dict, object, and legacy string formats - -### 5. JSON-LD Conversion -- ✅ `src/data_models/conversion.py` - - Added Affiliation to `PYDANTIC_TO_ZOD_MAPPING` - - Added Affiliation to type_mapping - - Mapped fields: name → schema:name, organizationId → schema:identifier, source → imag:source - -### 6. Simplified Models -- ✅ `src/data_models/repository.py` - - Updated `to_simplified_schema()` to extract names from Affiliation objects - - Converts Affiliation objects to simple strings for atomic agents - -## Benefits - -### 1. Provenance Tracking -Now you can see exactly where each affiliation came from: -```python -for aff in person.affiliations: - print(f"{aff.name} - Source: {aff.source}") -``` - -### 2. 
Organization Linking -Can track organization identifiers (ROR, GitHub handles): -```python -epfl_affs = [aff for aff in person.affiliations if aff.organizationId == "https://ror.org/02s376052"] -``` - -### 3. Common Organization Detection -Can now identify when authors share organizations: -```python -# Find all authors affiliated with SwissCat+ -swisscat_authors = [] -for author in repository.author: - for aff in author.affiliations: - if "SwissCat" in aff.name or aff.organizationId == "SwissCat+": - swisscat_authors.append(author) -``` - -### 4. Multi-Source Enrichment -Same organization from multiple sources is properly tracked: -```python -# EPFL from ORCID -Affiliation(name="EPFL", organizationId="https://ror.org/02s376052", source="orcid") -# EPFL from email -Affiliation(name="EPFL", organizationId=None, source="email_domain") -``` - -### 5. Deduplication -Smart merging prevents duplicates based on organization name (case-insensitive): -```python -existing_names = {aff.name.lower(): aff for aff in person.affiliations} -# Only adds if name doesn't already exist -``` - -## Migration Notes - -### Cache Impact -- **All cached data will be in old format** (List[str]) -- **New analysis will return new format** (List[Affiliation]) -- Recommendation: Clear cache after deployment or add version check - -### API Consumers -API consumers will need to update to handle the new structure: - -**Old code:** -```python -affiliations = person["affiliations"] # List of strings -print(affiliations[0]) # "EPFL" -``` - -**New code:** -```python -affiliations = person["affiliations"] # List of Affiliation objects -print(affiliations[0]["name"]) # "EPFL" -print(affiliations[0]["organizationId"]) # "https://ror.org/02s376052" -print(affiliations[0]["source"]) # "orcid" -``` - -### Backward Compatibility -**None.** This is an intentional breaking change for better data quality. 
- -## Testing - -To test the implementation with a real repository: - -```bash -# Test with Carlos Vivar Rios' profile -curl "http://0.0.0.0:1234/v1/user/llm/json/github.com/caviri?force_refresh=true" - -# Look for the affiliations field in the response -# Each affiliation should have: name, organizationId, source -``` - -Expected result: -- Affiliations will be objects with provenance information -- GitHub organizations will have their handles as organizationId -- ORCID affiliations will have ROR IDs (when available) -- Source field will indicate where each affiliation came from - -## Future Enhancements - -Potential improvements: -- [ ] Add confidence scores to Affiliation model -- [ ] Add temporal information (start/end dates) -- [ ] Automatic ROR ID lookup for all affiliations -- [ ] Affiliation validation and normalization -- [ ] Affiliation history tracking (separate from affiliations list) -- [ ] Cross-reference with other catalogs (OpenAlex, EPFL Graph) - -## Rollback Plan - -If issues arise, to rollback: -1. Revert changes to `src/data_models/models.py` (Affiliation model and Person.affiliations) -2. Revert changes to `src/utils/utils.py` -3. Revert changes to agent prompts -4. Clear cache to remove mixed-format data -5. Restart server - -## Fixes Applied - -### Issue 1: Nested Organization Objects in Affiliation.name -**Problem**: GIMIE extraction was passing full organization dicts to `Affiliation.name` instead of just the organization name string. 
- -**Fix** (lines 703-705, 863-892 in `src/analysis/repositories.py`): -- Extract name string from organization dicts: `org_data.get("legalName") or org_data.get("name")` -- Add validation to ensure `name` is always a string -- Recursively extract name if nested dict is encountered -- Log warnings when unexpected data types are found - -### Issue 2: Affiliation Objects Not JSON Serializable -**Problem**: When passing GIMIE data to the atomic LLM pipeline, `json.dumps()` failed because Affiliation (Pydantic) objects aren't directly JSON serializable. - -**Error**: `TypeError: Object of type Affiliation is not JSON serializable` - -**Fix** (lines 119-136 in `src/analysis/repositories.py`): -- Convert Person objects to dicts using `model_dump()` before JSON serialization -- Convert Organization objects to dicts using `model_dump()` before JSON serialization -- Added `default=str` fallback to handle any other non-serializable objects -- This ensures all Pydantic models (including nested Affiliation objects) are properly serialized - -## Questions or Issues? - -If you encounter problems with the new affiliation tracking: -1. Check that all Affiliation objects have required fields (name, source) -2. Verify organizationId is either a string or null (not empty string) -3. Ensure source is one of the valid source types -4. Check logs for validation errors during model creation -5. If you see "Affiliation name is not a string" warnings, check GIMIE extraction logic +Code that still assumes legacy flat-string affiliations should be updated to consume structured `Affiliation` entries with provenance. diff --git a/docs/AGENT_STRATEGY.md b/docs/AGENT_STRATEGY.md index 4edfde1..20d19cf 100644 --- a/docs/AGENT_STRATEGY.md +++ b/docs/AGENT_STRATEGY.md @@ -1,117 +1,62 @@ # Repository Analysis Agent Strategy -This document outlines the step-by-step analysis pipeline executed by the `Repository` class in `src/analysis/repositories.py`. 
The strategy involves a sequence of data extraction, AI-powered analysis, and enrichment steps to produce a comprehensive metadata profile for a given software repository. +This document describes the current `Repository.run_analysis()` behavior in `src/analysis/repositories.py`. -## Analysis Pipeline Flowchart - -The following diagram illustrates the complete analysis flow, including optional enrichment steps and the data models used at each stage. +## Pipeline flow ```mermaid -graph TD - subgraph "Start" - A[Input: Repository URL] - end - - subgraph "Cache & Pre-computation" - B{Cache Check}; - C[run_gimie_analysis]; - end - - subgraph "Core LLM Analysis" - D[run_llm_analysis
Agent: llm_request_repo_infos
DataModel: SoftwareSourceCode]; - E[run_authors_enrichment
(ORCID Scraping)
DataModel: Person]; - end - - subgraph "Optional Enrichments" - F{enrich_users?}; - G[run_user_enrichment
Agent: enrich_users_from_dict
DataModel: UserEnrichmentResult]; - H{enrich_orgs?}; - I[run_organization_enrichment
Agent: enrich_organizations_from_dict
DataModel: OrganizationEnrichmentResult]; - end - - subgraph "Final Assessments" - J[run_linked_entities_enrichment
Agent: enrich_repository_linked_entities
DataModel: linkedEntitiesEnrichmentResult]; - K[run_epfl_final_assessment
Agent: assess_epfl_relationship
DataModel: EPFLAssessmentResult]; - end - - subgraph "Finalization" - L[run_validation]; - M[save_in_cache]; - Y[End: Return Enriched Data]; - Z[End: Return Cached Data]; +flowchart TD + A[Input repository URL] --> B{Public repository} + B -- No --> X[Stop with error] + B -- Yes --> C{Cached and not force refresh} + C -- Yes --> Z[Load cached result] + C -- No --> D[Run GIMIE analysis] + D --> E[Run atomic LLM pipeline] + + subgraph S1[Atomic LLM pipeline] + E1[Prepare repository context] + E2[Compile repository context] + E3[Generate structured output] + E4[Classify repository type and discipline] + E5[Identify related organizations] + E6[Build SoftwareSourceCode model] end - %% --- Define Flow --- - A --> B; - B -- Cache Miss / Force Refresh --> C; - B -- Cache Hit --> Z; - - C --> D; - D --> E; - E --> F; - - F -- Yes --> G; - G --> H; - F -- No --> H; - - H -- Yes --> I; - I --> J; - H -- No --> J; - - J --> K; - K --> L; - L --> M; - M --> Y; - - %% --- Style Definitions --- - style A fill:#f9f,stroke:#333,stroke-width:2px - style Z fill:#bfa,stroke:#333,stroke-width.md:2px - style Y fill:#bfa,stroke:#333,stroke-width.md:2px - classDef agentNode fill:#dff,stroke:#333,stroke-width.md:2px - class D,G,I,J,K agentNode + E --> E1 --> E2 --> E3 --> E4 --> E5 --> E6 + E6 --> F[Run ORCID author enrichment] + F --> G{Run user enrichment} + G -- Yes --> H[Run user enrichment step] + G -- No --> I[Skip user enrichment] + H --> J{Run organization enrichment} + I --> J + J -- Yes --> K[Run organization enrichment step] + J -- No --> L[Skip organization enrichment] + K --> M[Run linked entities enrichment] + L --> M + M --> N{Run author linked entities} + N -- Yes --> O[Run author linked entities step] + N -- No --> P[Skip author linked entities] + O --> Q[Run final EPFL assessment] + P --> Q + Q --> R[Run validation] + R --> S[Save in cache] + S --> T[Return output and usage stats] ``` -## Pipeline Steps Explained - -The `Repository.run_analysis` method orchestrates the 
following steps in sequence: - -1. **Cache Check**: Before any processing, the system checks if a complete, cached result for the given repository URL already exists. If a valid cache entry is found and `force_refresh` is `false`, the cached data is returned immediately, and the pipeline stops. - -2. **GIMIE Analysis (`run_gimie_analysis`)**: - - **Purpose**: Extracts basic, structured metadata from the repository using the `gimie` tool. - - **Output**: A JSON-LD graph which is used as context for the subsequent LLM analysis. - -3. **Core LLM Analysis (`run_llm_analysis`)**: - - **Agent**: `llm_request_repo_infos` - - **Purpose**: This is the main analysis step. The agent receives the repository's content (code, READMEs, etc.) and the GIMIE output. It analyzes this context to generate the initial `SoftwareSourceCode` object. - - **Data Model**: `SoftwareSourceCode` - -4. **Author ORCID Enrichment (`run_authors_enrichment`)**: - - **Purpose**: A non-agent step that iterates through the authors identified by the LLM. If an author has an ORCID iD, this step scrapes their public ORCID profile to add affiliation data. - - **Data Model**: Modifies the `Person` objects within the `SoftwareSourceCode.author` list. +## Core stages -5. **User Enrichment (`run_user_enrichment`)** - *Optional*: - - **Triggered by**: `enrich_users=true` query parameter. - - **Agent**: `enrich_users_from_dict` - - **Purpose**: Performs a deep analysis of git authors and existing author data. It uses tools to search ORCID and the web to create detailed author profiles, including affiliation history and contribution summaries. - - **Data Model**: The agent returns a `UserEnrichmentResult`, and the `EnrichedAuthor` objects within it are converted to `Person` objects, replacing the existing author list in `self.data`. +1. Cache and repository accessibility checks. +2. GIMIE metadata retrieval. +3. Atomic LLM pipeline for core repository structure. +4. 
Optional enrichment branches (users, organizations, author-level linked entities). +5. Academic catalog linked entities + final EPFL assessment. +6. Validation and cache persistence. -6. **Organization Enrichment (`run_organization_enrichment`)** - *Optional*: - - **Triggered by**: `enrich_orgs=true` query parameter. - - **Agent**: `enrich_organizations_from_dict` - - **Purpose**: Analyzes git author emails and existing organization mentions to identify and standardize institutional affiliations. It uses the ROR (Research Organization Registry) API to fetch canonical data for organizations. - - **Data Model**: The agent returns an `OrganizationEnrichmentResult`. The `Organization` objects from this result replace the `relatedToOrganizations` list in `self.data`. +## Token accounting -7. **Academic Catalog Enrichment (`run_linked_entities_enrichment`)**: - - **Agent**: `enrich_repository_linked_entities` - - **Purpose**: Searches academic catalogs (currently EPFL Infoscience) for publications, researchers, and labs related to the repository, its authors, and its affiliated organizations. - - **Data Model**: Returns an `linkedEntitiesEnrichmentResult`. The `linkedEntitiesRelation` objects are then assigned to the `linkedEntities` fields on the main `SoftwareSourceCode` object as well as on the individual `Person` and `Organization` objects. +The repository analysis aggregates both: -8. **EPFL Final Assessment (`run_epfl_final_assessment`)**: - - **Agent**: `assess_epfl_relationship` - - **Purpose**: This is the final step in the analysis. This agent performs a holistic review of all data collected in the previous steps to make a definitive, evidence-based judgment on the repository's relationship to EPFL. - - **Data Model**: Returns an `EPFLAssessmentResult`. The findings (`relatedToEPFL`, `relatedToEPFLConfidence`, `relatedToEPFLJustification`) overwrite any previous values in `self.data` to ensure consistency. 
+- official API token usage (`input_tokens`, `output_tokens`), and +- estimated token usage (`estimated_input_tokens`, `estimated_output_tokens`). -9. **Validation & Caching (`run_validation`, `save_in_cache`)**: - - **Purpose**: The final, enriched `SoftwareSourceCode` object is validated against the Pydantic model one last time. If valid, the complete result is saved to the SQLite cache for future requests. - - **Output**: The final, enriched `SoftwareSourceCode` object is returned. +These values are returned in `APIOutput.stats`. diff --git a/docs/ESTIMATED_TOKENS_FIX.md b/docs/ESTIMATED_TOKENS_FIX.md index ffd1c0c..ad5918b 100644 --- a/docs/ESTIMATED_TOKENS_FIX.md +++ b/docs/ESTIMATED_TOKENS_FIX.md @@ -1,198 +1,28 @@ -# Estimated Token Tracking - Complete Fix +# Estimated Token Tracking - Current Status -## Date: 2025-11-02 +Token accounting is now implemented as a first-class part of analysis outputs. -## Problem +## What is tracked -User noticed estimated tokens were consistently similar (~56k input, ~3.4k output) regardless of analysis complexity, suggesting not all agents were being tracked properly. +Each analysis pipeline accumulates both: -## Root Cause Analysis +- official API token counts (`input_tokens`, `output_tokens`), and +- estimated token counts (`estimated_input_tokens`, `estimated_output_tokens`). -After comprehensive audit, found **TWO critical bugs** where estimated tokens were either: -1. Using wrong key names to extract from `estimate_tokens_from_messages()` -2. Not being calculated at all (hardcoded to 0) +These are aggregated into `APIStats` (`src/data_models/api.py`) and returned in endpoint responses. 
-## Bugs Found +## Where aggregation happens -### Bug #1: Academic Catalog Enrichment - Wrong Key Names ❌ +- Repository pipeline: `src/analysis/repositories.py` +- User pipeline: `src/analysis/user.py` +- Organization pipeline: `src/analysis/organization.py` -**Location:** `src/agents/linked_entities_enrichment.py` lines 155-157 +Each stage updates cumulative counters when `usage` metadata is present. -**Problem:** -```python -# WRONG - Using OpenAI-style key names -usage_data["estimated_input_tokens"] = estimated.get("prompt_tokens", 0) # ❌ -usage_data["estimated_output_tokens"] = estimated.get("completion_tokens", 0) # ❌ -``` +## Why estimated tokens still matter -**What happened:** -- `estimate_tokens_from_messages()` returns: `{"input_tokens": ..., "output_tokens": ..., "total_tokens": ...}` -- But code was trying to extract `"prompt_tokens"` and `"completion_tokens"` (which don't exist!) -- Result: `estimated.get("prompt_tokens", 0)` always returned `0` -- **Estimated tokens for academic catalog enrichment were always 0!** +Some model/provider paths may not always return complete usage metadata. Estimated counts provide a fallback metric for observability and cost tracking. -**Fix:** -```python -# CORRECT - Use standard key names -usage_data["estimated_input_tokens"] = estimated.get("input_tokens", 0) # ✅ -usage_data["estimated_output_tokens"] = estimated.get("output_tokens", 0) # ✅ -``` +## Operational check -### Bug #2: EPFL Assessment - Not Calculated At All ❌ - -**Location:** `src/agents/epfl_assessment.py` lines 99-104 and 119-123 - -**Problem:** -```python -# Hardcoded to 0 - no estimation at all! -"usage": { - "input_tokens": getattr(result, "input_tokens", 0), - "output_tokens": getattr(result, "output_tokens", 0), - "estimated_input_tokens": 0, # ❌ HARDCODED! - "estimated_output_tokens": 0, # ❌ HARDCODED! 
-} -``` - -**What happened:** -- EPFL assessment never called `estimate_tokens_from_messages()` -- Just hardcoded estimated tokens to 0 -- **No estimation tracking for final EPFL assessment at all!** - -**Fix:** -```python -# Added import -from ..utils.token_counter import estimate_tokens_from_messages - -# Calculate estimates -response_text = assessment_data.model_dump_json() if hasattr(assessment_data, "model_dump_json") else "" -estimated = estimate_tokens_from_messages( - system_prompt=epfl_assessment_system_prompt, - user_prompt=prompt, - response=response_text, -) - -# Extract actual tokens from result -input_tokens = 0 -output_tokens = 0 -if hasattr(result, "usage"): - usage = result.usage - input_tokens = getattr(usage, "input_tokens", 0) or 0 - output_tokens = getattr(usage, "output_tokens", 0) or 0 - - # Fallback to details if needed - if input_tokens == 0 and output_tokens == 0 and hasattr(usage, "details"): - details = usage.details - if isinstance(details, dict): - input_tokens = details.get("input_tokens", 0) - output_tokens = details.get("output_tokens", 0) - -# Return with proper usage statistics -"usage": { - "input_tokens": input_tokens, - "output_tokens": output_tokens, - "estimated_input_tokens": estimated.get("input_tokens", 0), # ✅ CALCULATED! - "estimated_output_tokens": estimated.get("output_tokens", 0), # ✅ CALCULATED! 
-} -``` - -## Verification - All Agents Checked ✅ - -### Repository Analysis Pipeline - -| Agent | Estimated Tokens | Status | -|-------|------------------|--------| -| LLM Analysis | ✅ Tracked | Working | -| Organization Enrichment | ✅ Tracked | Working | -| User Enrichment | ✅ Tracked | Working | -| Academic Catalog Enrichment | ❌ → ✅ | **FIXED** (wrong keys) | -| EPFL Assessment | ❌ → ✅ | **FIXED** (not calculated) | - -### User Analysis Pipeline - -| Agent | Estimated Tokens | Status | -|-------|------------------|--------| -| LLM Analysis | ✅ Tracked | Working | -| Organization Enrichment | ✅ Tracked | Working | -| User Enrichment | ✅ Tracked | Working | -| Academic Catalog Enrichment | ❌ → ✅ | **FIXED** (wrong keys) | -| EPFL Assessment | ❌ → ✅ | **FIXED** (not calculated) | - -### Organization Analysis Pipeline - -| Agent | Estimated Tokens | Status | -|-------|------------------|--------| -| LLM Analysis | ✅ Tracked | Working | -| Organization Enrichment | ✅ Tracked | Working | -| Academic Catalog Enrichment | ❌ → ✅ | **FIXED** (wrong keys) | -| EPFL Assessment | ❌ → ✅ | **FIXED** (not calculated) | - -## Impact - -### Before Fixes: -- **Academic catalog enrichment**: Estimated tokens always 0 (missing ~10-15k tokens per run) -- **EPFL assessment**: Estimated tokens always 0 (missing ~5-10k tokens per run) -- **Total missing**: ~15-25k estimated tokens per analysis run -- **Result**: Reported estimates were ~40% too low! 
- -### After Fixes: -- ✅ Academic catalog enrichment properly estimates tokens -- ✅ EPFL assessment properly estimates tokens -- ✅ All agents now contribute to total estimated token count -- ✅ Estimated totals should be **significantly higher** and vary by analysis complexity - -## Testing - -### Expected Changes: - -**Before:** -```json -{ - "estimated_input_tokens": 56761, // Missing ~20k - "estimated_output_tokens": 3417, // Missing ~2k - "estimated_total_tokens": 60178 // Should be ~80-85k -} -``` - -**After:** -```json -{ - "estimated_input_tokens": 75000-80000, // +academic catalog +EPFL - "estimated_output_tokens": 5000-6000, // +academic catalog +EPFL - "estimated_total_tokens": 80000-86000 // More accurate! -} -``` - -### Variation by Complexity: - -**Simple repo** (few authors, no EPFL relation): -- Estimated total: ~60-70k tokens - -**Complex repo** (many authors, EPFL related, academic catalog hits): -- Estimated total: ~90-110k tokens - -**DeepLabCut example** (lots of authors, publications, EPFL): -- Estimated total: ~100-120k tokens - -## Files Modified - -1. `src/agents/linked_entities_enrichment.py` - - Fixed key names: `prompt_tokens` → `input_tokens` - - Fixed key names: `completion_tokens` → `output_tokens` - -2. `src/agents/epfl_assessment.py` - - Added import: `estimate_tokens_from_messages` - - Added token estimation calculation - - Properly extract actual tokens from result - - Return calculated estimated tokens instead of hardcoded 0 - -## Conclusion - -✅ **All agents now properly track estimated tokens!** - -The estimated token counts will now: -1. Include ALL agent calls (academic catalog + EPFL assessment were missing) -2. Vary based on actual analysis complexity -3. Be ~30-40% higher than before (more accurate) -4. Better reflect the actual LLM usage in the system - -The user's suspicion was **100% correct** - estimated tokens were not fully added! 
🎯 +When validating new agents/stages, confirm both official and estimated counters increase in the endpoint `stats` payload. diff --git a/docs/INFOSCIENCE_API_FINDINGS.md b/docs/INFOSCIENCE_API_FINDINGS.md index dfb02c9..32d860f 100644 --- a/docs/INFOSCIENCE_API_FINDINGS.md +++ b/docs/INFOSCIENCE_API_FINDINGS.md @@ -1,137 +1,40 @@ -# Infoscience API Investigation Findings +# Infoscience API Findings -## Date: 2025-11-02 +This page captures implementation-level findings reflected in `src/context/infoscience.py`. -## Summary -Investigation of EPFL's Infoscience API (DSpace 7.6) to understand endpoint behavior and fix 403/404 errors. +## Effective API base -## Key Findings +- `https://infoscience.epfl.ch/server/api` -### 1. General Search (WITHOUT dsoType) -- **Endpoint**: `/discover/search/objects` -- **Status**: ✅ WORKS -- **Query**: `?query=Mathis Lab&size=3` -- **Results**: 95 results found -- **Returns**: Mixed types (items, etc.) +## Practical endpoint behavior used by this project -### 2. Search with dsoType=item -- **Endpoint**: `/discover/search/objects?dsoType=item` -- **Status**: ✅ WORKS -- **Query**: `?query=DeepLabCut&size=3&dsoType=item` -- **Results**: 273 results found -- **Use for**: Publications, items +1. `GET /discover/search/objects` with `configuration=researchoutputs` +- Reliable for publication search and repository-name matching. -### 3. Search with dsoType=community -- **Endpoint**: `/discover/search/objects?dsoType=community` -- **Status**: ⚠️ RETURNS EMPTY (not 403) -- **Results**: 0 results -- **Conclusion**: EPFL may not use DSpace communities or they're not searchable +2. `GET /discover/search/objects` with `configuration=person` +- Preferred path for person profiles. +- Fallback to publication author metadata when no person profile is returned. -### 4. 
Direct UUID Access -- **Endpoint**: `/core/items/{uuid}` -- **Status**: ✅ WORKS -- **Example**: `/core/items/492614b1-7dc9-4d24-81f7-648f1223de71` -- **Returns**: Full item metadata -- **Use for**: Direct access to publications, persons, orgunits by UUID +3. `GET /discover/search/objects` with `configuration=orgunit` +- Preferred path for labs/orgunits. +- Fallback to publication affiliation metadata when no orgunit record is returned. -### 5. Search Publications -- **Function**: `search_publications()` -- **Status**: ✅ WORKS -- **Uses**: `/discover/search/objects` with `configuration=researchoutputs` -- **Results**: 273 results for "DeepLabCut" +4. `GET /entities/{entity_type}/{uuid}` and `GET /core/items/{uuid}` +- Used for direct UUID resolution. -### 6. Search Authors -- **Function**: `search_authors()` -- **Status**: ✅ FIXED -- **Endpoint Tried**: `/eperson/profiles/search/byName` (404 Not Found - doesn't exist) -- **Solution**: Use `configuration=person` like the web UI -- **Working Endpoint**: `/discover/search/objects?query=alexander%20mathis&configuration=person` -- **Web UI**: https://infoscience.epfl.ch/search?page=1&configuration=person&query=alexander%20mathis -- **Results**: Successfully returns person profiles with full metadata -- **Fallback**: Search by author name in publications (dc.contributor.author field) if no person profiles found +## Normalization rules implemented -### 7. Search Labs -- **Function**: `search_labs()` -- **Status**: ✅ FIXED -- **Endpoint Tried**: `/discover/search/objects?dsoType=community` (returns 0 results) -- **Solution**: Use `configuration=orgunit` like the web UI for organizational units -- **Working Endpoint**: `/discover/search/objects?query=mathis+lab&configuration=orgunit` -- **Results**: Successfully returns organizational unit profiles -- **Fallback**: Search publications and extract lab info from metadata (dc.contributor.lab, dc.contributor.unit, etc.) 
+- Publication UUID -> `https://infoscience.epfl.ch/entities/publication/{uuid}` +- Person UUID -> `https://infoscience.epfl.ch/entities/person/{uuid}` +- Orgunit UUID -> `https://infoscience.epfl.ch/entities/orgunit/{uuid}` -## Entity Endpoints from User URLs +## Failure handling -The user provided these working entity URLs: -- **Orgunits**: `https://infoscience.epfl.ch/entities/orgunit/{uuid}` -- **Persons**: `https://infoscience.epfl.ch/entities/person/{uuid}` -- **Publications**: `https://infoscience.epfl.ch/entities/publication/{uuid}` +- HTTP and timeout errors are logged and converted to safe empty result structures. +- Individual result items that fail to parse are skipped; the remaining items are still returned. +- Agent tools include local in-memory search caching to avoid repeated identical queries. -These suggest the existence of `/entities/` API endpoints that we should investigate and potentially use. +## Current limitation boundary -## Implementation Status - -### ✅ 1. Author Search - FIXED -- Removed the broken `/eperson/profiles/search/byName` endpoint reference -- Now uses `configuration=person` (primary method) -- Falls back to publication author search if needed -- Successfully finds profiles like "Alexander Mathis" and "Mackenzie Weygandt Mathis" - -### ✅ 2. Lab Search - FIXED -- Removed `dsoType=community` approach (was returning empty) -- Now uses `configuration=orgunit` (primary method) -- Falls back to searching publications and extracting lab affiliations -- Successfully finds organizational units like "Mathis Lab" - -### ✅ 3. 
Direct Entity Access - IMPLEMENTED -Created `get_entity_by_uuid()` function: -```python -async def get_entity_by_uuid(uuid: str, entity_type: Optional[str] = None): - """ - Get entity directly by UUID using /core/items/{uuid} - - Args: - uuid: Entity UUID - entity_type: Optional hint ("publication", "person", "orgunit") - - Returns: - Entity data parsed based on type - """ - # Uses /core/items/{uuid} which works for all entity types -``` - -### ✅ 4. Entity Type Detection - IMPLEMENTED -- Parser functions detect entity type from metadata -- `_parse_publication()`, `_parse_author()`, `_parse_lab()` handle different types -- Automatic type detection based on metadata structure - -## Configuration Parameter - -The `configuration` parameter works and maps to the web UI search configurations: -- `configuration=researchoutputs` - for publications ✅ TESTED -- `configuration=person` - for person profiles ✅ TESTED (like web UI person search) -- `configuration=orgunit` - for organizational units ✅ TESTED (labs, departments, etc.) - -## Conclusion - -### ✅ All Issues Fixed! - -Original problems: -1. **Author search endpoint doesn't exist** - ✅ FIXED: Use `configuration=person` -2. **dsoType=community/collection returns empty** - ✅ FIXED: Use `configuration=orgunit` -3. 
**Direct entity access** - ✅ IMPLEMENTED: `get_entity_by_uuid()` function added - -The API now works excellently for: -- ✅ Publication search (very effective) - `configuration=researchoutputs` -- ✅ Person/author search - `configuration=person` -- ✅ Organizational unit/lab search - `configuration=orgunit` -- ✅ Direct UUID-based item retrieval - `/core/items/{uuid}` -- ✅ General keyword search - -### Key Insight - -The key was understanding that Infoscience uses **configuration-based search** (like the web UI) rather than the traditional DSpace dsoType filtering: -- **Web UI**: Uses `?configuration=person` query parameter -- **API**: Same parameter works in `/discover/search/objects` endpoint -- **Configurations available**: `researchoutputs`, `person`, `orgunit` - -This matches how the web UI works and provides direct access to typed entity searches! +- Infoscience retrieval is integrated end-to-end for repository-, user-, and organization-level analysis. +- In repository flow, only repository-level linked entities are currently assigned automatically; author-level relation assignment remains a follow-up path (`run_author_linked_entities_enrichment` contains TODO parsing logic). diff --git a/docs/INFOSCIENCE_INTEGRATION.md b/docs/INFOSCIENCE_INTEGRATION.md index f8b53b3..e360583 100644 --- a/docs/INFOSCIENCE_INTEGRATION.md +++ b/docs/INFOSCIENCE_INTEGRATION.md @@ -1,361 +1,61 @@ -# Infoscience API Integration - Implementation Summary - -## Overview -This document describes the implementation of Infoscience API integration for querying EPFL's research repository from the three AI agents (repository, user, and organization enrichment). - -## What Was Implemented - -### 1. 
Data Models (`src/data_models/infoscience.py`) -Created comprehensive Pydantic models for Infoscience data: - -- **`InfosciencePublication`**: Publication metadata including title, authors, DOI, abstract, dates, lab info, and repository URLs -- **`InfoscienceAuthor`**: Author/researcher information including name, email, ORCID, affiliation, and publication count -- **`InfoscienceLab`**: Laboratory/organizational unit details including name, description, parent organization, and research areas -- **`InfoscienceSearchResult`**: Wrapper for search results with pagination information - -Each model includes a `to_markdown()` method for converting structured data to LLM-friendly markdown format. - -### 2. HTTP Client & API Functions (`src/context/infoscience.py`) - -#### Configuration Constants -- `INFOSCIENCE_BASE_URL`: Base URL for EPFL's Infoscience API -- `DEFAULT_MAX_RESULTS`: Default result limit (10) -- `REQUEST_TIMEOUT`: Request timeout (30 seconds) - -#### Core HTTP Functions -- `_make_api_request()`: Async HTTP request helper with error handling -- `_parse_metadata()`: Extract single metadata field from DSpace responses -- `_parse_metadata_list()`: Extract multiple metadata values -- `_parse_publication()`: Convert DSpace item to InfosciencePublication - -#### Search Functions -- `search_publications()`: Search for publications by title, DOI, or keywords -- `search_authors()`: Search for researchers by name -- `search_labs()`: Search for labs and organizational units -- `get_author_publications()`: Get all publications by a specific author - -All functions use `httpx` for async operations and return structured Pydantic models. - -### 3. 
PydanticAI Tool Functions (`src/context/infoscience.py`) - -Four tool functions that agents can call: - -- **`search_infoscience_publications_tool(query, max_results=10)`** - - Searches publications by any criteria - - Returns markdown-formatted results - - Max 50 results per query - -- **`search_infoscience_authors_tool(name, max_results=10)`** - - Searches for authors/researchers - - Returns author profiles with affiliations - - Max 50 results per query - -- **`search_infoscience_labs_tool(name, max_results=10)`** - - Searches for labs and organizational units - - Returns lab information with descriptions - - Max 50 results per query - -- **`get_author_publications_tool(author_name, max_results=10)`** - - Gets all publications by a specific author - - Returns full publication list in markdown - - Max 50 results per query - -### 4. Agent Integration - -#### Repository Agent (`src/agents/repository.py`) -- Imported Infoscience tools: `search_infoscience_publications_tool`, `get_author_publications_tool` -- Tools registered when creating agent -- Updated system prompt with tool documentation - -**Use cases:** -- Verify publication citations mentioned in README -- Find related publications for software -- Verify author EPFL affiliations - -#### User Agent (`src/agents/user.py`) -- Imported Infoscience tools: `search_infoscience_authors_tool`, `get_author_publications_tool` -- Fixed to use proper `run_agent_with_fallback` signature with output_type and system_prompt -- Tools registered when creating agent -- Updated system prompt with tool documentation - -**Use cases:** -- Find EPFL profiles for GitHub users -- Verify researcher affiliations -- Get publication history to determine research areas - -#### Organization Enrichment Agent (`src/agents/organization_enrichment.py`) -- Imported Infoscience tools: `search_infoscience_labs_tool`, `search_infoscience_publications_tool`, `get_author_publications_tool` -- Tools added to agent creation function -- Updated system 
prompt with tool documentation - -**Use cases:** -- Verify lab names are actual EPFL labs -- Confirm author affiliations via publications -- Get detailed organizational structure information - -### 5. Agent Management Updates (`src/agents/agents_management.py`) - -Modified to support tool registration: -- `create_agent_from_config()`: Added optional `tools` parameter -- `run_agent_with_fallback()`: Added optional `tools` parameter and passes through to agent creation - -### 6. System Prompts Updated - -#### Repository Agent Prompt (`src/agents/repository_prompts.py`) -Added section explaining: -- Available Infoscience tools -- When to use them (author verification, citation lookup, EPFL relationship) -- Example usage scenarios - -#### User Agent Prompt (`src/agents/prompts.py`) -Added section explaining: -- Available Infoscience tools for user analysis -- When to use them (finding EPFL profiles, verifying affiliations) -- Example usage scenarios - -#### Organization Agent Prompt (`src/agents/organization_prompts.py`) -Added section explaining: -- Available Infoscience tools for organization analysis -- When to use them (lab verification, author affiliation confirmation) -- Example usage scenarios - -### 7. Module Exports - -#### `src/context/__init__.py` -Exported all four Infoscience tool functions for easy import. - -#### `src/data_models/__init__.py` -Exported all four Infoscience data models for type hints and validation. - -## API Endpoints Used - -The implementation queries these DSpace 7.6 API endpoints: - -1. **`/api/discover/search/objects`** - General search with query parameters - - Used for publication, author, and lab searches - - Supports DSpace query syntax (e.g., `dc.contributor.author:name`) - - Supports `dsoType` parameter to filter by type (item, community, collection) - -2. 
**`/api/eperson/profiles/search/byName`** - Search author profiles - - Used for direct author profile lookups - -## Authentication - -The implementation supports **optional authentication** via the `INFOSCIENCE_TOKEN` environment variable: - -```bash -export INFOSCIENCE_TOKEN="your-token-here" -``` - -**When authentication is used:** -- Lab/organization searches use the token (some endpoints may require it) -- Token is sent as `Authorization: Bearer {token}` header -- Enables access to more comprehensive search results - -**When to use authentication:** -- If you get 404 errors on lab/community searches -- If search results seem limited -- For accessing protected or detailed metadata - -**Getting a token:** -Visit [EPFL Infoscience API documentation](https://www.epfl.ch/campus/library/services-researchers/infoscience-en/help-infoscience/export-share-and-reuse-infoscience-data-api-oai-exports-etc/) for token generation instructions. - -## Features - -### Logging & Monitoring -- **Tool invocation logging**: Each tool call logs with 🔍 emoji when called by agents -- **Success logging**: Results logged with ✓ showing total results found -- **Warning logging**: Empty results or issues logged with ⚠ -- **Error logging**: Failures logged with ✗ including full exception details - -Example log output: -``` -INFO: 🔍 Agent tool called: search_infoscience_publications_tool(query='deep learning', max_results=10) -INFO: ✓ Infoscience publications search returned 24 total results -``` - -### Error Handling -- HTTP errors caught and logged with full details -- Timeouts handled gracefully (30s timeout) -- Empty results return structured responses (not errors) -- Invalid responses logged with exception tracebacks -- All errors return user-friendly markdown messages to the agent - -### Pagination -- Default: 10 results per query -- Configurable up to 50 results -- Results include total count and current page info - -### Markdown Formatting -- Clean, readable output for LLM consumption 
-- Includes all relevant metadata -- Links to original resources -- Result counts and pagination info - -### Type Safety -- All responses validated with Pydantic models -- Type hints throughout -- Structured data with optional fields properly handled - -## Testing Recommendations - -To test the implementation: - -1. **Unit Tests** (suggested location: `tests/test_infoscience.py`): - ```python - # Test data model validation - # Test markdown conversion - # Test API request functions (with mocked responses) - # Test tool function output formats - ``` - -2. **Integration Tests**: - ```python - # Test actual API calls (may be slow) - # Test agent tool usage - # Test end-to-end workflows - ``` - -3. **Manual Testing**: - - Run repository analysis on EPFL repos - - Check if agents use tools appropriately - - Verify tool responses are helpful - -## Monitoring Tool Usage - -### Log Output -When agents call Infoscience tools, you'll see clear logging output following this pattern: - -```log -INFO: 🔍 Agent tool called: search_infoscience_publications_tool(query='machine learning imaging', max_results=10) -INFO: Found 15 publications for query: machine learning imaging -INFO: ✓ Infoscience publications search returned 15 total results - -INFO: 🔍 Agent tool called: get_author_publications_tool(author_name='Martin Vetterli', max_results=5) -INFO: Fetching publications for author: Martin Vetterli -INFO: Found 5 publications for query: Martin Vetterli -INFO: ✓ Found 127 publications for author 'Martin Vetterli' -``` - -### Searching Logs -To find when tools were used: -```bash -# Find all Infoscience tool calls -grep "🔍 Agent tool called: search_infoscience" logs/*.log - -# Find successful searches -grep "✓ Infoscience" logs/*.log - -# Find errors -grep "✗ Error in search_infoscience" logs/*.log -``` - -The logging pattern matches the existing tool logging in the organization enrichment agent (ROR search, web search), making it consistent across all agent tools. 
- -## Usage Examples - -### Direct API Usage -```python -from src.context.infoscience import search_publications, search_authors - -# Search for publications -results = await search_publications("deep learning", max_results=5) -for pub in results.publications: - print(pub.to_markdown()) - -# Search for authors -authors = await search_authors("Jean Dupont", max_results=3) -for author in authors.authors: - print(author.to_markdown()) -``` - -### Agent Tool Usage -The tools are automatically available to agents during their execution. The agent can call them like: - -**Best Practice - Search by Repository Name:** -```python -# Repository: https://github.com/sdsc-ordes/gimie -# The agent extracts "gimie" and searches for it -search_infoscience_publications_tool("gimie") -# This finds publications that mention the tool! -``` - -**Other common usage:** -```python -# Search for publications -search_infoscience_publications_tool("computer vision") - -# Find author's publications and affiliations -get_author_publications_tool("Martin Vetterli") - -# Verify if a lab exists -search_infoscience_labs_tool("CVLAB") -``` - -**Strategy:** -1. **First:** Search for the repository/tool name itself to find related publications -2. **Then:** Search for authors mentioned in the repository -3. **Finally:** Verify lab affiliations if needed - -This approach helps find publications that cite or describe the software tool! 
- -## Dependencies - -All required dependencies were already present: -- `httpx` - Async HTTP requests -- `pydantic` - Data validation -- `pydantic-ai` - Agent framework - -## Files Created/Modified - -### New Files -- `src/data_models/infoscience.py` - Data models (348 lines) -- `src/context/infoscience.py` - API client and tools (698 lines) - -### Modified Files -- `src/agents/agents_management.py` - Added tools parameter -- `src/agents/repository.py` - Integrated Infoscience tools -- `src/agents/user.py` - Integrated Infoscience tools, fixed signature -- `src/agents/organization_enrichment.py` - Integrated Infoscience tools -- `src/agents/repository_prompts.py` - Added tool documentation -- `src/agents/prompts.py` - Added tool documentation -- `src/agents/organization_prompts.py` - Added tool documentation -- `src/context/__init__.py` - Exported Infoscience tools -- `src/data_models/__init__.py` - Exported Infoscience models - -### Total Lines Added -Approximately 1,100+ lines of code including: -- Data models with validation -- API client functions -- Tool wrappers -- Documentation updates -- Type hints and error handling - -## Known Limitations - -1. **Rate Limiting**: No rate limiting implemented - may need to add if API has limits -2. **Caching**: No caching of results - repeated searches will hit API each time -3. **Authentication**: Current implementation uses public API only - no authentication -4. **Testing**: No automated tests included - should be added -5. **Mock Data**: No mock responses for development/testing - -## Future Enhancements - -Potential improvements: -1. Add response caching (Redis/in-memory) -2. Implement rate limiting -3. Add authentication support for protected resources -4. Create comprehensive test suite -5. Add more specific search methods (by DOI, by lab, etc.) -6. Implement pagination for results > 50 -7. Add search result relevance scoring -8. 
Support for advanced DSpace query syntax - -## Conclusion - -The Infoscience API integration is complete and functional. All three agents now have access to search EPFL's repository for: -- Publications and citations -- Author profiles and affiliations -- Laboratory and organizational information - -The implementation follows the project's existing patterns and provides type-safe, well-documented tools that agents can use to enrich their analysis with EPFL-specific information. +# Infoscience Integration + +This project integrates EPFL Infoscience through `src/context/infoscience.py` and uses it from linked-entities and enrichment pipelines. + +## Scope in current code + +- HTTP client and parsing helpers: + - `_make_api_request` + - `_parse_publication`, `_parse_author`, `_parse_lab` +- Search functions: + - `search_publications` + - `search_authors` + - `search_labs` + - `get_author_publications` + - `get_entity_by_uuid` +- Agent tool wrappers (markdown output + in-memory dedup cache): + - `search_infoscience_publications_tool` + - `search_infoscience_authors_tool` + - `search_infoscience_labs_tool` + - `get_author_publications_tool` + +## Data models used + +- `InfosciencePublication` (`src/data_models/infoscience.py`) +- `InfoscienceAuthor` (`src/data_models/infoscience.py`) +- `InfoscienceOrgUnit` (`src/data_models/infoscience.py`) +- `linkedEntitiesRelation` and `linkedEntitiesEnrichmentResult` (`src/data_models/linked_entities.py`) + +## Search strategy + +```mermaid +flowchart TD + A[Agent asks for Infoscience context] --> B[Tool wrapper in src/context/infoscience.py] + B --> C{Cached in _search_cache?} + C -- Yes --> D[Return cached markdown] + C -- No --> E[Call /server/api/discover/search/objects] + E --> F[Parse DSpace payload to Pydantic models] + F --> G[Render markdown for agent] + G --> H[Cache result in-memory] +``` + +## Query configurations currently used + +- Publications: `configuration=researchoutputs` +- Person profiles: 
`configuration=person` (with publication fallback) +- Organizational units/labs: `configuration=orgunit` (with publication metadata fallback) + +## Where Infoscience is consumed + +- Repository pipeline (`src/analysis/repositories.py`): + - `run_linked_entities_enrichment` uses atomic linked-entity search + structuring. +- User pipeline (`src/analysis/user.py`): + - `run_linked_entities_enrichment` calls `enrich_user_linked_entities`. +- Organization pipeline (`src/analysis/organization.py`): + - `run_linked_entities_enrichment` calls `enrich_organization_linked_entities`. + +## Operational notes + +- `INFOSCIENCE_TOKEN` is optional and used when authenticated requests are needed. +- Tool wrappers intentionally cache identical searches during one process lifetime to reduce repeated calls. +- Returned entities are normalized to Infoscience entity URLs when UUIDs are available. diff --git a/docs/JSONLD_CONVERSION.md b/docs/JSONLD_CONVERSION.md index b98b1e6..e6a5c8f 100644 --- a/docs/JSONLD_CONVERSION.md +++ b/docs/JSONLD_CONVERSION.md @@ -1,1149 +1,55 @@ # JSON-LD Conversion Guide -This document explains how the Git Metadata Extractor converts Pydantic models to JSON-LD (JSON for Linking Data) format, and how to extend this system to new models. +This guide documents the active conversion implementation in `src/data_models/conversion.py`. -## Table of Contents +## Supported conversion directions -1. [Overview](#overview) -2. [Architecture](#architecture) -3. [How It Works](#how-it-works) -4. [Extending to New Models](#extending-to-new-models) -5. [Field Mapping Reference](#field-mapping-reference) -6. [API Integration](#api-integration) -7. [Troubleshooting](#troubleshooting) -8. [Examples](#examples) +1. Pydantic -> JSON-LD +- Entry: `convert_pydantic_to_jsonld(pydantic_obj, base_url=None)` +- Produces `{"@context": ..., "@graph": [...]}` +- Used by repository API endpoint `/v1/repository/llm/json-ld/{full_path:path}` ---- +2. 
JSON-LD -> Pydantic +- Entry: `convert_jsonld_to_pydantic(jsonld_graph)` +- Current target: `SoftwareSourceCode` (repository graph) -## Overview +## Conversion flow -### What is JSON-LD? - -JSON-LD (JSON for Linking Data) is a lightweight syntax for encoding Linked Data using JSON. It allows data to be: -- **Machine-readable**: Structured for automated processing -- **Semantically rich**: Fields mapped to standard vocabularies (schema.org, custom ontologies) -- **Interoperable**: Can be integrated with other semantic web systems -- **Human-friendly**: Still readable as plain JSON - -### Why Use JSON-LD? - -1. **Imaging Plaza Integration**: The Imaging Plaza project uses JSON-LD for metadata -2. **Semantic Web Compatibility**: Compatible with RDF, SPARQL, and other semantic tools -3. **Standard Vocabularies**: Leverages schema.org and domain-specific ontologies -4. **Data Integration**: Enables linking across different data sources - -### JSON-LD Structure - -```json -{ - "@context": { - "schema": "http://schema.org/", - "imag": "https://imaging-plaza.epfl.ch/ontology/", - "md4i": "https://w3id.org/md4i/" - }, - "@graph": [ - { - "@id": "https://github.com/user/repo", - "@type": "http://schema.org/SoftwareSourceCode", - "schema:name": "Repository Name", - "schema:author": [ - { - "@type": "http://schema.org/Person", - "schema:name": "Jane Doe" - } - ] - } - ] -} -``` - -**Key Components:** -- **@context**: Namespace prefix definitions -- **@graph**: Array of entities (resources) -- **@id**: Unique identifier (usually a URL) -- **@type**: Semantic type (from schema.org or custom ontology) - ---- - -## Architecture - -### System Components - -``` -┌─────────────────┐ -│ Pydantic Model │ (SoftwareSourceCode, GitHubUser, etc.) 
-└────────┬────────┘ - │ - ├─── model.convert_pydantic_to_jsonld() - │ - v -┌─────────────────────────────┐ -│ Generic Converter Function │ (convert_pydantic_to_jsonld) -│ - Field mapping lookup │ -│ - Recursive conversion │ -│ - Special type handling │ -└────────┬────────────────────┘ - │ - ├─── PYDANTIC_TO_ZOD_MAPPING (field names → URIs) - ├─── type_mapping (classes → semantic types) - │ - v -┌─────────────────┐ -│ JSON-LD Dict │ {@context, @graph} -└─────────────────┘ -``` - -### File Locations - -- **Generic Converter**: `src/data_models/conversion.py` - - `convert_pydantic_to_jsonld()` function - - `PYDANTIC_TO_ZOD_MAPPING` dictionary - - Type mappings - -- **Model-Specific Methods**: In respective model files - - `src/data_models/repository.py` → `SoftwareSourceCode.convert_pydantic_to_jsonld()` - - `src/data_models/user.py` → `GitHubUser.convert_pydantic_to_jsonld()` (if implemented) - - `src/data_models/organization.py` → `GitHubOrganization.convert_pydantic_to_jsonld()` (if implemented) - -- **API Integration**: `src/api.py` - - JSON-LD endpoints (`/v1/repository/llm/json-ld/`, `/v1/repository/gimie/json-ld/`) - ---- - -## How It Works - -### Step-by-Step Conversion Process - -#### 1. Model Method Call - -The model instance calls the generic converter: - -```python -class SoftwareSourceCode(BaseModel): - name: str - author: List[Person] - # ... more fields - - def convert_pydantic_to_jsonld(self) -> dict: - from src.data_models.conversion import convert_pydantic_to_jsonld - - # Determine base URL for @id - base_url = str(self.codeRepository[0]) if self.codeRepository else None - - return convert_pydantic_to_jsonld(self, base_url=base_url) -``` - -#### 2. Generic Converter - -The generic converter (`convert_pydantic_to_jsonld()`) processes the model: - -```python -def convert_pydantic_to_jsonld( - pydantic_obj: Any, - base_url: Optional[str] = None -) -> Union[Dict, List]: - """Convert any Pydantic model to JSON-LD format.""" - - # 1. 
Get model class name - model_name = type(pydantic_obj).__name__ - - # 2. Look up field mappings - field_mapping = PYDANTIC_TO_ZOD_MAPPING.get(model_name, {}) - - # 3. Create entity dict - entity = {} - - # 4. Add @id and @type - entity["@id"] = base_url or f"urn:{model_name}:{id(pydantic_obj)}" - entity["@type"] = type_mapping.get(type(pydantic_obj), "http://schema.org/Thing") - - # 5. Convert fields - for field_name, field_value in pydantic_obj.model_dump().items(): - if field_value is None: - continue - - # Look up semantic URI for this field - semantic_key = field_mapping.get(field_name, field_name) - - # Convert field value based on type - entity[semantic_key] = convert_field_value(field_value) - - # 6. Wrap in @context and @graph - return { - "@context": {...}, - "@graph": [entity] - } -``` - -#### 3. Field Value Conversion - -Different types are handled specially: - -**Simple Types** (str, int, float, bool): -```python -"schema:name": {"@value": "Repository Name"} -``` - -**URLs** (HttpUrl): -```python -"schema:codeRepository": [{"@id": "https://github.com/user/repo"}] -``` - -**Dates** (date, datetime): -```python -"schema:datePublished": {"@value": "2024-01-15"} -``` - -**Enums**: -```python -"imag:discipline": [{"@value": "Computer Science"}] -``` - -**Nested Models** (Person, Organization): -```python -"schema:author": [ - { - "@type": "http://schema.org/Person", - "schema:name": {"@value": "Jane Doe"}, - "md4i:orcidId": {"@id": "https://orcid.org/0000-0001-2345-6789"} - } -] -``` - -**Lists**: -Each item is converted recursively, maintaining structure. - -#### 4. 
Field Mapping Lookup - -The `PYDANTIC_TO_ZOD_MAPPING` dictionary maps Pydantic field names to semantic URIs: - -```python -PYDANTIC_TO_ZOD_MAPPING = { - "SoftwareSourceCode": { - "name": "schema:name", - "description": "schema:description", - "codeRepository": "schema:codeRepository", - "author": "schema:author", - "license": "schema:license", - "programmingLanguage": "schema:programmingLanguage", - "discipline": "imag:discipline", - "relatedToOrganizationsROR": "imag:relatedToOrganizationsROR", - "relatedToEPFL": "imag:relatedToEPFL", - # ... more fields - }, -} -``` - -**Namespace Prefixes:** -- `schema:` → `http://schema.org/` (Standard web schemas) -- `sd:` → `https://w3id.org/okn/o/sd#` (Software Description Ontology) -- `imag:` → `https://imaging-plaza.epfl.ch/ontology/` (Imaging Plaza custom ontology) -- `md4i:` → `https://w3id.org/md4i/` (Metadata for Images ontology) - ---- - -## Extending to New Models - -### Complete Example: Adding JSON-LD to `GitHubUser` - -Let's walk through adding JSON-LD support to the `GitHubUser` model step by step. - -#### Step 1: Define Field Mappings - -In `src/data_models/conversion.py`, add to `PYDANTIC_TO_ZOD_MAPPING`: - -```python -PYDANTIC_TO_ZOD_MAPPING: Dict[str, Dict[str, str]] = { - # ... existing mappings ... 
- - "GitHubUser": { - # Core identity - "name": "schema:name", - "fullname": "schema:givenName", - "githubHandle": "schema:identifier", - - # GitHub metadata - "githubUserMetadata": "imag:githubUserMetadata", - - # Organization relationships - "relatedToOrganization": "imag:relatedToOrganizations", - "relatedToOrganizationsROR": "imag:relatedToOrganizationsROR", - "relatedToOrganizationJustification": "imag:relatedToOrganizationJustification", - - # Discipline and position - "discipline": "imag:discipline", - "disciplineJustification": "imag:disciplineJustification", - "position": "schema:jobTitle", - "positionJustification": "imag:positionJustification", - - # EPFL relationship - "relatedToEPFL": "imag:relatedToEPFL", - "relatedToEPFLJustification": "imag:relatedToEPFLJustification", - "relatedToEPFLConfidence": "imag:relatedToEPFLConfidence", - - # Infoscience - "infoscienceEntities": "imag:infoscienceEntities", - }, -} -``` - -**Mapping Strategy:** -1. Use `schema:` for standard fields (name, jobTitle, identifier) -2. Use `imag:` for Imaging Plaza-specific fields (discipline, relatedToEPFL) -3. Use `md4i:` for metadata fields (usually in nested objects) -4. Keep semantic meaning consistent with schema.org when possible - -#### Step 2: Add Type Mapping - -In `convert_pydantic_to_jsonld()` function, add to `type_mapping`: - -```python -def convert_pydantic_to_jsonld( - pydantic_obj: Any, - base_url: Optional[str] = None -) -> Union[Dict, List]: - """Convert any Pydantic model to JSON-LD format.""" - - # ... existing code ... - - # Type mappings - maps Pydantic classes to semantic types - type_mapping = { - SoftwareSourceCode: "http://schema.org/SoftwareSourceCode", - Person: "http://schema.org/Person", - Organization: "http://schema.org/Organization", - InfoscienceEntity: "http://schema.org/Thing", - GitHubUser: "http://schema.org/Person", # ← Add this - # ... more types - } - - # ... rest of function ... 
-``` - -**Type Selection:** -- Use schema.org types when available (`Person`, `Organization`, `SoftwareSourceCode`) -- Use `Thing` as a fallback for generic entities -- Consider custom ontology types for domain-specific entities - -#### Step 3: Add Model Method - -In `src/data_models/user.py`, add the conversion method: - -```python -from typing import Optional - -class GitHubUser(BaseModel): - """GitHub user profile with enrichment data""" - - name: Optional[str] = None - fullname: Optional[str] = None - githubHandle: Optional[str] = None - # ... more fields ... - - def convert_pydantic_to_jsonld(self) -> dict: - """ - Convert this GitHubUser instance to JSON-LD format. - - Returns: - dict: JSON-LD formatted data with @context and @graph - """ - from src.data_models.conversion import convert_pydantic_to_jsonld - - # Determine base URL for @id generation - # Priority: GitHub profile URL > fallback to URN - base_url = None - if self.githubHandle: - base_url = f"https://github.com/{self.githubHandle}" - - return convert_pydantic_to_jsonld(self, base_url=base_url) -``` - -**Base URL Strategy:** -- Use the most canonical URL for the entity (GitHub profile, repository URL, etc.) -- If no URL available, let converter generate a URN (`urn:ModelName:id`) -- Base URL becomes the `@id` field in JSON-LD output - -#### Step 4: Update Analysis Class - -In `src/analysis/user.py`, update `dump_results()`: - -```python -class User: - """User analysis class""" - - def __init__(self, username: str, force_refresh: bool = False): - self.username = username - self.force_refresh = force_refresh - self.data: Optional[GitHubUser] = None - # ... other initialization - - async def run_analysis(self, ...): - """Run user analysis""" - # ... analysis logic ... - pass - - def dump_results(self, output_type: str = "pydantic"): - """ - Dump results in specified format. 
- - Args: - output_type: "pydantic" (default), "json-ld", "dict" - - Returns: - Pydantic model, JSON-LD dict, or plain dict depending on output_type - """ - if output_type == "json-ld": - if self.data: - return self.data.convert_pydantic_to_jsonld() - return None - elif output_type == "pydantic": - return self.data - elif output_type == "dict": - return self.data.model_dump() if self.data else None - else: - raise ValueError(f"Unknown output_type: {output_type}") +```mermaid +flowchart TD + A[Pydantic model] --> B[Model mapping lookup<br/>PYDANTIC_TO_ZOD_MAPPING] + B --> C[Assign @id and @type] + C --> D[Convert fields and nested entities] + D --> E[Collect Person entities for graph] + E --> F[Build @context + @graph] + F --> G[JSON-serializable JSON-LD output] ``` -#### Step 5: Create API Endpoint +## Context prefixes currently emitted -In `src/api.py`, add a JSON-LD endpoint: +- `schema`: `http://schema.org/` +- `sd`: `https://w3id.org/okn/o/sd#` +- `pulse`: `https://open-pulse.epfl.ch/ontology#` +- `md4i`: `http://w3id.org/nfdi4ing/metadata4ing#` +- plus RDF/OWL/XSD helper prefixes. -```python -from fastapi import HTTPException -from src.data_models.api import APIOutput, APIStats, ResourceType -from datetime import datetime - -@app.get( - "/v1/user/llm/json-ld/{full_path:path}", - tags=["User"], - responses={ - 200: { - "description": "Successful Response", - "content": { - "application/json": { - "example": { - "link": "https://github.com/username", - "type": "user", - "parsedTimestamp": "2024-01-15T10:30:00.000Z", - "output": { - "@context": { - "schema": "http://schema.org/", - "imag": "https://imaging-plaza.epfl.ch/ontology/", - "md4i": "https://w3id.org/md4i/", - }, - "@graph": [{ - "@id": "https://github.com/username", - "@type": "http://schema.org/Person", - "schema:name": {"@value": "Jane Doe"}, - "schema:identifier": {"@value": "username"}, - "imag:discipline": [{"@value": "Computer Science"}], - "imag:relatedToEPFL": True, - }] - }, - "stats": { - "agent_input_tokens": 1234, - "agent_output_tokens": 567, - "total_tokens": 1801, - "duration": 45.23, - "status_code": 200 - } - } - } - } - } - } -) -async def get_user_jsonld( - full_path: str = Path(..., description="GitHub user URL or path"), - force_refresh: bool = Query(False, description="Force refresh from APIs"), - enrich_orgs: bool = Query(False, description="Enable organization enrichment"), - enrich_users: bool = Query(False, description="Enable user enrichment"), -) -> APIOutput: - """ - Retrieve GitHub user profile metadata in
JSON-LD format. - - This endpoint returns semantic web compatible data with @context and @graph structures. - """ - with AsyncRequestContext( - request_type="user_jsonld", - resource_url=full_path - ): - try: - # Extract username from path - username = full_path.split("/")[-1] - - # Initialize user analysis - user = User(username, force_refresh=force_refresh) - - # Run analysis - await user.run_analysis( - run_organization_enrichment=enrich_orgs, - run_user_enrichment=enrich_users, - ) - - # Check if analysis succeeded - if user.data is None: - raise HTTPException( - status_code=500, - detail=f"User analysis failed: no data generated for {username}" - ) - - # Convert to JSON-LD - try: - jsonld_output = user.dump_results(output_type="json-ld") - - if jsonld_output is None: - raise ValueError("JSON-LD conversion returned None") - - # Verify JSON-LD structure - if "@context" not in jsonld_output or "@graph" not in jsonld_output: - raise ValueError("Missing @context or @graph in JSON-LD output") - - except Exception as e: - logger.error(f"Failed to convert user to JSON-LD: {e}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"Failed to convert user data to JSON-LD: {str(e)}" - ) - - # Get usage statistics - usage_stats = user.get_usage_stats() - - # Create API stats - stats = APIStats( - agent_input_tokens=usage_stats["input_tokens"], - agent_output_tokens=usage_stats["output_tokens"], - estimated_input_tokens=usage_stats["estimated_input_tokens"], - estimated_output_tokens=usage_stats["estimated_output_tokens"], - duration=usage_stats["duration"], - start_time=usage_stats["start_time"], - end_time=usage_stats["end_time"], - status_code=usage_stats["status_code"], - ) - stats.calculate_total_tokens() - - # Return response - response = APIOutput( - link=full_path, - type=ResourceType.USER, - parsedTimestamp=datetime.now(), - output=jsonld_output, # Raw JSON-LD dict - stats=stats, - ) - - return response - - except HTTPException: - raise - except 
Exception as e: - logger.error(f"Error in user JSON-LD endpoint: {e}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"Internal server error: {str(e)}" - ) -``` - -**Key Points:** -- Use `APIOutput` with `output: Union[dict, list, ...]` (dict/list FIRST!) -- Include comprehensive error handling -- Validate JSON-LD structure before returning -- Add OpenAPI example showing realistic JSON-LD output -- Return raw dict (not wrapped in additional structure) - ---- - -## Field Mapping Reference - -### Current Mappings - -#### SoftwareSourceCode - -```python -"SoftwareSourceCode": { - # Schema.org fields - "name": "schema:name", - "description": "schema:description", - "codeRepository": "schema:codeRepository", - "conditionsOfAccess": "schema:conditionsOfAccess", - "dateCreated": "schema:dateCreated", - "dateModified": "schema:dateModified", - "datePublished": "schema:datePublished", - "isAccessibleForFree": "schema:isAccessibleForFree", - "keywords": "schema:keywords", - "author": "schema:author", - "license": "schema:license", - "image": "schema:image", - "url": "schema:url", - "featureList": "schema:featureList", - "operatingSystem": "schema:operatingSystem", - "applicationCategory": "schema:applicationCategory", - "programmingLanguage": "schema:programmingLanguage", - "softwareRequirements": "schema:softwareRequirements", - - # Software Description Ontology (sd:) - "readme": "sd:readme", - "hasExecutableInstructions": "sd:hasExecutableInstructions", - "hasDocumentation": "sd:hasDocumentation", - - # Imaging Plaza custom fields (imag:) - "repositoryType": "imag:repositoryType", - "repositoryTypeJustification": "imag:repositoryTypeJustification", - "relatedToOrganization": "imag:relatedToOrganizations", - "relatedToOrganizationJustification": "imag:relatedToOrganizationJustification", - "relatedToOrganizationsROR": "imag:relatedToOrganizationsROR", - "discipline": "imag:discipline", - "disciplineJustification": "imag:disciplineJustification", - 
"relatedToEPFL": "imag:relatedToEPFL", - "relatedToEPFLJustification": "imag:relatedToEPFLJustification", - "relatedToEPFLConfidence": "imag:relatedToEPFLConfidence", - "infoscienceEntities": "imag:infoscienceEntities", - "gitAuthors": "imag:gitAuthors", - "webpagesToCheck": "imag:webpagesToCheck", -} -``` - -#### Person - -```python -"Person": { - "name": "schema:name", - "email": "schema:email", - "affiliation": "schema:affiliation", - "affiliations": "schema:affiliation", - "currentAffiliation": "schema:affiliation", - "orcidId": "md4i:orcidId", - "contributionSummary": "imag:contributionSummary", -} -``` - -#### Organization - -```python -"Organization": { - "legalName": "schema:legalName", - "alternateNames": "schema:alternateName", - "hasRorId": "md4i:hasRorId", - "organizationType": "schema:additionalType", - "parentOrganization": "schema:parentOrganization", - "country": "schema:addressCountry", - "website": "schema:url", - "attributionConfidence": "imag:attributionConfidence", -} -``` - -#### InfoscienceEntity - -```python -"InfoscienceEntity": { - "name": "schema:name", - "url": "schema:url", - "confidence": "imag:confidence", - "justification": "imag:justification", -} -``` +## Notable implementation details -### Namespace Prefixes +- Person IRI generation priority: explicit `id` -> `githubId` -> `orcid` -> first email. +- `linkedEntities` are preserved and can also contribute DOI-derived `schema:citation` entries. +- Enums and dates are normalized to JSON-compatible values before output. +- `convert_jsonld_to_pydantic` currently expects a `SoftwareSourceCode` entity in graph form. -| Prefix | Full URI | Purpose | -|--------|----------|---------| -| `schema:` | `http://schema.org/` | Standard web schemas (name, author, license, etc.) 
| -| `sd:` | `https://w3id.org/okn/o/sd#` | Software Description Ontology (readme, documentation) | -| `imag:` | `https://imaging-plaza.epfl.ch/ontology/` | Imaging Plaza custom ontology (discipline, EPFL relations) | -| `md4i:` | `https://w3id.org/md4i/` | Metadata for Images (ORCID, ROR IDs) | +## API usage -### Adding New Fields +Repository JSON-LD endpoint internally runs: -When adding new fields to Pydantic models: - -1. **Choose the right namespace**: - - Use `schema:` if the concept exists in schema.org - - Use `imag:` for domain-specific fields (imaging, research) - - Use `md4i:` for metadata/identifier fields - - Use `sd:` for software-specific fields - -2. **Check schema.org**: https://schema.org/ - - Search for the concept (e.g., "email" → `schema:email`) - - Use the exact property name from schema.org - -3. **Document custom fields**: If using `imag:` or custom namespaces, document in Imaging Plaza ontology - ---- - -## API Integration - -### APIOutput Model - -The `APIOutput` model wraps all API responses. For JSON-LD endpoints, special handling is required. 
- -#### Critical: Union Type Ordering - -```python -class APIOutput(BaseModel): - """API output model for all endpoints""" - - model_config = {"arbitrary_types_allowed": True} - - link: HttpUrl = None - type: ResourceType = None - parsedTimestamp: datetime = None - - # ✅ CORRECT - dict/list FIRST in Union - output: Union[dict, list, SoftwareSourceCode, GitHubOrganization, GitHubUser, Any] = None - - stats: APIStats = None -``` - -**Why this matters:** -- Pydantic validates Union types left-to-right -- If models come first, Pydantic tries to coerce dict to model -- This corrupts JSON-LD structure (loses @context, wrong field names) -- Putting `dict, list` first preserves raw JSON-LD structure - -#### Field Validator - -Preserve dict/list without conversion: - -```python -@field_validator("output", mode="before") -@classmethod -def preserve_dict_output(cls, v): - """Preserve dict/list output as-is without converting to Pydantic models.""" - if isinstance(v, (dict, list)): - return v - return v -``` - -#### Model Serializer - -Keep dict/list during serialization: - -```python -@model_serializer(mode='wrap') -def serialize_model(self, serializer): - """Custom serializer to preserve dict/list in output field.""" - data = serializer(self) - if isinstance(self.output, (dict, list)): - data['output'] = self.output - return data -``` - -### Cache Considerations - -JSON-LD endpoints should follow the same caching pattern: - -```python -# In endpoint -cache_manager = get_cache_manager() -cache_key = f"user_jsonld:{username}" - -# Check cache -if not force_refresh: - cached = cache_manager.get(cache_key) - if cached: - return cached - -# ... run analysis ... - -# Cache result (365 days) -cache_manager.set(cache_key, response, ttl=365*24*60*60) -``` - ---- +1. repository analysis (`Repository.run_analysis`) +2. model dump via `repository.dump_results(output_type="json-ld")` +3. 
JSON-LD sanity checks (`@context` and `@graph` present) ## Troubleshooting -### Common Issues and Solutions - -#### Issue 1: Fields Missing from JSON-LD Output - -**Symptom**: Some fields from your Pydantic model don't appear in JSON-LD output. - -**Cause**: Fields not mapped in `PYDANTIC_TO_ZOD_MAPPING`. - -**Solution**: Add field mappings: - -```python -# In src/data_models/conversion.py -PYDANTIC_TO_ZOD_MAPPING["YourModel"] = { - "missingField": "schema:appropriateProperty", - # ... other fields -} -``` - -**Verification**: Check conversion output, look for fields with original names (unmapped) vs. prefixed names (mapped). - ---- - -#### Issue 2: Wrong @type in Output - -**Symptom**: Entity has `@type: "http://schema.org/Thing"` instead of correct type. - -**Cause**: Model class not in `type_mapping` dict. - -**Solution**: Add type mapping in `convert_pydantic_to_jsonld()`: - -```python -type_mapping = { - # ... existing types ... - YourModel: "http://schema.org/YourType", -} -``` - ---- - -#### Issue 3: Pydantic Coerces JSON-LD to Model - -**Symptom**: API returns wrong model structure (e.g., `GitHubOrganization` instead of JSON-LD). - -**Cause**: `APIOutput.output` Union type has models before `dict`. - -**Solution**: Reorder Union: - -```python -# ❌ WRONG -output: Union[SoftwareSourceCode, dict, list, Any] - -# ✅ CORRECT -output: Union[dict, list, SoftwareSourceCode, Any] -``` - -**Debug**: Add logging before return: - -```python -logger.info(f"Response output type: {type(jsonld_output)}") -logger.info(f"Has @context: {'@context' in jsonld_output}") -``` - ---- - -#### Issue 4: Nested Models Not Converting - -**Symptom**: Nested objects appear as plain dicts instead of JSON-LD entities. - -**Cause**: Nested model class not in `type_mapping`. - -**Solution**: Add type mapping for nested model: - -```python -type_mapping = { - # ... existing types ... 
- NestedModel: "http://schema.org/NestedType", -} -``` - -**Verification**: Check if nested objects have `@type` field. - ---- - -#### Issue 5: None Values in Output - -**Symptom**: JSON-LD contains many `null` or empty fields. - -**Cause**: Pydantic fields with `None` values included in output. - -**Solution**: The converter already skips `None` values. Check if model is setting default values: - -```python -# ❌ Sets empty list even if no data -field: List[str] = Field(default_factory=list) - -# ✅ Only set if data exists -field: Optional[List[str]] = None -``` - ---- - -#### Issue 6: URLs Not Wrapped in @id - -**Symptom**: URLs appear as plain strings instead of `{"@id": "..."}`. - -**Cause**: Field type is `str` instead of `HttpUrl`. - -**Solution**: Use Pydantic `HttpUrl` type: - -```python -from pydantic import HttpUrl - -class YourModel(BaseModel): - website: HttpUrl # ✅ Will wrap in @id - # Not: website: str # ❌ Plain string -``` - ---- - -### Debugging Techniques - -#### 1. Add Logging - -In `convert_pydantic_to_jsonld()`: - -```python -logger.debug(f"Converting {model_name} to JSON-LD") -logger.debug(f"Base URL: {base_url}") -logger.debug(f"Field mapping keys: {list(field_mapping.keys())}") -logger.debug(f"Model fields: {list(pydantic_obj.model_fields_set)}") -``` - -In API endpoint: - -```python -logger.info(f"Repository data type: {type(repository.data).__name__}") -logger.info(f"JSON-LD output type: {type(jsonld_output)}") -logger.info(f"JSON-LD output keys: {jsonld_output.keys()}") -if "@graph" in jsonld_output: - logger.info(f"@graph length: {len(jsonld_output['@graph'])}") - logger.info(f"First entity @type: {jsonld_output['@graph'][0].get('@type')}") -``` - -#### 2. 
Validate JSON-LD Structure - -```python -def validate_jsonld(data: dict) -> bool: - """Validate basic JSON-LD structure""" - if not isinstance(data, dict): - return False - if "@context" not in data: - print("Missing @context") - return False - if "@graph" not in data: - print("Missing @graph") - return False - if not isinstance(data["@graph"], list): - print("@graph is not a list") - return False - if len(data["@graph"]) == 0: - print("@graph is empty") - return False - - first_entity = data["@graph"][0] - if "@type" not in first_entity: - print("First entity missing @type") - return False - - return True - -# Use in endpoint -jsonld_output = repository.dump_results(output_type="json-ld") -if not validate_jsonld(jsonld_output): - raise ValueError("Invalid JSON-LD structure") -``` - -#### 3. Compare Pydantic vs JSON-LD - -```python -# Dump both formats -pydantic_output = repository.dump_results(output_type="pydantic") -jsonld_output = repository.dump_results(output_type="json-ld") - -# Compare field presence -pydantic_fields = set(pydantic_output.model_dump().keys()) -jsonld_fields = set(jsonld_output["@graph"][0].keys()) - -missing_in_jsonld = pydantic_fields - jsonld_fields -logger.warning(f"Fields in Pydantic but not JSON-LD: {missing_in_jsonld}") -``` - ---- - -## Examples - -### Example 1: Complete Repository JSON-LD - -Input (Pydantic): -```python -SoftwareSourceCode( - name="gimie", - description="Git Meta Information Extractor", - codeRepository=[HttpUrl("https://github.com/sdsc-ordes/gimie")], - license="https://spdx.org/licenses/Apache-2.0.html", - author=[ - Person( - name="Cyril Matthey-Doret", - orcidId=HttpUrl("https://orcid.org/0000-0002-1126-1535"), - affiliations=["EPFL"] - ) - ], - programmingLanguage=["Python"], - discipline=[Discipline.COMPUTER_ENGINEERING], - relatedToEPFL=True, - relatedToOrganizationsROR=[ - Organization( - legalName="EPFL", - hasRorId=HttpUrl("https://ror.org/03yrm5c26"), - country="Switzerland" - ) - ] -) -``` - -Output 
(JSON-LD): -```json -{ - "@context": { - "schema": "http://schema.org/", - "sd": "https://w3id.org/okn/o/sd#", - "imag": "https://imaging-plaza.epfl.ch/ontology/", - "md4i": "https://w3id.org/md4i/" - }, - "@graph": [ - { - "@id": "https://github.com/sdsc-ordes/gimie", - "@type": "http://schema.org/SoftwareSourceCode", - "schema:name": {"@value": "gimie"}, - "schema:description": {"@value": "Git Meta Information Extractor"}, - "schema:codeRepository": [ - {"@id": "https://github.com/sdsc-ordes/gimie"} - ], - "schema:license": {"@id": "https://spdx.org/licenses/Apache-2.0.html"}, - "schema:author": [ - { - "@type": "http://schema.org/Person", - "schema:name": {"@value": "Cyril Matthey-Doret"}, - "md4i:orcidId": {"@id": "https://orcid.org/0000-0002-1126-1535"}, - "schema:affiliation": [{"@value": "EPFL"}] - } - ], - "schema:programmingLanguage": [{"@value": "Python"}], - "imag:discipline": [{"@value": "Computer engineering"}], - "imag:relatedToEPFL": true, - "imag:relatedToOrganizationsROR": [ - { - "@type": "http://schema.org/Organization", - "schema:legalName": {"@value": "EPFL"}, - "md4i:hasRorId": {"@id": "https://ror.org/03yrm5c26"}, - "schema:addressCountry": {"@value": "Switzerland"} - } - ] - } - ] -} -``` - -### Example 2: API Response - -GET `/v1/repository/llm/json-ld/https%3A//github.com/sdsc-ordes/gimie` - -Response: -```json -{ - "link": "https://github.com/sdsc-ordes/gimie", - "type": "repository", - "parsedTimestamp": "2025-10-31T18:06:24.938227", - "output": { - "@context": { - "schema": "http://schema.org/", - "sd": "https://w3id.org/okn/o/sd#", - "imag": "https://imaging-plaza.epfl.ch/ontology/", - "md4i": "https://w3id.org/md4i/" - }, - "@graph": [ - { - "@id": "https://github.com/sdsc-ordes/gimie", - "@type": "http://schema.org/SoftwareSourceCode", - "schema:name": {"@value": "gimie"}, - "schema:description": {"@value": "Git Meta Information Extractor"}, - "schema:author": [ - { - "@type": "http://schema.org/Person", - "schema:name": {"@value": 
"Cyril Matthey-Doret"}, - "md4i:orcidId": {"@id": "https://orcid.org/0000-0002-1126-1535"} - } - ], - "imag:relatedToEPFL": true - } - ] - }, - "stats": { - "agent_input_tokens": 0, - "agent_output_tokens": 0, - "total_tokens": 0, - "estimated_input_tokens": 33160, - "estimated_output_tokens": 3192, - "estimated_total_tokens": 36352, - "duration": 260.46219, - "start_time": "2025-10-31T18:02:04.472401", - "end_time": "2025-10-31T18:06:24.934591", - "status_code": 200 - } -} -``` - -### Example 3: Minimal Implementation for New Model - -```python -# 1. In src/data_models/conversion.py -PYDANTIC_TO_ZOD_MAPPING["NewModel"] = { - "field1": "schema:field1", - "field2": "imag:field2", -} - -# 2. In convert_pydantic_to_jsonld() -type_mapping = { - # ... - NewModel: "http://schema.org/Thing", -} - -# 3. In src/data_models/yourmodel.py -class NewModel(BaseModel): - field1: str - field2: Optional[str] = None - - def convert_pydantic_to_jsonld(self) -> dict: - from src.data_models.conversion import convert_pydantic_to_jsonld - return convert_pydantic_to_jsonld(self, base_url="https://example.com/entity") - -# 4. Test conversion -model = NewModel(field1="value1", field2="value2") -jsonld = model.convert_pydantic_to_jsonld() -print(jsonld) -``` - -Output: -```json -{ - "@context": {...}, - "@graph": [{ - "@id": "https://example.com/entity", - "@type": "http://schema.org/Thing", - "schema:field1": {"@value": "value1"}, - "imag:field2": {"@value": "value2"} - }] -} -``` - ---- - -## Best Practices - -1. **Complete Field Mappings**: Map all important fields to semantic URIs -2. **Use Standard Vocabularies**: Prefer schema.org over custom terms -3. **Consistent Namespaces**: Stick to established prefixes (schema, imag, md4i, sd) -4. **Appropriate Base URLs**: Choose canonical URLs for @id generation -5. **Type Validation**: Ensure all models have type mappings -6. **Test Output**: Validate JSON-LD with RDF tools -7. 
**Document Custom Terms**: Document Imaging Plaza ontology terms -8. **Error Handling**: Add try-catch in endpoints for conversion failures -9. **Logging**: Add debug logs to trace conversion issues -10. **Cache Results**: Cache JSON-LD output (365 days) like other endpoints - ---- - -## Related Documentation - -- [FastAPI Patterns](.cursor/rules/fastapi-patterns.mdc) - API endpoint patterns including JSON-LD -- [Pydantic Models](.cursor/rules/pydantic-models.mdc) - Model definitions and JSON-LD conversion -- [Project Architecture](.cursor/rules/project-architecture.mdc) - Overall system structure -- [Imaging Plaza Documentation](https://imaging-plaza.epfl.ch) - Ontology and schema definitions - ---- - -## Questions or Issues? - -If you encounter issues not covered in this guide: - -1. Check existing JSON-LD endpoints for reference patterns -2. Review error logs for conversion failures -3. Validate field mappings in `PYDANTIC_TO_ZOD_MAPPING` -4. Test with minimal examples before complex models -5. Consult schema.org for standard property names - -For Imaging Plaza ontology questions, contact the EPFL Center for Imaging team. +- Missing `@graph`: ensure conversion result is returned directly from `convert_pydantic_to_jsonld`. +- Empty repository on reverse conversion: ensure graph contains `@type` equivalent to `http://schema.org/SoftwareSourceCode`. +- Invalid URL fields: ensure values are valid URL strings before model validation. diff --git a/docs/JSONLD_CONVERSION_SUMMARY.md b/docs/JSONLD_CONVERSION_SUMMARY.md index 077c516..ace2a68 100644 --- a/docs/JSONLD_CONVERSION_SUMMARY.md +++ b/docs/JSONLD_CONVERSION_SUMMARY.md @@ -1,271 +1,32 @@ # JSON-LD Conversion Summary -## Quick Reference: Key Property Mappings +## What is stable now -This document provides a quick reference for the most commonly used Pydantic→JSON-LD property mappings. +- Repository JSON-LD output path is integrated in API. +- Conversion context uses `schema`, `sd`, `pulse`, and `md4i` namespaces. 
+- Token/runtime stats are returned alongside API output through `APIStats`. -### Core Repository Properties +## Quick conversion references -| Pydantic Field | JSON-LD Property | Notes | -|----------------|------------------|-------| -| `name` | `schema:name` | Repository name | -| `description` | `schema:description` | Repository description | -| `codeRepository` | `schema:codeRepository` | GitHub/GitLab URL | -| `author` | `schema:author` | List of Person/Organization | -| `license` | `schema:license` | SPDX license URL | -| `discipline` | `pulse:discipline` | Wikidata discipline URIs | -| `repositoryType` | `pulse:repositoryType` | PULSE enum values | +- Pydantic -> JSON-LD: `convert_pydantic_to_jsonld` +- JSON-LD -> Pydantic repository: `convert_jsonld_to_pydantic` +- CLI wrapper script: `scripts/convert_json_jsonld.py` -### Person Properties - -| Pydantic Field | JSON-LD Property | Notes | -|----------------|------------------|-------| -| `name` | `schema:name` | Full name | -| `email` | `pulse:email` | Email address | -| `orcid` | `md4i:orcidId` | ORCID identifier | -| `affiliation` | `schema:affiliation` | Institution/org | -| `academicCatalogRelations` | `pulse:hasAcademicCatalogRelation` | Catalog links | - -### Organization Properties - -| Pydantic Field | JSON-LD Property | Notes | -|----------------|------------------|-------| -| `legalName` | `schema:legalName` | Official name | -| `hasRorId` | `md4i:hasRorId` | ROR identifier URL | -| `website` | `schema:url` | Organization website | - -### Academic Catalog Relations - -| Pydantic Field | JSON-LD Property | Notes | -|----------------|------------------|-------| -| `catalogType` | `pulse:catalogType` | infoscience, orcid, ror, wikidata | -| `entityType` | `pulse:entityType` | person, organization, publication, project | -| `entity` | `pulse:hasCatalogEntity` | The actual entity | -| `confidence` | `pulse:confidence` | 0.0-1.0 | -| `justification` | `pulse:justification` | Why this relation exists | -| 
`matchedOn` | `pulse:matchedOn` | Fields used for matching | - -## Namespace Prefixes - -```turtle -@prefix schema: . -@prefix sd: . -@prefix pulse: . -@prefix md4i: . -@prefix rdf: . -@prefix rdfs: . -@prefix owl: . -@prefix xsd: . -@prefix wd: . -``` - -## Example JSON-LD Output - -### Repository with Author +## Minimal expected JSON-LD shape ```json { - "@context": { - "schema": "http://schema.org/", - "pulse": "https://open-pulse.epfl.ch/ontology#", - "md4i": "http://w3id.org/nfdi4ing/metadata4ing#" - }, + "@context": {"schema": "http://schema.org/"}, "@graph": [ { - "@id": "https://github.com/example/my-repo", "@type": "schema:SoftwareSourceCode", - "schema:name": "My Research Software", - "schema:description": "A tool for scientific computing", - "schema:codeRepository": [ - {"@id": "https://github.com/example/my-repo"} - ], - "schema:license": "https://spdx.org/licenses/MIT", - "schema:author": [ - { - "@type": "schema:Person", - "schema:name": "Jane Doe", - "md4i:orcidId": {"@id": "https://orcid.org/0000-0002-1234-5678"}, - "schema:affiliation": ["EPFL"] - } - ], - "pulse:repositoryType": "pulse:Software", - "pulse:discipline": [ - {"@id": "wd:Q420"} - ] + "schema:name": "Example" } ] } ``` -### Person with Academic Catalog Relations - -```json -{ - "@context": { - "schema": "http://schema.org/", - "pulse": "https://open-pulse.epfl.ch/ontology#", - "md4i": "http://w3id.org/nfdi4ing/metadata4ing#" - }, - "@graph": [ - { - "@type": "schema:Person", - "schema:name": "Jane Doe", - "pulse:email": "jane.doe@epfl.ch", - "md4i:orcidId": "0000-0002-1234-5678", - "schema:affiliation": ["EPFL", "CVLAB"], - "pulse:hasAcademicCatalogRelation": [ - { - "@type": "pulse:AcademicCatalogRelation", - "pulse:catalogType": "infoscience", - "pulse:entityType": "person", - "pulse:hasCatalogEntity": { - "@type": "pulse:CatalogEntity", - "pulse:uuid": "abc-123-def", - "schema:name": "Jane Doe", - "pulse:profileUrl": { - "@id": 
"https://infoscience.epfl.ch/entities/person/abc-123-def" - } - }, - "pulse:confidence": 0.95, - "pulse:justification": "Matched on name and email", - "pulse:matchedOn": ["name", "email"] - } - ] - } - ] -} -``` - -### Organization with ROR - -```json -{ - "@context": { - "schema": "http://schema.org/", - "md4i": "http://w3id.org/nfdi4ing/metadata4ing#" - }, - "@graph": [ - { - "@type": "schema:Organization", - "schema:legalName": "École Polytechnique Fédérale de Lausanne", - "md4i:hasRorId": {"@id": "https://ror.org/02s376052"}, - "schema:url": {"@id": "https://www.epfl.ch"} - } - ] -} -``` - -## Conversion Functions - -### Pydantic → JSON-LD - -```python -from src.data_models.conversion import convert_pydantic_to_jsonld - -# Convert any Pydantic model to JSON-LD -jsonld = convert_pydantic_to_jsonld(pydantic_model, base_url=optional_base_url) -``` - -The function: -1. Automatically detects the model type -2. Maps fields using `PYDANTIC_TO_ZOD_MAPPING` -3. Handles nested models recursively -4. Converts enums to proper values -5. Formats dates as ISO 8601 -6. Converts ORCID IDs to URLs - -### JSON-LD → Pydantic - -```python -from src.data_models.conversion import convert_jsonld_to_pydantic - -# Convert JSON-LD graph to Pydantic model -model = convert_jsonld_to_pydantic(jsonld_graph) -``` - -The function: -1. Parses the `@graph` array -2. Identifies entity types via `@type` -3. Maps JSON-LD properties to Pydantic fields using `JSONLD_TO_PYDANTIC_MAPPING` -4. Resolves nested entity references -5. 
Validates and constructs Pydantic models - -## Important Notes - -### ORCID Handling - -ORCID identifiers are stored as plain strings in Pydantic (`0000-0002-1234-5678`) but **always** converted to URL format in JSON-LD: - -```json -"md4i:orcidId": {"@id": "https://orcid.org/0000-0002-1234-5678"} -``` - -### Discipline Values - -Disciplines are Wikidata entity URIs: -- Biology: `wd:Q420` -- Mathematics: `wd:Q395` -- Physics: `wd:Q413` -- Computer Engineering: `wd:Q428691` - -Full list in PULSE ontology documentation. - -### Repository Types - -Repository types use PULSE enum values: -- Software: `pulse:Software` -- Educational Resource: `pulse:EducationalResource` -- Documentation: `pulse:Documentation` -- Data: `pulse:Data` -- Other: `pulse:Other` - -### Confidence Scores - -All confidence scores must be between 0.0 and 1.0 (inclusive). Used for: -- `pulse:confidence` in academic catalog relations -- `pulse:relatedToEPFLConfidence` -- `Organization.attributionConfidence` - -### Justification Fields - -Multiple fields map to `pulse:justification`: -- `disciplineJustification` -- `repositoryTypeJustification` -- `relatedToOrganizationJustification` -- `relatedToEPFLJustification` -- `AcademicCatalogRelation.justification` - -These are kept separate in Pydantic for context but may be merged in JSON-LD serialization. - -## Validation - -The PULSE ontology includes SHACL shapes for validation. Key rules: - -1. **Required fields**: Many properties are marked `sh:minCount 1` -2. **Pattern constraints**: URLs must match `^http.*` -3. **Length constraints**: `schema:name` has `sh:maxLength 60` -4. **Cardinality**: Some fields are `sh:maxCount 1` -5. **Enumerations**: `catalogType`, `entityType`, etc. have fixed value lists - -Run SHACL validation after conversion to ensure compliance. 
- -## Migration Notes - -### Changes from imaging-plaza to PULSE - -Key namespace changes: -- `imag:` → `pulse:` for custom properties -- `md4i:orcid` → `md4i:orcidId` -- Added academic catalog relation support -- Added Wikidata discipline mappings - -### Deprecated Properties - -- `imag:infoscienceEntities` → Use `pulse:hasAcademicCatalogRelation` -- `imag:relatedToOrganization` → `pulse:relatedToOrganization` - -## See Also +## Known boundaries -- [Full Mapping Documentation](./PYDANTIC_JSONLD_MAPPING.md) -- [PULSE Ontology](https://open-pulse.epfl.ch/ontology#) -- [Academic Catalog Integration](./ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md) +- Reverse conversion currently targets repository graphs (`SoftwareSourceCode`) as the primary supported model. +- User/organization reverse conversion in the script is marked as simplified. diff --git a/docs/JSONLD_MAPPING_UPDATE.md b/docs/JSONLD_MAPPING_UPDATE.md index 578e08c..8b0c6bd 100644 --- a/docs/JSONLD_MAPPING_UPDATE.md +++ b/docs/JSONLD_MAPPING_UPDATE.md @@ -1,290 +1,26 @@ -# JSON-LD Mapping Update - PULSE Ontology Integration +# JSON-LD Mapping Update Notes -## Summary +This page tracks the current mapping baseline in `v2.0.0`. -Updated the Pydantic→JSON-LD mapping system to align with the PULSE (EPFL Open Science) ontology. This enables proper semantic representation of research software metadata in RDF/JSON-LD format. +## Namespace baseline -## Changes Made +The active JSON-LD context includes: -### 1. 
Updated Namespace Prefixes +- `schema`: Schema.org +- `sd`: OKN software description vocabulary +- `pulse`: Open Pulse ontology +- `md4i`: metadata4ing -**File**: `src/data_models/conversion.py` +## Current mapping source of truth -Changed from `imaging-plaza` to `pulse` ontology: +- `src/data_models/conversion.py` + - `PYDANTIC_TO_ZOD_MAPPING` + - `JSONLD_TO_PYDANTIC_MAPPING` + - `convert_pydantic_to_jsonld` + - `convert_jsonld_to_pydantic` -```python -# Before -context = { - "schema": "http://schema.org/", - "sd": "https://w3id.org/okn/o/sd#", - "imag": "https://imaging-plaza.epfl.ch/ontology/", - "md4i": "https://w3id.org/md4i/", -} +## Practical implications -# After -context = { - "schema": "http://schema.org/", - "sd": "https://w3id.org/okn/o/sd#", - "pulse": "https://open-pulse.epfl.ch/ontology#", - "md4i": "http://w3id.org/nfdi4ing/metadata4ing#", - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "owl": "http://www.w3.org/2002/07/owl#", - "xsd": "http://www.w3.org/2001/XMLSchema#", - "dcterms": "http://purl.org/dc/terms/", - "wd": "http://www.wikidata.org/entity/", -} -``` - -### 2. Extended Property Mappings - -**File**: `src/data_models/conversion.py` - -#### Added New Models -- `AcademicCatalogRelation`: Links to academic catalogs (Infoscience, ORCID, ROR, Wikidata) -- `CatalogEntity`: Entities from academic catalogs -- `InfosciencePublication`: Publications from EPFL's Infoscience -- `InfoscienceAuthor`: Authors from Infoscience -- `InfoscienceLab`: Labs/orgunits from Infoscience -- `GitHubOrganization`: GitHub org with enriched metadata - -#### Updated Existing Models -- **Person**: Added `academicCatalogRelations`, `gitAuthorIds`, `affiliationHistory`, etc. -- **Organization**: Added `academicCatalogRelations` -- **SoftwareSourceCode**: Added `academicCatalogRelations`, updated property mappings - -### 3. 
Property Mapping Updates - -Key changes in `PYDANTIC_TO_ZOD_MAPPING`: - -| Old Property | New Property | Model | -|--------------|--------------|-------| -| `imag:confidence` | `pulse:confidence` | All | -| `imag:justification` | `pulse:justification` | All | -| `imag:discipline` | `pulse:discipline` | SoftwareSourceCode | -| `imag:repositoryType` | `pulse:repositoryType` | SoftwareSourceCode | -| `imag:relatedToOrganization` | `pulse:relatedToOrganization` | SoftwareSourceCode | -| `md4i:orcid` | `md4i:orcidId` | Person | -| `schema:email` | `pulse:email` | Person, GitAuthor | - -### 4. Added Bidirectional Mappings - -Updated `JSONLD_TO_PYDANTIC_MAPPING` to support both full URIs and prefixed forms: - -```python -# Example: Both forms supported -"http://schema.org/name": "name", -"schema:name": "name", -"https://open-pulse.epfl.ch/ontology#confidence": "confidence", -"pulse:confidence": "confidence", -``` - -### 5. Type Mappings - -Updated to align with PULSE ontology SHACL shapes: - -```python -type_mapping = { - "SoftwareSourceCode": "schema:SoftwareSourceCode", - "Person": "schema:Person", - "Organization": "schema:Organization", - "GitHubOrganization": "schema:GitHubOrganization", - "AcademicCatalogRelation": "pulse:AcademicCatalogRelation", - "CatalogEntity": "pulse:CatalogEntity", - "InfosciencePublication": "schema:ScholarlyArticle", - "Discipline": "pulse:DisciplineEnumeration", - "RepositoryType": "pulse:RepositoryTypeEnumeration", - # ... more -} -``` - -### 6. 
Documentation - -Created comprehensive documentation: - -#### `docs/PYDANTIC_JSONLD_MAPPING.md` -- Complete property mappings for all models -- SHACL shape references -- Datatype specifications -- Usage examples -- Validation rules - -#### `docs/JSONLD_CONVERSION_SUMMARY.md` -- Quick reference tables -- Common use cases -- Example JSON-LD outputs -- Migration notes from imaging-plaza -- ORCID handling specifics - -## New Features - -### Academic Catalog Relations - -The system now supports linking entities to academic catalogs: - -```python -AcademicCatalogRelation( - catalogType="infoscience", - entityType="person", - entity=CatalogEntity( - uuid="abc-123", - name="Jane Doe", - email="jane@epfl.ch", - profileUrl="https://infoscience.epfl.ch/entities/person/abc-123" - ), - confidence=0.95, - justification="Matched on name and email", - matchedOn=["name", "email"] -) -``` - -This converts to: - -```json -{ - "@type": "pulse:AcademicCatalogRelation", - "pulse:catalogType": "infoscience", - "pulse:entityType": "person", - "pulse:hasCatalogEntity": { - "@type": "pulse:CatalogEntity", - "pulse:uuid": "abc-123", - "schema:name": "Jane Doe", - "pulse:email": "jane@epfl.ch", - "pulse:profileUrl": {"@id": "https://infoscience.epfl.ch/entities/person/abc-123"} - }, - "pulse:confidence": 0.95, - "pulse:justification": "Matched on name and email", - "pulse:matchedOn": ["name", "email"] -} -``` - -### Wikidata Discipline Mapping - -Disciplines are now mapped to Wikidata entities: - -```python -Discipline.BIOLOGY # → wd:Q420 -Discipline.MATHEMATICS # → wd:Q395 -Discipline.PHYSICS # → wd:Q413 -``` - -### PULSE Repository Types - -Repository types use PULSE ontology enumerations: - -```python -RepositoryType.SOFTWARE # → pulse:Software -RepositoryType.EDUCATIONAL_RESOURCE # → pulse:EducationalResource -RepositoryType.DATA # → pulse:Data -``` - -## Validation - -The mappings align with PULSE ontology SHACL shapes: - -### Key Constraints -- `schema:name`: max 60 characters -- 
`schema:description`: max 2000 characters -- `schema:codeRepository`: pattern `^http.*` -- `pulse:confidence`: range 0.0-1.0 -- `schema:author`: required, Person or Organization -- `pulse:catalogType`: enum (infoscience, orcid, ror, wikidata) -- `pulse:entityType`: enum (person, organization, publication, project) - -## Migration Path - -### Backward Compatibility - -Old properties are still supported in JSON-LD input but will be converted to new properties: - -```python -# Both work: -"imag:confidence" → mapped to "confidence" -"pulse:confidence" → mapped to "confidence" -``` - -### Code Changes Required - -If you're using the old property names in code: - -```python -# Before -"imag:relatedToOrganization" -"imag:infoscienceEntities" - -# After -"pulse:relatedToOrganization" -"pulse:hasAcademicCatalogRelation" -``` - -## Testing - -### Using the CLI Tool (Recommended) - -A command-line tool is available for easy conversion: - -```bash -# Convert JSON to JSON-LD -python scripts/convert_json_jsonld.py to-jsonld input.json output.jsonld \ - --base-url https://github.com/user/repo - -# Convert JSON-LD to JSON -python scripts/convert_json_jsonld.py to-json input.jsonld output.json -``` - -See [JSON-LD Conversion CLI Guide](./JSON_JSONLD_CONVERSION_CLI.md) for detailed usage. 
- -### Using Python Code - -To test the conversion in Python: - -```python -from src.data_models.repository import SoftwareSourceCode -from src.data_models.models import Person, RepositoryType -from src.data_models.conversion import convert_pydantic_to_jsonld - -repo = SoftwareSourceCode( - name="Test Repo", - description="A test repository", - codeRepository=["https://github.com/test/repo"], - author=[ - Person( - name="Test User", - orcid="0000-0002-1234-5678" - ) - ], - repositoryType=RepositoryType.SOFTWARE, - repositoryTypeJustification=["Contains source code"] -) - -jsonld = convert_pydantic_to_jsonld(repo, base_url="https://github.com/test/repo") -print(jsonld) -``` - -## Files Modified - -1. `src/data_models/conversion.py` - Main conversion logic -2. `docs/PYDANTIC_JSONLD_MAPPING.md` - Complete mapping documentation -3. `docs/JSONLD_CONVERSION_SUMMARY.md` - Quick reference guide - -## Next Steps - -1. **SHACL Validation**: Implement SHACL validation using the PULSE shapes -2. **RDF Export**: Add Turtle/N-Triples serialization options -3. **GraphDB Integration**: Connect to EPFL's triplestore -4. **SPARQL Queries**: Create example queries for common use cases -5. **CLI Tool**: Add command-line tool for JSON→JSON-LD conversion - -## References - -- [PULSE Ontology](https://open-pulse.epfl.ch/ontology#) -- [Schema.org](http://schema.org/) -- [Software Description Ontology](https://w3id.org/okn/o/sd#) -- [Metadata4Ing](http://w3id.org/nfdi4ing/metadata4ing#) -- [Wikidata](https://www.wikidata.org/) - -## Version - -- **Date**: 2025-11-06 -- **Author**: GitHub Copilot -- **Version**: 2.0.0 (PULSE integration) +- Repository, user, and organization model fields can be serialized to JSON-LD via shared mappings. +- Linked academic entities and EPFL assessment properties are represented with `pulse:*` fields. +- Conversion behavior should be treated as code-driven; update docs after mapping changes in `conversion.py`. 
diff --git a/docs/JSON_JSONLD_CONVERSION_CLI.md b/docs/JSON_JSONLD_CONVERSION_CLI.md index aeb7928..dbab89b 100644 --- a/docs/JSON_JSONLD_CONVERSION_CLI.md +++ b/docs/JSON_JSONLD_CONVERSION_CLI.md @@ -1,461 +1,43 @@ -# JSON ↔ JSON-LD Conversion Guide +# JSON <-> JSON-LD Conversion CLI -## Quick Start +Use the conversion script in this repository: -### Installation +- `scripts/convert_json_jsonld.py` -Make sure you're in the project directory and have the dependencies installed: +## Commands -```bash -cd /home/rmfranken/git-metadata-extractor -# If using uv (recommended) -uv sync -# Or with pip -pip install -e . -``` - -### Basic Usage - -#### Convert JSON to JSON-LD +### Convert JSON to JSON-LD ```bash python scripts/convert_json_jsonld.py to-jsonld input.json output.jsonld ``` -**With base URL (recommended):** -```bash -python scripts/convert_json_jsonld.py to-jsonld input.json output.jsonld \ - --base-url https://github.com/your-org/your-repo -``` - -#### Convert JSON-LD to JSON - -```bash -python scripts/convert_json_jsonld.py to-json input.jsonld output.json -``` - -## Detailed Examples - -### Example 1: Convert Repository Metadata to JSON-LD - -**Input file** (`my_repo.json`): -```json -{ - "name": "My Research Software", - "description": "A tool for scientific computing", - "codeRepository": ["https://github.com/example/my-repo"], - "license": "https://spdx.org/licenses/MIT", - "author": [ - { - "type": "Person", - "name": "Jane Doe", - "orcid": "0000-0002-1234-5678", - "affiliations": ["EPFL"] - } - ], - "repositoryType": "software", - "repositoryTypeJustification": ["Contains source code and documentation"], - "discipline": ["Biology", "Computer Engineering"], - "disciplineJustification": ["Computational biology tools", "Software engineering"] -} -``` - -**Command:** -```bash -python scripts/convert_json_jsonld.py to-jsonld my_repo.json my_repo.jsonld \ - --base-url https://github.com/example/my-repo -``` - -**Output** (`my_repo.jsonld`): -```json -{ - 
"@context": { - "schema": "http://schema.org/", - "sd": "https://w3id.org/okn/o/sd#", - "pulse": "https://open-pulse.epfl.ch/ontology#", - "md4i": "http://w3id.org/nfdi4ing/metadata4ing#", - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "owl": "http://www.w3.org/2002/07/owl#", - "xsd": "http://www.w3.org/2001/XMLSchema#", - "dcterms": "http://purl.org/dc/terms/", - "wd": "http://www.wikidata.org/entity/" - }, - "@graph": [ - { - "@id": "https://github.com/example/my-repo", - "@type": "schema:SoftwareSourceCode", - "schema:name": "My Research Software", - "schema:description": "A tool for scientific computing", - "schema:codeRepository": [ - {"@id": "https://github.com/example/my-repo"} - ], - "schema:license": "https://spdx.org/licenses/MIT", - "schema:author": [ - { - "@type": "schema:Person", - "schema:name": "Jane Doe", - "md4i:orcidId": {"@id": "https://orcid.org/0000-0002-1234-5678"}, - "schema:affiliation": ["EPFL"] - } - ], - "pulse:repositoryType": "pulse:Software", - "pulse:justification": [ - "Contains source code and documentation", - "Computational biology tools", - "Software engineering" - ], - "pulse:discipline": ["Biology", "Computer Engineering"] - } - ] -} -``` - -### Example 2: Convert JSON-LD Back to JSON - -```bash -python scripts/convert_json_jsonld.py to-json my_repo.jsonld my_repo_restored.json -``` - -This will convert the JSON-LD back to the Pydantic JSON format. 
- -## Using in Python Code - -You can also use the conversion functions directly in Python: - -### Convert to JSON-LD - -```python -from src.data_models.repository import SoftwareSourceCode -from src.data_models.models import Person, RepositoryType -from src.data_models.conversion import convert_pydantic_to_jsonld -import json - -# Create a Pydantic model -repo = SoftwareSourceCode( - name="My Research Software", - description="A tool for scientific computing", - codeRepository=["https://github.com/example/my-repo"], - license="https://spdx.org/licenses/MIT", - author=[ - Person( - name="Jane Doe", - orcid="0000-0002-1234-5678", - affiliations=["EPFL"] - ) - ], - repositoryType=RepositoryType.SOFTWARE, - repositoryTypeJustification=["Contains source code"] -) - -# Convert to JSON-LD -jsonld = convert_pydantic_to_jsonld( - repo, - base_url="https://github.com/example/my-repo" -) - -# Save to file -with open('output.jsonld', 'w') as f: - json.dump(jsonld, f, indent=2) -``` - -### Convert from JSON-LD - -```python -from src.data_models.conversion import convert_jsonld_to_pydantic -import json - -# Load JSON-LD -with open('input.jsonld', 'r') as f: - jsonld_data = json.load(f) - -# Extract graph -graph = jsonld_data.get("@graph", [jsonld_data]) - -# Convert to Pydantic -software = convert_jsonld_to_pydantic(graph) - -# Access properties -print(f"Name: {software.name}") -print(f"Authors: {[a.name for a in software.author]}") - -# Convert back to dict/JSON -data = software.model_dump(exclude_none=True) -``` - -## Working with Existing Files - -### Convert Your Output File - -If you already have an output file from the metadata extractor: - -```bash -python scripts/convert_json_jsonld.py to-jsonld \ - src/files/output_file.json \ - src/files/output_file.jsonld \ - --base-url https://github.com/your-org/your-repo -``` - -### Batch Conversion - -Convert multiple files: - -```bash -# Create a simple bash script -for json_file in data/*.json; do - base_name=$(basename 
"$json_file" .json) - python scripts/convert_json_jsonld.py to-jsonld \ - "$json_file" \ - "data/${base_name}.jsonld" -done -``` - -Or in Python: - -```python -from pathlib import Path -from src.data_models.conversion import convert_pydantic_to_jsonld -from src.data_models.repository import SoftwareSourceCode -import json - -input_dir = Path("data/json") -output_dir = Path("data/jsonld") -output_dir.mkdir(exist_ok=True) - -for json_file in input_dir.glob("*.json"): - print(f"Converting {json_file.name}...") - - # Load and convert - with open(json_file) as f: - data = json.load(f) - - repo = SoftwareSourceCode(**data) - jsonld = convert_pydantic_to_jsonld(repo) - - # Save - output_file = output_dir / f"{json_file.stem}.jsonld" - with open(output_file, 'w') as f: - json.dump(jsonld, f, indent=2) - - print(f" → {output_file}") -``` - -## Command Reference - -### to-jsonld Command - -Convert Pydantic JSON to JSON-LD format. - -**Syntax:** -```bash -python scripts/convert_json_jsonld.py to-jsonld INPUT OUTPUT [--base-url URL] -``` - -**Arguments:** -- `INPUT`: Path to input JSON file (Pydantic format) -- `OUTPUT`: Path to output JSON-LD file -- `--base-url`: (Optional) Base URL for @id generation (typically the repository URL) +Optional base URL for `@id` generation: -**Examples:** ```bash -# Basic conversion -python scripts/convert_json_jsonld.py to-jsonld input.json output.jsonld - -# With base URL python scripts/convert_json_jsonld.py to-jsonld input.json output.jsonld \ - --base-url https://github.com/user/repo - -# Using absolute paths -python scripts/convert_json_jsonld.py to-jsonld \ - /path/to/input.json \ - /path/to/output.jsonld + --base-url https://github.com/org/repo ``` -### to-json Command +### Convert JSON-LD to JSON -Convert JSON-LD to Pydantic JSON format. 
- -**Syntax:** -```bash -python scripts/convert_json_jsonld.py to-json INPUT OUTPUT -``` - -**Arguments:** -- `INPUT`: Path to input JSON-LD file -- `OUTPUT`: Path to output JSON file (Pydantic format) - -**Examples:** ```bash -# Basic conversion python scripts/convert_json_jsonld.py to-json input.jsonld output.json - -# Using absolute paths -python scripts/convert_json_jsonld.py to-json \ - /path/to/input.jsonld \ - /path/to/output.json -``` - -## Validation - -### Validate JSON-LD Output - -You can validate your JSON-LD output using online tools or libraries: - -**Online Validators:** -- [JSON-LD Playground](https://json-ld.org/playground/) -- [RDF Translator](https://www.easyrdf.org/converter) - -**Using Python:** -```python -from pyld import jsonld -import json - -# Load your JSON-LD -with open('output.jsonld', 'r') as f: - doc = json.load(f) - -# Expand to see full URIs -expanded = jsonld.expand(doc) -print(json.dumps(expanded, indent=2)) - -# Convert to N-Quads (RDF) -nquads = jsonld.to_rdf(doc, {'format': 'application/n-quads'}) -print(nquads) -``` - -### SHACL Validation - -To validate against PULSE ontology SHACL shapes, you'll need a SHACL validator: - -```python -from pyshacl import validate -import json - -# Load your JSON-LD -with open('output.jsonld', 'r') as f: - data_graph = f.read() - -# Load PULSE SHACL shapes (you'll need the shapes file) -with open('pulse_shapes.ttl', 'r') as f: - shacl_graph = f.read() - -# Validate -conforms, results_graph, results_text = validate( - data_graph=data_graph, - data_graph_format='json-ld', - shacl_graph=shacl_graph, - shacl_graph_format='turtle' -) - -print(f"Conforms: {conforms}") -if not conforms: - print(results_text) -``` - -## Troubleshooting - -### Common Issues - -**Issue: "Module not found" error** -```bash -# Solution: Install dependencies -pip install -e . 
-# Or with uv -uv sync -``` - -**Issue: "No SoftwareSourceCode entity found"** -```bash -# Solution: Check your JSON-LD structure has @type: schema:SoftwareSourceCode -# and a @graph array -``` - -**Issue: "Invalid ORCID format"** -```bash -# Solution: Use format "0000-0002-1234-5678" or "https://orcid.org/0000-0002-1234-5678" -``` - -**Issue: Validation errors** -```bash -# Solution: Check required fields: -# - name (required) -# - description (required) -# - author (required, at least one) -# - repositoryType (required) -# - repositoryTypeJustification (required) -``` - -### Getting Help - -```bash -# Show help message -python scripts/convert_json_jsonld.py --help - -# Show detailed examples -python scripts/convert_json_jsonld.py to-jsonld --help -``` - -## Advanced Usage - -### Custom Context - -If you need to customize the JSON-LD context, modify `src/data_models/conversion.py`: - -```python -# In convert_pydantic_to_jsonld function -context = { - "schema": "http://schema.org/", - "pulse": "https://open-pulse.epfl.ch/ontology#", - # Add your custom prefixes here - "custom": "https://your-domain.com/ontology#", -} -``` - -### Converting Partial Models - -You can convert individual models (Person, Organization, etc.): - -```python -from src.data_models.models import Person -from src.data_models.conversion import convert_pydantic_to_jsonld - -person = Person( - name="Jane Doe", - orcid="0000-0002-1234-5678" -) - -jsonld = convert_pydantic_to_jsonld(person) ``` -## Integration with API - -To convert API responses: +## What the script validates -```python -from src.api import extract_metadata -from src.data_models.conversion import convert_pydantic_to_jsonld -import json +- Detects model type from API wrapper (`type` + `output`) or model-shaped JSON. +- Validates with `SoftwareSourceCode`, `GitHubUser`, or `GitHubOrganization` models. +- Uses conversion helpers from `src/data_models/conversion.py`. 
-# Extract metadata using API -result = extract_metadata( - repo_url="https://github.com/user/repo", - use_cache=True -) +## Current limitations -# Convert to JSON-LD -jsonld = convert_pydantic_to_jsonld( - result['data'], - base_url="https://github.com/user/repo" -) - -# Save -with open('output.jsonld', 'w') as f: - json.dump(jsonld, f, indent=2) -``` +- Reverse conversion (JSON-LD to JSON) is most complete for repository graphs. +- User/organization reverse conversion paths are currently simplified and marked with TODO comments in the script. -## See Also +## Typical workflow -- [Full Mapping Documentation](./PYDANTIC_JSONLD_MAPPING.md) -- [Quick Reference Guide](./JSONLD_CONVERSION_SUMMARY.md) -- [PULSE Ontology](https://open-pulse.epfl.ch/ontology#) -- [JSON-LD Specification](https://www.w3.org/TR/json-ld11/) +1. Call an API endpoint (`/v1/repository/llm/json` or `/v1/repository/llm/json-ld`). +2. Save the response payload. +3. Convert it with this script as needed for downstream tooling. diff --git a/docs/PYDANTIC_JSONLD_MAPPING.md b/docs/PYDANTIC_JSONLD_MAPPING.md index e1f8708..c98efb9 100644 --- a/docs/PYDANTIC_JSONLD_MAPPING.md +++ b/docs/PYDANTIC_JSONLD_MAPPING.md @@ -1,416 +1,47 @@ -# Pydantic to JSON-LD Mapping Documentation +# Pydantic to JSON-LD Mapping -This document describes the mapping between Pydantic models and JSON-LD representations based on the PULSE ontology. +This page summarizes key mappings used by `PYDANTIC_TO_ZOD_MAPPING` in `src/data_models/conversion.py`. 
-## Ontology Namespaces +## Core model mappings -The following namespace prefixes are used in the JSON-LD context: +### `SoftwareSourceCode` -| Prefix | Namespace URI | Description | -|--------|---------------|-------------| -| `schema` | `http://schema.org/` | Schema.org vocabulary | -| `sd` | `https://w3id.org/okn/o/sd#` | Software Description Ontology | -| `pulse` | `https://open-pulse.epfl.ch/ontology#` | PULSE ontology (EPFL Open Science) | -| `md4i` | `http://w3id.org/nfdi4ing/metadata4ing#` | Metadata4Ing ontology | -| `rdf` | `http://www.w3.org/1999/02/22-rdf-syntax-ns#` | RDF vocabulary | -| `rdfs` | `http://www.w3.org/2000/01/rdf-schema#` | RDF Schema | -| `owl` | `http://www.w3.org/2002/07/owl#` | OWL vocabulary | -| `xsd` | `http://www.w3.org/2001/XMLSchema#` | XML Schema Datatypes | -| `dcterms` | `http://purl.org/dc/terms/` | Dublin Core Terms | -| `wd` | `http://www.wikidata.org/entity/` | Wikidata entities | +- `name` -> `schema:name` +- `description` -> `schema:description` +- `codeRepository` -> `schema:codeRepository` +- `author` -> `schema:author` +- `repositoryType` -> `pulse:repositoryType` +- `discipline` -> `pulse:discipline` +- `relatedToOrganizations` -> `pulse:relatedToOrganization` +- `linkedEntities` -> `pulse:linkedEntities` -## Core Data Models +### `Person` -### SoftwareSourceCode +- `name` -> `schema:name` +- `emails` -> `schema:email` +- `githubId` -> `schema:username` +- `orcid` -> `md4i:orcidId` +- `affiliations` -> `schema:affiliation` +- `linkedEntities` -> `pulse:linkedEntities` -Main model representing a software repository. 
+### `Organization` -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `name` | `schema:name` | `xsd:string` | Repository name | -| `description` | `schema:description` | `xsd:string` | Repository description | -| `codeRepository` | `schema:codeRepository` | `xsd:anyURI` | Code repository URL | -| `dateCreated` | `schema:dateCreated` | `xsd:date` | Creation date | -| `datePublished` | `schema:datePublished` | `xsd:date` | Publication date | -| `license` | `schema:license` | `xsd:anyURI` | SPDX license URL | -| `author` | `schema:author` | `schema:Person` or `schema:Organization` | Authors/contributors | -| `url` | `schema:url` | `xsd:anyURI` | Repository homepage | -| `identifier` | `schema:identifier` | `xsd:string` | Unique identifier | -| `programmingLanguage` | `schema:programmingLanguage` | `xsd:string` | Programming languages | -| `citation` | `schema:citation` | `xsd:anyURI` | Citations | -| `isBasedOn` | `schema:isBasedOn` | `xsd:anyURI` | Based on URL | -| `readme` | `sd:readme` | `xsd:anyURI` | README file URL | -| `discipline` | `pulse:discipline` | `pulse:DisciplineEnumeration` | Scientific disciplines | -| `disciplineJustification` | `pulse:justification` | `xsd:string` | Justification for discipline | -| `repositoryType` | `pulse:repositoryType` | `pulse:RepositoryTypeEnumeration` | Repository type | -| `repositoryTypeJustification` | `pulse:justification` | `xsd:string` | Justification for type | -| `relatedToOrganizations` | `pulse:relatedToOrganization` | `xsd:string` | Related organizations | -| `relatedToOrganizationJustification` | `pulse:justification` | `xsd:string` | Justification for org relation | -| `relatedToEPFL` | `pulse:relatedToEPFL` | `xsd:boolean` | Whether related to EPFL | -| `relatedToEPFLConfidence` | `pulse:confidence` | `xsd:decimal` | Confidence score (0.0-1.0) | -| `relatedToEPFLJustification` | `pulse:justification` | `xsd:string` | Justification 
for EPFL relation | -| `gitAuthors` | `pulse:gitAuthors` | `schema:Person` | Git commit authors | -| `academicCatalogRelations` | `pulse:hasAcademicCatalogRelation` | `pulse:AcademicCatalogRelation` | Academic catalog relations | -| `applicationCategory` | `schema:applicationCategory` | `xsd:string` | Application categories | -| `featureList` | `schema:featureList` | `xsd:string` | Feature list | -| `image` | `schema:image` | `schema:ImageObject` | Images | -| `isAccessibleForFree` | `schema:isAccessibleForFree` | `xsd:boolean` | Free access | -| `operatingSystem` | `schema:operatingSystem` | `xsd:string` | Operating systems | -| `softwareRequirements` | `schema:softwareRequirements` | `xsd:string` | Software requirements | -| `processorRequirements` | `schema:processorRequirements` | `xsd:string` | Processor requirements | -| `memoryRequirements` | `schema:memoryRequirements` | `xsd:integer` | Memory requirements | -| `requiresGPU` | `pulse:requiresGPU` | `xsd:boolean` | GPU requirements | -| `supportingData` | `schema:supportingData` | `schema:DataFeed` | Supporting data | -| `conditionsOfAccess` | `schema:conditionsOfAccess` | `xsd:string` | Access conditions | -| `hasAcknowledgements` | `sd:hasAcknowledgements` | `xsd:string` | Acknowledgements | -| `hasDocumentation` | `sd:hasDocumentation` | `xsd:anyURI` | Documentation URL | -| `hasExecutableInstructions` | `sd:hasExecutableInstructions` | `xsd:string` | Executable instructions | -| `hasExecutableNotebook` | `pulse:hasExecutableNotebook` | `schema:SoftwareApplication` | Executable notebooks | -| `hasFunding` | `sd:hasFunding` | `schema:Grant` | Funding information | -| `hasSoftwareImage` | `sd:hasSoftwareImage` | `schema:SoftwareApplication` | Software images | -| `imagingModality` | `pulse:imagingModality` | `xsd:string` | Imaging modalities | -| `isPluginModuleOf` | `pulse:isPluginModuleOf` | `xsd:string` | Plugin module of | -| `relatedDatasets` | `pulse:relatedDatasets` | `xsd:string` | Related datasets 
| -| `relatedPublications` | `pulse:relatedPublications` | `xsd:string` | Related publications | -| `relatedModels` | `pulse:relatedModels` | `xsd:string` | Related models | -| `relatedAPIs` | `pulse:relatedAPIs` | `xsd:string` | Related APIs | +- `legalName` -> `schema:legalName` +- `hasRorId` -> `md4i:hasRorId` +- `organizationType` -> `schema:additionalType` +- `attributionConfidence` -> `pulse:confidence` -**JSON-LD Type**: `schema:SoftwareSourceCode` +### `GitHubUser` and `GitHubOrganization` -### Person +- Metadata fields map under `pulse:metadata` +- EPFL assessment fields map to `pulse:relatedToEPFL`, `pulse:confidence`, `pulse:justification` -Represents an individual author or contributor. +## Linked-entities mappings -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `type` | `@type` | - | Type discriminator ("Person") | -| `name` | `schema:name` | `xsd:string` | Person's full name | -| `email` | `pulse:email` | `xsd:string` | Email address(es) | -| `orcid` | `md4i:orcidId` | `xsd:string` | ORCID identifier | -| `gitAuthorIds` | `pulse:gitAuthorIds` | `xsd:string` | Git author identifiers | -| `affiliations` | `schema:affiliation` | `xsd:string` | All affiliations | -| `currentAffiliation` | `schema:affiliation` | `xsd:string` | Current affiliation | -| `affiliationHistory` | `pulse:affiliationHistory` | - | Temporal affiliation data | -| `contributionSummary` | `pulse:contributionSummary` | `xsd:string` | Contribution summary | -| `biography` | `schema:description` | `xsd:string` | Biographical information | -| `academicCatalogRelations` | `pulse:hasAcademicCatalogRelation` | `pulse:AcademicCatalogRelation` | Academic catalog relations | +- Relation model includes `catalogType`, `entityType`, `confidence`, `justification` +- Infoscience-specific entity models map publication/person/orgunit details into schema/pulse fields. 
-**JSON-LD Type**: `schema:Person` +## Notes -**SHACL Shape**: Defined in PULSE ontology as `schema:Person` with properties: -- `schema:name` (required) -- `md4i:orcidId` (optional) -- `schema:affiliation` (optional) -- `pulse:username` (optional) - -### Organization - -Represents an institution, lab, or company. - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `type` | `@type` | - | Type discriminator ("Organization") | -| `legalName` | `schema:legalName` | `xsd:string` | Legal/official name | -| `hasRorId` | `md4i:hasRorId` | `xsd:anyURI` | ROR identifier URL | -| `alternateNames` | `schema:alternateName` | `xsd:string` | Alternative names | -| `organizationType` | `schema:additionalType` | `xsd:string` | Organization type | -| `parentOrganization` | `schema:parentOrganization` | `xsd:string` | Parent organization | -| `country` | `schema:addressCountry` | `xsd:string` | Country | -| `website` | `schema:url` | `xsd:anyURI` | Website URL | -| `attributionConfidence` | `pulse:confidence` | `xsd:decimal` | Attribution confidence | -| `academicCatalogRelations` | `pulse:hasAcademicCatalogRelation` | `pulse:AcademicCatalogRelation` | Academic catalog relations | - -**JSON-LD Type**: `schema:Organization` - -**SHACL Shape**: Defined in PULSE ontology as `schema:Organization` with properties: -- `schema:legalName` (required) -- `md4i:hasRorId` (optional) - -### GitHubOrganization - -Represents a GitHub organization with enriched metadata. 
- -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `name` | `schema:name` | `xsd:string` | Organization name | -| `organizationType` | `schema:additionalType` | `xsd:string` | Organization type | -| `description` | `schema:description` | `xsd:string` | Description | -| `discipline` | `pulse:discipline` | `pulse:DisciplineEnumeration` | Disciplines | -| `disciplineJustification` | `pulse:justification` | `xsd:string` | Discipline justification | -| `relatedToEPFL` | `pulse:relatedToEPFL` | `xsd:boolean` | EPFL relation | -| `relatedToEPFLJustification` | `pulse:justification` | `xsd:string` | EPFL relation justification | -| `relatedToEPFLConfidence` | `pulse:confidence` | `xsd:decimal` | Confidence score | -| `academicCatalogRelations` | `pulse:hasAcademicCatalogRelation` | `pulse:AcademicCatalogRelation` | Academic catalog relations | -| `githubOrganizationMetadata` | `pulse:metadata` | - | GitHub metadata | - -**JSON-LD Type**: `schema:GitHubOrganization` - -**SHACL Shape**: Defined in PULSE ontology with properties: -- `pulse:username` (GitHub login) -- `pulse:hasRepository` (repositories) -- `schema:affiliation` (affiliations) - -## Academic Catalog Models - -### AcademicCatalogRelation - -Represents a relationship to an entity in an academic catalog (Infoscience, ORCID, ROR, etc.). 
- -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `catalogType` | `pulse:catalogType` | `xsd:string` | Catalog type (infoscience, orcid, ror, wikidata) | -| `entityType` | `pulse:entityType` | `xsd:string` | Entity type (person, organization, publication, project) | -| `entity` | `pulse:hasCatalogEntity` | `pulse:CatalogEntity` | The catalog entity | -| `confidence` | `pulse:confidence` | `xsd:decimal` | Confidence score (0.0-1.0) | -| `justification` | `pulse:justification` | `xsd:string` | Justification text | -| `matchedOn` | `pulse:matchedOn` | `xsd:string` | Fields matched on | - -**JSON-LD Type**: `pulse:AcademicCatalogRelation` - -**SHACL Shape**: Defined in PULSE ontology with constraints: -- `pulse:catalogType` (required, enum: infoscience, orcid, ror, wikidata) -- `pulse:entityType` (required, enum: person, organization, publication, project) -- `pulse:hasCatalogEntity` (required) -- `pulse:confidence` (required, range: 0.0-1.0) -- `pulse:justification` (required) - -### CatalogEntity - -Represents an entity from an academic catalog. 
- -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `uuid` | `pulse:uuid` | `xsd:string` | Unique identifier | -| `name` | `schema:name` | `xsd:string` | Entity name | -| `email` | `pulse:email` | `xsd:string` | Email address | -| `orcid` | `md4i:orcidId` | `xsd:string` | ORCID identifier | -| `affiliation` | `schema:affiliation` | `xsd:string` | Affiliation | -| `profileUrl` | `pulse:profileUrl` | `xsd:anyURI` | Profile URL | - -**JSON-LD Type**: `pulse:CatalogEntity` - -**SHACL Shape**: Defined in PULSE ontology with properties: -- `pulse:uuid` (required) -- `schema:name` (required) -- `pulse:email` (optional) -- `md4i:orcidId` (optional) -- `schema:affiliation` (optional) -- `pulse:profileUrl` (optional) - -### InfosciencePublication - -Publication from EPFL's Infoscience repository. - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `type` | `@type` | - | Type discriminator | -| `uuid` | `pulse:uuid` | `xsd:string` | DSpace UUID | -| `title` | `schema:name` | `xsd:string` | Publication title | -| `authors` | `schema:author` | `xsd:string` | Author names | -| `abstract` | `schema:abstract` | `xsd:string` | Abstract text | -| `doi` | `schema:identifier` | `xsd:string` | DOI | -| `publication_date` | `schema:datePublished` | `xsd:date` | Publication date | -| `publication_type` | `schema:additionalType` | `xsd:string` | Publication type | -| `url` | `schema:url` | `xsd:anyURI` | Infoscience URL | -| `repository_url` | `schema:codeRepository` | `xsd:anyURI` | Code repository | -| `lab` | `schema:affiliation` | `xsd:string` | Laboratory | -| `subjects` | `schema:keywords` | `xsd:string` | Subject keywords | - -**JSON-LD Type**: `schema:ScholarlyArticle` - -### InfoscienceAuthor - -Author/researcher from Infoscience. 
- -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `type` | `@type` | - | Type discriminator | -| `uuid` | `pulse:uuid` | `xsd:string` | DSpace UUID | -| `name` | `schema:name` | `xsd:string` | Full name | -| `email` | `pulse:email` | `xsd:string` | Email | -| `orcid` | `md4i:orcidId` | `xsd:string` | ORCID | -| `affiliation` | `schema:affiliation` | `xsd:string` | Affiliation | -| `profile_url` | `pulse:profileUrl` | `xsd:anyURI` | Infoscience profile | - -**JSON-LD Type**: `schema:Person` - -### InfoscienceLab - -Laboratory or organizational unit from Infoscience. - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `type` | `@type` | - | Type discriminator | -| `uuid` | `pulse:uuid` | `xsd:string` | DSpace UUID | -| `name` | `schema:name` | `xsd:string` | Lab name | -| `description` | `schema:description` | `xsd:string` | Description | -| `url` | `schema:url` | `xsd:anyURI` | Infoscience URL | -| `parent_organization` | `schema:parentOrganization` | `xsd:string` | Parent org | -| `website` | `schema:url` | `xsd:anyURI` | External website | -| `research_areas` | `schema:knowsAbout` | `xsd:string` | Research areas | - -**JSON-LD Type**: `schema:Organization` - -## Supporting Models - -### GitAuthor - -Git commit author information. - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `name` | `schema:name` | `xsd:string` | Author name | -| `email` | `pulse:email` | `xsd:string` | Email | -| `commits` | `pulse:commits` | `pulse:Commits` | Commit statistics | - -**JSON-LD Type**: `schema:Person` - -### Commits - -Commit statistics. 
- -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `total` | `pulse:totalCommits` | `xsd:integer` | Total commits | -| `firstCommitDate` | `pulse:firstCommitDate` | `xsd:date` | First commit date | -| `lastCommitDate` | `pulse:lastCommitDate` | `xsd:date` | Last commit date | - -### FundingInformation - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `identifier` | `schema:identifier` | `xsd:string` | Grant identifier | -| `fundingGrant` | `sd:fundingGrant` | `xsd:string` | Grant number | -| `fundingSource` | `sd:fundingSource` | `schema:Organization` | Funding organization | - -**JSON-LD Type**: `schema:Grant` - -### DataFeed - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `name` | `schema:name` | `xsd:string` | Name | -| `description` | `schema:description` | `xsd:string` | Description | -| `contentUrl` | `schema:contentUrl` | `xsd:anyURI` | Content URL | -| `measurementTechnique` | `schema:measurementTechnique` | `xsd:string` | Measurement technique | -| `variableMeasured` | `schema:variableMeasured` | `xsd:string` | Variable measured | - -**JSON-LD Type**: `schema:DataFeed` - -### Image - -| Pydantic Field | JSON-LD Property | RDF Type | Description | -|----------------|------------------|----------|-------------| -| `contentUrl` | `schema:contentUrl` | `xsd:anyURI` | Image URL | -| `keywords` | `schema:keywords` | `xsd:string` | Keywords | - -**JSON-LD Type**: `schema:ImageObject` - -## Enumerations - -### Discipline - -Scientific disciplines aligned with Wikidata entities. 
- -**JSON-LD Type**: `pulse:DisciplineEnumeration` - -**Values**: Mapped to Wikidata entities (e.g., `wd:Q420` for Biology, `wd:Q395` for Mathematics) - -Examples: -- `BIOLOGY` → `wd:Q420` -- `MATHEMATICS` → `wd:Q395` -- `PHYSICS` → `wd:Q413` -- `COMPUTER_ENGINEERING` → `wd:Q428691` - -### RepositoryType - -Repository classification. - -**JSON-LD Type**: `pulse:RepositoryTypeEnumeration` - -**Values**: -- `SOFTWARE` → `pulse:Software` -- `EDUCATIONAL_RESOURCE` → `pulse:EducationalResource` -- `DOCUMENTATION` → `pulse:Documentation` -- `DATA` → `pulse:Data` -- `OTHER` → `pulse:Other` - -## Usage Examples - -### Converting Pydantic to JSON-LD - -```python -from src.data_models.repository import SoftwareSourceCode -from src.data_models.conversion import convert_pydantic_to_jsonld - -# Create a Pydantic model instance -repo = SoftwareSourceCode( - name="My Research Software", - description="A tool for scientific computing", - codeRepository=["https://github.com/example/repo"], - license="https://spdx.org/licenses/MIT", - author=[ - Person( - name="Jane Doe", - orcid="0000-0002-1234-5678", - affiliation=["EPFL"] - ) - ], - repositoryType=RepositoryType.SOFTWARE, - repositoryTypeJustification=["Contains source code and documentation"] -) - -# Convert to JSON-LD -jsonld = convert_pydantic_to_jsonld(repo, base_url="https://github.com/example/repo") -``` - -### Converting JSON-LD to Pydantic - -```python -from src.data_models.conversion import convert_jsonld_to_pydantic - -jsonld_graph = [ - { - "@id": "https://github.com/example/repo", - "@type": "schema:SoftwareSourceCode", - "schema:name": "My Research Software", - "schema:description": "A tool for scientific computing", - # ... more properties - } -] - -repo = convert_jsonld_to_pydantic(jsonld_graph) -``` - -## SHACL Validation - -The PULSE ontology includes SHACL shapes for validation. 
Key constraints: - -### schema:SoftwareSourceCode -- `schema:name` (required, max 60 chars) -- `schema:description` (required, max 2000 chars) -- `schema:codeRepository` (required, pattern: `^http.*`) -- `schema:dateCreated` (required, datatype: xsd:date) -- `schema:license` (required, pattern: `.*spdx\.org.*`) -- `schema:author` (required, Person or Organization) -- `pulse:discipline` (class: pulse:DisciplineEnumeration) -- `pulse:repositoryType` (class: pulse:RepositoryTypeEnumeration) - -### pulse:AcademicCatalogRelation -- All fields required except `matchedOn` -- `confidence` must be between 0.0 and 1.0 -- `catalogType` must be one of: infoscience, orcid, ror, wikidata -- `entityType` must be one of: person, organization, publication, project - -## References - -- PULSE Ontology: `https://open-pulse.epfl.ch/ontology#` -- Schema.org: `http://schema.org/` -- Software Description Ontology: `https://w3id.org/okn/o/sd#` -- Metadata4Ing: `http://w3id.org/nfdi4ing/metadata4ing#` -- Wikidata: `http://www.wikidata.org/entity/` - -## Version History - -- **2025-11-06**: Updated to align with PULSE ontology, added academic catalog relations -- **Previous**: Based on imaging-plaza ontology +- The map name is historical (`PYDANTIC_TO_ZOD_MAPPING`) but is used by both JSON-LD and Zod-compatible conversion paths. +- If you add fields to models, update this mapping and associated tests/conversion checks. diff --git a/docs/UPDATES_SUMMARY.md b/docs/UPDATES_SUMMARY.md index 331fb3c..a5c5eab 100644 --- a/docs/UPDATES_SUMMARY.md +++ b/docs/UPDATES_SUMMARY.md @@ -1,422 +1,22 @@ -# Recent Updates Summary +# Updates Summary -## Date: October 31, 2025 +This page summarizes the current documentation baseline for `v2.0.0`. 
-### JSON-LD Conversion System ✅ +## Documentation structure now aligned with code -**New Files:** -- `docs/JSONLD_CONVERSION.md` - Comprehensive guide for JSON-LD conversion and extension +- API and runtime entrypoints documented from `src/api.py` and `src/analysis/*`. +- Architecture diagrams updated to reflect repository, user, and organization flows. +- Infoscience and academic catalog pages updated to match current tool and model names. +- JSON-LD docs now reference the active conversion implementation in `src/data_models/conversion.py`. -**Updated Files:** -- `.cursor/rules/fastapi-patterns.mdc` - Added JSON-LD endpoint patterns -- `.cursor/rules/pydantic-models.mdc` - Added JSON-LD conversion documentation -- `src/data_models/api.py` - Fixed Union type ordering for JSON-LD support -- `src/data_models/conversion.py` - Extended field mappings for SoftwareSourceCode -- `src/data_models/repository.py` - Added `convert_pydantic_to_jsonld()` method -- `src/api.py` - Implemented `/v1/repository/llm/json-ld/` and `/v1/repository/gimie/json-ld/` endpoints +## Key behavior notes now explicitly documented -#### Key Features Implemented +- Cache-first execution with `force_refresh` bypass behavior. +- Optional enrichment toggles (`enrich_orgs`, `enrich_users`). +- API `stats` payload includes token and timing metadata. +- CLI status clarified: the imports in the legacy `src/main.py` entrypoint need to be refreshed to match the current layout. 
-✅ **Generic Conversion System** -- `convert_pydantic_to_jsonld()` function works with any Pydantic model -- Recursive conversion of nested models and lists -- Automatic `@id` and `@type` generation -- Special handling for URLs, dates, and enums +## Remaining known implementation gaps called out in docs -✅ **Field Mapping System** -- `PYDANTIC_TO_ZOD_MAPPING` dictionary maps Pydantic fields to semantic URIs -- Support for multiple namespaces: `schema:`, `sd:`, `imag:`, `md4i:` -- Complete mappings for `SoftwareSourceCode`, `Person`, `Organization`, `InfoscienceEntity` - -✅ **API Integration** -- Two JSON-LD endpoints: `/v1/repository/gimie/json-ld/` (GIMIE-only) and `/v1/repository/llm/json-ld/` (full LLM enrichment) -- Fixed Pydantic Union type coercion issue by reordering `APIOutput.output` Union -- Added field validator and model serializer to preserve raw JSON-LD dicts -- Comprehensive error handling and validation -- OpenAPI examples showing realistic JSON-LD output - -✅ **Documentation** -- Complete guide covering architecture, how it works, and extension process -- Step-by-step examples for adding JSON-LD to new models -- Field mapping reference with all current mappings -- Troubleshooting guide for common issues -- Best practices for semantic web integration - -#### Critical Implementation Detail: Union Type Ordering - -**Problem:** Pydantic's `Union` validation goes left-to-right. If Pydantic models come before `dict` in a Union, Pydantic tries to coerce JSON-LD dictionaries into models, corrupting the structure. 
- -**Solution:** Order Union types with `dict` and `list` FIRST: - -```python -# ✅ CORRECT -output: Union[dict, list, SoftwareSourceCode, GitHubOrganization, GitHubUser, Any] - -# ❌ WRONG - Causes JSON-LD to be coerced to GitHubOrganization -output: Union[SoftwareSourceCode, GitHubOrganization, GitHubUser, dict, list, Any] -``` - -**Additional Safeguards:** -```python -@field_validator("output", mode="before") -@classmethod -def preserve_dict_output(cls, v): - """Preserve dict/list output without converting to models.""" - if isinstance(v, (dict, list)): - return v - return v - -@model_serializer(mode='wrap') -def serialize_model(self, serializer): - """Custom serializer to preserve dict/list in output field.""" - data = serializer(self) - if isinstance(self.output, (dict, list)): - data['output'] = self.output - return data -``` - -#### JSON-LD Output Structure - -```json -{ - "@context": { - "schema": "http://schema.org/", - "sd": "https://w3id.org/okn/o/sd#", - "imag": "https://imaging-plaza.epfl.ch/ontology/", - "md4i": "https://w3id.org/md4i/" - }, - "@graph": [ - { - "@id": "https://github.com/user/repo", - "@type": "http://schema.org/SoftwareSourceCode", - "schema:name": {"@value": "Repository Name"}, - "schema:author": [ - { - "@type": "http://schema.org/Person", - "schema:name": {"@value": "Jane Doe"}, - "md4i:orcidId": {"@id": "https://orcid.org/0000-0001-2345-6789"} - } - ], - "imag:relatedToEPFL": true, - "imag:relatedToOrganizationsROR": [ - { - "@type": "http://schema.org/Organization", - "schema:legalName": {"@value": "EPFL"}, - "md4i:hasRorId": {"@id": "https://ror.org/03yrm5c26"} - } - ] - } - ] -} -``` - -#### Extension Process - -To add JSON-LD support to a new model (e.g., `GitHubUser`): - -1. **Add field mappings** in `src/data_models/conversion.py`: - ```python - PYDANTIC_TO_ZOD_MAPPING["GitHubUser"] = { - "name": "schema:name", - "githubHandle": "schema:identifier", - # ... more fields - } - ``` - -2. 
**Add type mapping** in `convert_pydantic_to_jsonld()`: - ```python - type_mapping = { - GitHubUser: "http://schema.org/Person", - } - ``` - -3. **Add model method** in model file: - ```python - def convert_pydantic_to_jsonld(self) -> dict: - from src.data_models.conversion import convert_pydantic_to_jsonld - base_url = f"https://github.com/{self.githubHandle}" - return convert_pydantic_to_jsonld(self, base_url=base_url) - ``` - -4. **Update `dump_results()`** in analysis class: - ```python - def dump_results(self, output_type: str = "pydantic"): - if output_type == "json-ld": - return self.data.convert_pydantic_to_jsonld() - # ... other formats - ``` - -5. **Create API endpoint** following the pattern in `src/api.py` - -#### Benefits - -✅ **Semantic Web Compatibility**: Standard JSON-LD format works with RDF tools -✅ **Extensible Design**: Easy to add JSON-LD to any Pydantic model -✅ **Imaging Plaza Integration**: Uses Imaging Plaza ontology and schema.org -✅ **Comprehensive Documentation**: Clear guide for future development -✅ **Type Safety**: Pydantic validation + custom serializers preserve structure -✅ **Namespace Support**: Multiple ontologies (schema.org, custom EPFL ontologies) - ---- - -## Date: October 29, 2025 - -### 1. Cache Configuration Changes ✅ - -**File:** `src/cache/cache_config.py` - -**Changes:** -- **All cache TTLs increased from short durations to 365 days** (essentially permanent storage) -- Cache only refreshes when explicitly using `force_refresh=true` - -**Before:** -```python -"gimie": 1 day # Was expiring too quickly! 
-"llm": 30 days -"github_user": 7 days -"github_org": 7 days -"orcid": 14 days -"llm_user": 7 days -"llm_org": 7 days -``` - -**After:** -```python -"gimie": 365 days # ✅ Essentially permanent -"llm": 365 days # ✅ Essentially permanent -"github_user": 365 days -"github_org": 365 days -"orcid": 365 days -"llm_user": 365 days -"llm_org": 365 days -``` - -**Benefits:** -- Cache persists across restarts -- No unexpected cache expiration -- Reduces API calls significantly -- Only refreshes when you explicitly request it - ---- - -### 2. Infoscience API Integration ✅ - -**New Files:** -- `src/data_models/infoscience.py` - Pydantic models for Infoscience entities -- `src/context/infoscience.py` - API client and PydanticAI tool functions -- `INFOSCIENCE_INTEGRATION.md` - Comprehensive integration documentation - -**Modified Files:** -- `src/agents/repository.py` - Registered Infoscience tools -- `src/agents/user.py` - Registered author search tools -- `src/agents/organization_enrichment.py` - Registered lab/publication tools -- `src/agents/repository_prompts.py` - Updated with tool usage guidelines -- `src/agents/prompts.py` - Updated user agent prompts -- `src/agents/organization_prompts.py` - Updated org agent prompts -- `src/context/__init__.py` - Exported Infoscience tools -- `src/data_models/__init__.py` - Exported Infoscience models - -#### New Tool Functions - -**1. `search_infoscience_publications_tool(query: str, max_results: int = 10)`** -- Search publications by title, DOI, keywords -- Returns markdown-formatted results -- In-memory caching to prevent duplicate searches - -**2. `search_infoscience_authors_tool(name: str, max_results: int = 10)`** -- Search for EPFL authors/researchers -- Returns author profiles with publications count - -**3. `search_infoscience_labs_tool(name: str, max_results: int = 10)`** -- Search for labs/organizational units -- Returns community/collection information - -**4. 
`get_author_publications_tool(author_name: str, max_results: int = 10)`** -- Get all publications by a specific author -- Includes metadata (DOI, date, abstract) - -#### Features Implemented - -✅ **API Integration** -- Base URL: `https://infoscience.epfl.ch/server/api` -- DSpace 7.6 API compatible -- Async HTTP with `httpx` -- Optional authentication via `INFOSCIENCE_TOKEN` - -✅ **In-Memory Caching** -- Prevents duplicate API calls within a session -- Caches both successful results and empty results -- Automatic cache key generation - -✅ **Pydantic Models** -- `InfosciencePublication` - Publication metadata with DOI, authors, abstract -- `InfoscienceAuthor` - Author profiles with affiliations -- `InfoscienceLab` - Lab/organizational unit metadata -- `InfoscienceSearchResult` - Wrapper with pagination info -- All models include `to_markdown()` methods - -✅ **Strategic Tool Usage** -- Agents instructed to search for repository/tool name FIRST -- ONE search per subject to avoid repetition -- Maximum 2 attempts per subject -- Accept when information is not found - -✅ **Error Handling** -- Graceful handling of HTTP errors (404, timeouts) -- Informative error messages in markdown format -- Comprehensive logging with debug/info/error levels - -#### Agent Integration - -**Repository Agent:** -- Searches for publications about the repository/tool itself -- Example: Repository "gimie" → searches "gimie" in Infoscience - -**User Enrichment Agent:** -- Searches for authors by name -- Gets their publication lists from Infoscience - -**Organization Enrichment Agent:** -- Searches for labs/organizational units -- Finds affiliated publications -- Can search by repository name to find related research - ---- - -### 3. 
Documentation Updates ✅ - -**Updated Files:** -- `.cursor/rules/ai-agents.mdc` - Added Infoscience tools section -- `.cursor/rules/project-architecture.mdc` - Added Infoscience integration details - -#### Changes in `ai-agents.mdc` - -**Added Section: "Infoscience Tools"** -- Complete tool function documentation -- Usage guidelines and strategic patterns -- Integration details for each agent type -- Caching behavior explanation - -**Updated Section: "Data Sources"** -- Added Infoscience as a primary data source -- Documented API endpoints and authentication - -#### Changes in `project-architecture.mdc` - -**Updated Directory Structure:** -- Added `context/infoscience.py` reference -- Added `data_models/infoscience.py` reference - -**New Module Documentation:** -- `context/` module purpose and patterns -- Infoscience integration architecture - -**Updated External Services:** -- Added Infoscience API details -- Documented DSpace 7.6 endpoints -- Added authentication requirements - -**Updated Environment Variables:** -- Added `INFOSCIENCE_TOKEN` (optional) -- Added all cache TTL configuration options -- Documented 365-day default TTL - -**Updated Cache Configuration:** -- Detailed TTL settings for all cache types -- Explained permanent storage behavior -- Documented `force_refresh` behavior - ---- - -## Environment Variables Reference - -### New/Updated Variables - -```bash -# Infoscience API (Optional) -INFOSCIENCE_TOKEN=your_token_here - -# Cache TTL Configuration (All default to 365 days) -CACHE_DEFAULT_TTL_DAYS=365 -CACHE_GIMIE_TTL_DAYS=365 -CACHE_LLM_TTL_DAYS=365 -CACHE_GITHUB_USER_TTL_DAYS=365 -CACHE_GITHUB_ORG_TTL_DAYS=365 -CACHE_ORCID_TTL_DAYS=365 -CACHE_LLM_USER_TTL_DAYS=365 -CACHE_LLM_ORG_TTL_DAYS=365 -``` - ---- - -## Testing the Changes - -### Test Cache TTL Changes -```bash -# Run API request -curl "http://localhost:1234/v1/repository/llm/json/https%3A//github.com/user/repo" - -# Check logs - should show "expires in 365 days" -# Second request should use 
cached data -``` - -### Test Infoscience Tools -```bash -# Run analysis with org enrichment (uses Infoscience tools) -curl "http://localhost:1234/v1/repository/llm/json/https%3A//github.com/sdsc-ordes/gimie?enrich_orgs=true" - -# Check logs for: -# - "🔍 Agent tool called: search_infoscience_publications_tool" -# - "⚡ Returning cached result" (on second call) -``` - ---- - -## Benefits Summary - -### Cache Changes -✅ Cache persists essentially forever (365 days) -✅ Significantly reduced API calls -✅ Faster response times on repeated requests -✅ Only refreshes when explicitly requested - -### Infoscience Integration -✅ Rich EPFL research context for repositories -✅ Author publication history integration -✅ Lab/organization affiliation data -✅ Strategic tool usage prevents excessive API calls -✅ In-memory caching for efficient agent behavior - -### Documentation -✅ Comprehensive rule files for future reference -✅ Clear integration patterns documented -✅ Environment variable reference updated -✅ Tool usage guidelines for AI agents - ---- - -## Next Steps (Optional) - -1. **Set Infoscience Token** (if needed for protected endpoints): - ```bash - export INFOSCIENCE_TOKEN=your_token_here - ``` - -2. **Monitor Agent Behavior**: - - Check logs for tool usage patterns - - Verify caching is working (look for "⚡ Returning cached result") - - Ensure agents don't make repetitive searches - -3. **Adjust Cache TTL** (if needed): - - Default 365 days should work for most cases - - Can increase to 3650 days (10 years) if desired - - Or set per-API-type using environment variables - -4. **Review Infoscience Results**: - - Check quality of publication searches - - Verify author/lab searches return relevant data - - Monitor API response times and errors - ---- - -**All updates completed successfully!** 🎉 +- Repository author-level linked-entity assignment remains a scaffolded follow-up path. 
+- Reverse JSON-LD conversion is strongest for repository graphs; user/org reverse paths remain simplified. diff --git a/docs/api-and-cli.md b/docs/api-and-cli.md new file mode 100644 index 0000000..1e2a51b --- /dev/null +++ b/docs/api-and-cli.md @@ -0,0 +1,66 @@ +# API and CLI + +## Main entrypoints + +- API app: `src/api.py` +- Repository analysis orchestrator: `src/analysis/repositories.py` +- User analysis orchestrator: `src/analysis/user.py` +- Organization analysis orchestrator: `src/analysis/organization.py` + +## Active API endpoints + +- `GET /v1/repository/gimie/json-ld/{full_path:path}`: GIMIE-only JSON-LD extraction. +- `GET /v1/repository/llm/json/{full_path:path}`: repository Pydantic output. +- `GET /v1/repository/llm/json-ld/{full_path:path}`: repository JSON-LD output. +- `GET /v1/user/llm/json/{full_path:path}`: user profile analysis. +- `GET /v1/org/llm/json/{full_path:path}`: organization analysis. +- Cache management endpoints under `/v1/cache/*`. + +Common query flags: + +- `force_refresh=true`: bypass cache. +- `enrich_orgs=true`: run organization enrichment. +- `enrich_users=true`: run user enrichment (repository/user routes). 
+ +## Response shape + +All analysis endpoints return `APIOutput` (`src/data_models/api.py`): + +- `link` +- `type` (`repository`, `user`, `organization`) +- `parsedTimestamp` +- `output` (model object or JSON-LD dict) +- `stats` (`APIStats` token usage, duration, GitHub rate-limit headers) + +## Endpoint-to-pipeline map + +```mermaid +flowchart LR + A["/repository/llm/*"] --> R[Repository.run_analysis] + B["/user/llm/json/*"] --> U[User.run_analysis] + C["/org/llm/json/*"] --> O[Organization.run_analysis] + + R --> RA[Atomic repository pipeline + optional enrichments] + U --> UA[GitHub parse + user/org enrich + linked entities + EPFL] + O --> OA[Atomic organization pipeline] +``` + +## Run the API + +```bash +just serve-dev +``` + +## Smoke tests + +```bash +just api-test-gimie +just api-test-extract +just api-test-extract-refresh +``` + +## CLI status and alternatives + +- `just extract ...` currently calls `src/main.py`, which still imports legacy `core.*` modules. +- Use API endpoints for full extraction flows. +- Use `scripts/convert_json_jsonld.py` for JSON <-> JSON-LD conversion workflows. diff --git a/docs/architecture/design-notes.md b/docs/architecture/design-notes.md new file mode 100644 index 0000000..99101cb --- /dev/null +++ b/docs/architecture/design-notes.md @@ -0,0 +1,97 @@ +# Design Notes + +This page documents the current runtime architecture in `v2.0.0`. + +## Component architecture + +```mermaid +flowchart TB + subgraph Client + C1[HTTP client / frontend] + end + + subgraph API + A1[FastAPI router
src/api.py] + A2[Request logging middleware
AsyncRequestContext] + end + + subgraph Analysis + R[Repository analysis] + U[User analysis] + O[Organization analysis] + AA[Atomic agents] + end + + subgraph Data + M[Data models
src/data_models] + Cache[SQLite cache
src/cache] + end + + subgraph External + G[GitHub API] + I[Infoscience API] + ORCID[ORCID / Selenium] + ROR[ROR API] + LLM[Configured LLM provider] + GIMIE[GIMIE] + end + + C1 --> A2 --> A1 + A1 --> R + A1 --> U + A1 --> O + + R --> AA + U --> AA + O --> AA + + R --> M + U --> M + O --> M + + R --> Cache + U --> Cache + O --> Cache + + AA --> LLM + R --> GIMIE + R --> G + U --> G + O --> G + AA --> I + AA --> ROR + R --> ORCID +``` + +## Repository request sequence + +```mermaid +sequenceDiagram + autonumber + participant Client + participant API as FastAPI /v1/repository/llm/json + participant Repo as Repository.run_analysis + participant Cache as CacheManager + participant Pipe as Atomic pipeline + enrichments + + Client->>API: GET /v1/repository/llm/json/{url} + API->>Repo: initialize + run_analysis(...) + Repo->>Cache: check repository cache + alt cache hit and not force_refresh + Cache-->>Repo: cached object + Repo-->>API: output + stats + else cache miss or forced refresh + Repo->>Pipe: run GIMIE + atomic stages + Pipe-->>Repo: structured repository model + Repo->>Pipe: optional enrichments + final EPFL assessment + Repo->>Cache: persist final model + Repo-->>API: output + stats + end + API-->>Client: APIOutput +``` + +## Notes + +- Cache TTL defaults are configured in `src/cache/cache_config.py` (default 365 days unless overridden). +- Repository pipeline includes optional enrichments (`enrich_orgs`, `enrich_users`) and always runs final validation before caching. +- Organization analysis uses an atomic 6-stage flow; user analysis combines GitHub parsing + LLM + enrichment steps. 
diff --git a/docs/assets/javascripts/mermaid-init.js b/docs/assets/javascripts/mermaid-init.js new file mode 100644 index 0000000..1e031bd --- /dev/null +++ b/docs/assets/javascripts/mermaid-init.js @@ -0,0 +1,23 @@ +(function () { + const renderMermaid = () => { + if (typeof mermaid === "undefined") { + return; + } + + mermaid.initialize({ + startOnLoad: false, + securityLevel: "loose", + theme: "default", + }); + + mermaid.run({ + querySelector: ".mermaid", + }); + }; + + if (typeof document$ !== "undefined" && document$.subscribe) { + document$.subscribe(renderMermaid); + } else { + window.addEventListener("load", renderMermaid); + } +})(); diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..1fb23b8 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,76 @@ +# Getting Started + +## 1. Install dependencies + +```bash +just install-dev +``` + +## 2. Configure environment + +Create `.env` from `.env.dist` (or update your existing `.env`) and set the required variables: + +- `OPENAI_API_KEY` +- `OPENROUTER_API_KEY` +- `GITHUB_TOKEN` +- `GITLAB_TOKEN` +- `INFOSCIENCE_TOKEN` +- `MODEL` +- `PROVIDER` +- `SELENIUM_REMOTE_URL` +- `CACHE_DB_PATH` +- `MAX_SELENIUM_SESSIONS` +- `MAX_CACHE_ENTRIES` +- `GUNICORN_CMD_ARGS` + +Commonly used additional variable: + +- `RCP_TOKEN` (used by the configured `openai-compatible` endpoint in `src/llm/model_config.py`) + +## 3. Run the API locally + +```bash +just serve-dev +``` + +Swagger UI: + +- `http://localhost:1234/docs` + +## 4. Run tests and checks + +```bash +just test +just lint +just type-check +``` + +Or all checks together: + +```bash +just ci +``` + +## 5. 
Build and preview docs + +```bash +uv pip install -e ".[docs]" +just docs-build +just docs-serve +``` + +## Local workflow + +```mermaid +flowchart LR + A[Edit code or prompts] --> B[Run tests and checks] + B --> C[Run API locally] + C --> D[Call endpoints / inspect output] + D --> A +``` + +## CLI status + +- The primary production interface is the FastAPI service (`src/api.py`). +- `src/main.py` currently references legacy imports (`core.*`) and does not run as-is in the current v2 layout. +- For conversion workflows, use `scripts/convert_json_jsonld.py`. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..bfaf66d --- /dev/null +++ b/docs/index.md @@ -0,0 +1,43 @@ +# Git Metadata Extractor Documentation + +Git Metadata Extractor analyzes Git repositories, GitHub users, and GitHub organizations, then enriches the output with LLM-based classification and academic catalog context. + +## Documentation map + +- [Getting Started](getting-started.md) +- [API and CLI](api-and-cli.md) +- [Design Notes](architecture/design-notes.md) +- [Repository Analysis Strategy](AGENT_STRATEGY.md) +- [Legacy Releases](releases/legacy-releases.md) + +## System overview + +```mermaid +flowchart LR + A[Client or Automation] --> B[FastAPI app
src/api.py] + + B --> C1[Repository analysis
src/analysis/repositories.py] + B --> C2[User analysis
src/analysis/user.py] + B --> C3[Organization analysis
src/analysis/organization.py] + + C1 --> D[Atomic agents
src/agents/atomic_agents] + C2 --> D + C3 --> D + + C1 --> E[Data models
src/data_models] + C2 --> E + C3 --> E + + C1 --> F[Cache manager
src/cache] + C2 --> F + C3 --> F + + D --> G[LLM provider] + D --> H[Infoscience / ORCID / ROR / GitHub APIs] +``` + +## Versioning + +- `dev` and `latest` track the `main` branch documentation. +- Tagged releases publish immutable doc versions. +- `stable` points to the newest released docs. diff --git a/docs/releases/legacy-releases.md b/docs/releases/legacy-releases.md new file mode 100644 index 0000000..476d909 --- /dev/null +++ b/docs/releases/legacy-releases.md @@ -0,0 +1,19 @@ +# Legacy Releases + +This project publishes versioned docs with MkDocs + Mike. The current full docs baseline starts at `v2.0.0`. + +## `v1.0.0` + +- No maintained `docs/` site structure equivalent to the current versioned documentation. +- Refer to release history in `CHANGELOG.md`. + +## `v0.1.0` + +- Included minimal documentation artifacts only. +- Refer to release history in `CHANGELOG.md` for historical context. + +## Version selector policy + +- `dev` and `latest`: docs built from `main`. +- Tagged releases: immutable documentation versions. +- `stable`: alias to the newest released version. 
diff --git a/justfile b/justfile index be9461a..162c6e5 100644 --- a/justfile +++ b/justfile @@ -177,6 +177,34 @@ docs: @echo "Opening API documentation at http://localhost:{{PORT}}/docs" @if command -v xdg-open > /dev/null; then xdg-open http://localhost:{{PORT}}/docs; elif command -v open > /dev/null; then open http://localhost:{{PORT}}/docs; else echo "Please open http://localhost:{{PORT}}/docs in your browser"; fi +# ============================================================================ +# Project Documentation Site (MkDocs + Mike) +# ============================================================================ + +# Serve project documentation locally with live reload +docs-serve: + mkdocs serve + +# Build project documentation and fail on warnings +docs-build: + mkdocs build --strict + +# List published documentation versions and aliases +docs-version-list: + mike list + +# Deploy docs from current branch as dev + latest aliases +docs-deploy-dev: + mike deploy --push --branch gh-pages --update-aliases dev latest + +# Deploy docs for a specific release version and update stable alias +docs-deploy-release VERSION: + mike deploy --push --branch gh-pages --update-aliases {{VERSION}} stable + +# Set the default docs version/alias +docs-set-default VERSION: + mike set-default --push --branch gh-pages {{VERSION}} + # Test the main extract endpoint api-test-extract: curl -X GET "http://localhost:{{PORT}}/v1/extract/json/https://github.com/qchapp/lungs-segmentation" | python -m json.tool diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..a5e81e8 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,62 @@ +site_name: Git Metadata Extractor Documentation +site_url: https://imaging-plaza.github.io/git-metadata-extractor/ +repo_url: https://github.com/Imaging-Plaza/git-metadata-extractor +repo_name: Imaging-Plaza/git-metadata-extractor +edit_uri: edit/main/docs/ +docs_dir: docs +site_dir: site + +theme: + name: material + features: + - content.code.copy + - 
navigation.sections + +plugins: + - search + +extra: + version: + provider: mike + +markdown_extensions: + - admonition + - attr_list + - md_in_html + - toc: + permalink: true + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + +extra_javascript: + - https://unpkg.com/mermaid@11.4.1/dist/mermaid.min.js + - assets/javascripts/mermaid-init.js + +nav: + - Home: index.md + - Getting Started: getting-started.md + - API and CLI: api-and-cli.md + - Architecture: + - Design Notes: architecture/design-notes.md + - Repository Analysis Strategy: AGENT_STRATEGY.md + - Integrations: + - Infoscience Integration: INFOSCIENCE_INTEGRATION.md + - Infoscience API Findings: INFOSCIENCE_API_FINDINGS.md + - JSON-LD: + - Conversion Guide: JSONLD_CONVERSION.md + - Mapping Documentation: PYDANTIC_JSONLD_MAPPING.md + - Conversion Summary: JSONLD_CONVERSION_SUMMARY.md + - Mapping Update: JSONLD_MAPPING_UPDATE.md + - JSON and JSON-LD CLI: JSON_JSONLD_CONVERSION_CLI.md + - Academic Catalog: + - Option B Implementation: ACADEMIC_CATALOG_OPTION_B_IMPLEMENTATION.md + - Refactor Summary: ACADEMIC_CATALOG_REFACTOR_SUMMARY.md + - Updates: + - Affiliation Changes: AFFILIATION_CHANGES.md + - Estimated Tokens Fix: ESTIMATED_TOKENS_FIX.md + - Updates Summary: UPDATES_SUMMARY.md + - Releases: + - Legacy Releases: releases/legacy-releases.md diff --git a/pyproject.toml b/pyproject.toml index ed7f38c..052af75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "git-metadata-extractor" -version = "2.0.0" +version = "2.0.1" description = "This project is designed to classify imaging software repositories and extract relevant information using AI models." 
readme = "README.md" requires-python = ">=3.10" @@ -67,6 +67,12 @@ dev = [ "ruff>=0.1.0", "mypy>=1.0.0" ] +docs = [ + "mkdocs>=1.6.0", + "mkdocs-material>=9.5.0", + "mike>=2.1.3", + "pymdown-extensions>=10.14.0" +] [tool.ruff] line-length = 88 diff --git a/src/api.py b/src/api.py index 31a47a2..4667ca3 100644 --- a/src/api.py +++ b/src/api.py @@ -60,7 +60,7 @@ - **Organization Endpoints**: Process GitHub organization data - **Cache Management**: Monitor and control the caching system """, - version="2.0.0", + version="2.0.1", contact={ "name": "EPFL Center for Imaging / SDSC", "url": "https://imaging-plaza.epfl.ch", @@ -174,7 +174,7 @@ def index(): Returns basic information about the API version, GIMIE version, and configured LLM model. """ return { - "title": f"Hello, welcome to the Git Metadata Extractor v2.0.0. Gimie Version 0.7.2. LLM Model {os.environ.get('MODEL', 'N/A (configured via model configs)')}", + "title": f"Hello, welcome to the Git Metadata Extractor v2.0.1. Gimie Version 0.7.2. LLM Model {os.environ.get('MODEL', 'N/A (configured via model configs)')}", }