diff --git a/__tests__/utils/schemaOrg/anvilDataset.test.ts b/__tests__/utils/schemaOrg/anvilDataset.test.ts new file mode 100644 index 000000000..26ca01b67 --- /dev/null +++ b/__tests__/utils/schemaOrg/anvilDataset.test.ts @@ -0,0 +1,182 @@ +import type { DatasetsResponse } from "../../../app/apis/azul/anvil-cmg/common/responses"; +import { buildAnvilDatasetJsonLd } from "../../../app/utils/schemaOrg/anvilDataset"; +import { DESCRIPTION_LENGTH } from "../../../app/utils/schemaOrg/constants"; + +const BROWSER_URL = "https://explore.anvilproject.org"; + +/** + * Builds a minimal valid AnVIL datasets response with optional overrides for + * top-level (dataset) and aggregated (activity/biosample/donor/file/library/ + * diagnosis) fields. + * @param overrides - Partial overrides applied to the base response. + * @returns A `DatasetsResponse` shape suitable for builder tests. + */ +function makeDatasetsResponse( + overrides: Partial = {} +): DatasetsResponse { + return { + activities: [], + biosamples: [], + datasets: [ + { + accessible: true, + consent_group: ["NRES"], + dataset_id: "uuid-1", + description: + "A multi-cohort study of rare disease across many donors and biosamples.", + duos_id: null, + registered_identifier: [], + title: "Rare disease dataset", + }, + ], + diagnoses: [], + donors: [], + entryId: "abc", + files: [], + libraries: [], + status: 200, + ...overrides, + } as unknown as DatasetsResponse; +} + +describe("buildAnvilDatasetJsonLd", () => { + it("returns undefined when no dataset is present", () => { + const response = makeDatasetsResponse({ datasets: [] }); + expect(buildAnvilDatasetJsonLd(response, BROWSER_URL)).toBeUndefined(); + }); + + it("populates required Schema.org Dataset fields", () => { + const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL); + expect(result).toBeDefined(); + expect(result!["@context"]).toBe("https://schema.org"); + expect(result!["@type"]).toBe("Dataset"); + expect(result!.name).toBe("Rare disease dataset"); + expect(result!.description).toBe( + "A multi-cohort study of rare disease across many donors and biosamples." + ); + expect(result!.url).toBe(`${BROWSER_URL}/datasets/uuid-1`); + expect(result!.identifier).toEqual(["uuid-1"]); + expect(result!.isAccessibleForFree).toBe(true); + expect(result!.includedInDataCatalog).toEqual({ + "@type": "DataCatalog", + name: "AnVIL Data Explorer", + url: BROWSER_URL, + }); + }); + + it("falls back to dataset_id when title is empty", () => { + const response = makeDatasetsResponse(); + response.datasets[0].title = ""; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.name).toBe("uuid-1"); + }); + + it("strips HTML tags from description", () => { + const response = makeDatasetsResponse(); + response.datasets[0].description = + "

Single-cell RNA-seq data across many donors and tissues.

"; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Single-cell RNA-seq data across many donors and tissues." + ); + }); + + it("truncates descriptions over 5000 characters and appends an ellipsis", () => { + const longDescription = "a".repeat(DESCRIPTION_LENGTH.MAX + 200); + const response = makeDatasetsResponse(); + response.datasets[0].description = longDescription; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toHaveLength(DESCRIPTION_LENGTH.MAX); + expect(result!.description.endsWith("…")).toBe(true); + }); + + it("pads short descriptions with name and catalog context to meet the 50-char minimum", () => { + const response = makeDatasetsResponse(); + response.datasets[0].description = "Short."; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Rare disease dataset — Short. — AnVIL Data Explorer dataset." + ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("falls back to dataset name plus catalog context when description is missing", () => { + const response = makeDatasetsResponse(); + response.datasets[0].description = undefined; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Rare disease dataset — AnVIL Data Explorer dataset." + ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("includes registered_identifier values in identifier and dbGaP study URLs in sameAs", () => { + const response = makeDatasetsResponse(); + response.datasets[0].registered_identifier = [ + "phs000123", + "phs000456", + null, + ]; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.identifier).toEqual(["uuid-1", "phs000123", "phs000456"]); + expect(result!.sameAs).toEqual([ + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000123", + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000456", + ]); + }); + + it("omits sameAs when there are no registered identifiers", () => { + const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL); + expect(result!.sameAs).toBeUndefined(); + }); + + it("builds deduplicated keywords from activity, biosample, donor, diagnosis, file, and library fields", () => { + const response = makeDatasetsResponse({ + activities: [ + { activity_type: ["sequencing"], data_modality: ["genomic"] }, + ], + biosamples: [{ anatomical_site: ["brain"], biosample_type: ["tissue"] }], + diagnoses: [{ disease: ["epilepsy"], phenotype: [] }], + donors: [ + { + organism_type: ["Homo sapiens"], + phenotypic_sex: ["female"], + reported_ethnicity: ["asian"], + }, + ], + files: [ + { + data_modality: ["genomic"], + file_format: ["fastq", "bam"], + file_id: "f1", + file_type: "sequencing", + }, + ], + libraries: [{ prep_material_name: ["DNA"] }], + } as unknown as Partial); + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.keywords).toEqual([ + "sequencing", + "genomic", + "brain", + "tissue", + "Homo sapiens", + "female", + "asian", + "epilepsy", + "fastq", + "bam", + "DNA", + ]); + }); + + it("omits keywords and sameAs when sources are empty", () => { + const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL); + expect(result!.keywords).toBeUndefined(); + expect(result!.sameAs).toBeUndefined(); + }); +}); diff --git a/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts b/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts new file mode 100644 index 000000000..64d5dde9a --- /dev/null +++ b/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts @@ -0,0 +1,289 @@ +import type { ProjectsResponse } from "../../../app/apis/azul/hca-dcp/common/responses"; +import { DESCRIPTION_LENGTH } from "../../../app/utils/schemaOrg/constants"; +import { buildHcaProjectJsonLd } from "../../../app/utils/schemaOrg/hcaProjectDataset"; + +const BROWSER_URL = "https://explore.data.humancellatlas.org"; + +/** + * Builds a minimal valid HCA project response with optional overrides for + * top-level (project) and aggregated (donor/sample/specimen/protocol) fields. + * @param overrides - Partial overrides applied to the base response. + * @returns A `ProjectsResponse` shape suitable for builder tests. + */ +function makeProjectsResponse( + overrides: Partial = {} +): ProjectsResponse { + return { + dates: [], + donorOrganisms: [], + entryId: "abc", + fileTypeSummaries: [], + projects: [ + { + accessible: true, + accessions: [], + bionetworkName: [], + contributedAnalyses: {}, + contributors: [], + dataUseRestriction: null, + duosId: null, + estimatedCellCount: null, + laboratory: [], + matrices: {}, + projectDescription: + "A study of cells across multiple human individuals examining inter-individual variation in gene expression.", + projectId: "uuid-1", + projectShortname: "Cell Study", + projectTitle: "Cells of the body", + publications: [], + supplementaryLinks: [], + tissueAtlas: [], + }, + ], + protocols: [], + samples: [], + specimens: [], + status: 200, + ...overrides, + } as unknown as ProjectsResponse; +} + +describe("buildHcaProjectJsonLd", () => { + it("returns undefined when no project is present", () => { + const response = makeProjectsResponse({ projects: [] }); + expect(buildHcaProjectJsonLd(response, BROWSER_URL)).toBeUndefined(); + }); + + it("populates required Schema.org Dataset fields", () => { + const result = buildHcaProjectJsonLd(makeProjectsResponse(), BROWSER_URL); + expect(result).toBeDefined(); + expect(result!["@context"]).toBe("https://schema.org"); + expect(result!["@type"]).toBe("Dataset"); + expect(result!.name).toBe("Cells of the body"); + expect(result!.description).toBe( + "A study of cells across multiple human individuals examining inter-individual variation in gene expression." + ); + expect(result!.url).toBe(`${BROWSER_URL}/projects/uuid-1`); + expect(result!.identifier).toEqual(["uuid-1"]); + expect(result!.isAccessibleForFree).toBe(true); + expect(result!.includedInDataCatalog).toEqual({ + "@type": "DataCatalog", + name: "Human Cell Atlas Data Coordination Platform", + url: BROWSER_URL, + }); + }); + + it("falls back to projectShortname when projectTitle is empty", () => { + const response = makeProjectsResponse(); + response.projects[0].projectTitle = ""; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.name).toBe("Cell Study"); + }); + + it("strips HTML tags from description", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = + "

Single-cell RNA-seq data across many cells and donors and tissues.

"; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Single-cell RNA-seq data across many cells and donors and tissues." + ); + }); + + it("pads short descriptions with name and catalog context to meet the 50-char minimum", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = "Short."; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Cells of the body — Short. — Human Cell Atlas Data Coordination Platform project." + ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("falls back to project name plus catalog context when description is empty", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = ""; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Cells of the body — Human Cell Atlas Data Coordination Platform project." + ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("truncates descriptions over 5000 characters and appends an ellipsis", () => { + const longDescription = "a".repeat(DESCRIPTION_LENGTH.MAX + 200); + const response = makeProjectsResponse(); + response.projects[0].projectDescription = longDescription; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toHaveLength(DESCRIPTION_LENGTH.MAX); + expect(result!.description.endsWith("…")).toBe(true); + }); + + it("includes accession ids in identifier and identifiers.org URLs in sameAs", () => { + const response = makeProjectsResponse(); + response.projects[0].accessions = [ + { accession: "GSE12345", namespace: "geo_series" }, + { accession: "PRJNA9999", namespace: "insdc_project" }, + { accession: "X", namespace: "unknown_namespace" }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.identifier).toEqual([ + "uuid-1", + "GSE12345", + "PRJNA9999", + "X", + ]); + expect(result!.sameAs).toEqual([ + "https://identifiers.org/geo:GSE12345", + "https://identifiers.org/ena.embl:PRJNA9999", + ]); + }); + + it("splits semicolon-separated accession ids in both identifier and sameAs", () => { + const response = makeProjectsResponse(); + response.projects[0].accessions = [ + { accession: "GSE12345; GSE67890", namespace: "geo_series" }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.identifier).toEqual(["uuid-1", "GSE12345", "GSE67890"]); + expect(result!.sameAs).toEqual([ + "https://identifiers.org/geo:GSE12345", + "https://identifiers.org/geo:GSE67890", + ]); + }); + + it("omits sameAs when no accessions map to a known namespace", () => { + const result = buildHcaProjectJsonLd(makeProjectsResponse(), BROWSER_URL); + expect(result!.sameAs).toBeUndefined(); + }); + + it("builds creators from contributors with affiliation", () => { + const response = makeProjectsResponse(); + response.projects[0].contributors = [ + { + contactName: "Smith,Alice,B", + email: null, + institution: "Example University", + }, + { + contactName: "", + email: null, + institution: "Should be skipped (no name)", + }, + { + contactName: "Jones,Bob", + email: null, + institution: "", + }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.creator).toEqual([ + { + "@type": "Person", + affiliation: { "@type": "Organization", name: "Example University" }, + name: "Alice B Smith", + }, + { "@type": "Person", name: "Bob Jones" }, + ]); + }); + + it("builds citations from publications using DOI then publicationUrl as sameAs", () => { + const response = makeProjectsResponse(); + response.projects[0].publications = [ + { + doi: "10.1000/example", + officialHcaPublication: true, + publicationTitle: "Cell Paper", + publicationUrl: "https://example.org/cell-paper", + }, + { + doi: null, + officialHcaPublication: false, + publicationTitle: "Other Paper", + publicationUrl: "https://example.org/other", + }, + { + doi: null, + officialHcaPublication: false, + publicationTitle: "", + publicationUrl: "https://example.org/no-title", + }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.citation).toEqual([ + { + "@type": "ScholarlyArticle", + headline: "Cell Paper", + name: "Cell Paper", + sameAs: "https://doi.org/10.1000/example", + }, + { + "@type": "ScholarlyArticle", + headline: "Other Paper", + name: "Other Paper", + sameAs: "https://example.org/other", + }, + ]); + }); + + it("builds deduplicated keywords from donor, sample, specimen, and protocol fields", () => { + const response = makeProjectsResponse({ + donorOrganisms: [ + { + biologicalSex: null, + developmentStage: [], + disease: ["normal"], + donorCount: 1, + genusSpecies: ["Homo sapiens"], + organismAge: null, + }, + ], + protocols: [ + { + libraryConstructionApproach: ["10x v2", "10x v3"], + }, + ], + samples: [ + { + disease: ["normal"], + id: ["s1"], + organ: ["brain"], + organPart: ["cortex"], + sampleEntityType: ["specimens"], + }, + ], + specimens: [ + { + disease: ["normal"], + id: ["s1"], + organ: ["brain"], + organPart: ["cortex"], + preservationMethod: [], + source: [], + }, + ], + } as unknown as Partial); + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.keywords).toEqual([ + "Homo sapiens", + "normal", + "brain", + "cortex", + "specimens", + "10x v2", + "10x v3", + ]); + }); + + it("omits keywords, creator, citation, sameAs when sources are empty", () => { + const result = buildHcaProjectJsonLd(makeProjectsResponse(), BROWSER_URL); + expect(result!.keywords).toBeUndefined(); + expect(result!.creator).toBeUndefined(); + expect(result!.citation).toBeUndefined(); + expect(result!.sameAs).toBeUndefined(); + }); +}); diff --git a/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts b/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts new file mode 100644 index 000000000..67688c89b --- /dev/null +++ b/__tests__/utils/schemaOrg/lungmapProjectDataset.test.ts @@ -0,0 +1,74 @@ +import type { ProjectsResponse } from "../../../app/apis/azul/hca-dcp/common/responses"; +import { buildLungmapProjectJsonLd } from "../../../app/utils/schemaOrg/lungmapProjectDataset"; + +const BROWSER_URL = "https://data-browser.lungmap.net"; + +/** + * Builds a minimal valid project response for the LungMAP wrapper. The full + * mapping is covered by `hcaProjectDataset.test.ts` (same shared core); this + * file only verifies the LungMAP-specific catalog identity surfaces correctly. + * @returns A `ProjectsResponse` shape sufficient for catalog-identity checks. + */ +function makeProjectsResponse(): ProjectsResponse { + return { + dates: [], + donorOrganisms: [], + entryId: "abc", + fileTypeSummaries: [], + projects: [ + { + accessible: true, + accessions: [], + bionetworkName: [], + contributedAnalyses: {}, + contributors: [], + dataUseRestriction: null, + duosId: null, + estimatedCellCount: null, + laboratory: [], + matrices: {}, + projectDescription: + "A study of lung development and disease across many donors.", + projectId: "uuid-1", + projectShortname: "Lung Study", + projectTitle: "Lung development atlas", + }, + ], + protocols: [], + samples: [], + specimens: [], + status: 200, + } as unknown as ProjectsResponse; +} + +describe("buildLungmapProjectJsonLd", () => { + it("returns undefined when no project is present", () => { + const response = { ...makeProjectsResponse(), projects: [] }; + expect( + buildLungmapProjectJsonLd(response as ProjectsResponse, BROWSER_URL) + ).toBeUndefined(); + }); + + it("surfaces LungMAP as the catalog identity and uses the projects URL pattern", () => { + const result = buildLungmapProjectJsonLd( + makeProjectsResponse(), + BROWSER_URL + ); + expect(result).toBeDefined(); + expect(result!.includedInDataCatalog).toEqual({ + "@type": "DataCatalog", + name: "LungMAP Data Explorer", + url: BROWSER_URL, + }); + expect(result!.url).toBe(`${BROWSER_URL}/projects/uuid-1`); + }); + + it("pads short descriptions with the LungMAP catalog suffix", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = "Short."; + const result = buildLungmapProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Lung development atlas — Short. — LungMAP Data Explorer project." + ); + }); +}); diff --git a/app/utils/schemaOrg/anvilDataset.ts b/app/utils/schemaOrg/anvilDataset.ts new file mode 100644 index 000000000..520986327 --- /dev/null +++ b/app/utils/schemaOrg/anvilDataset.ts @@ -0,0 +1,113 @@ +import type { DatasetEntity } from "../../apis/azul/anvil-cmg/common/entities"; +import type { DatasetsResponse } from "../../apis/azul/anvil-cmg/common/responses"; +import type { SchemaDataset } from "./types"; +import { buildDescription, uniqueNonEmpty } from "./utils"; + +const CATALOG_NAME = "AnVIL Data Explorer"; +const DESCRIPTION_FALLBACK_SUFFIX = `${CATALOG_NAME} dataset.`; +const DBGAP_STUDY_URL = + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="; + +/** + * Builds a Schema.org Dataset JSON-LD object for an AnVIL CMG dataset. + * + * Returns `undefined` when the response does not carry a dataset we can + * describe (i.e. no dataset entity), so the caller can skip rendering. + * @param data - AnVIL CMG dataset detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. + */ +export function buildAnvilDatasetJsonLd( + data: DatasetsResponse, + browserURL: string +): SchemaDataset | undefined { + const dataset = data.datasets?.[0]; + if (!dataset) return undefined; + + const name = dataset.title || dataset.dataset_id; + const description = buildDescription( + dataset.description || "", + name, + DESCRIPTION_FALLBACK_SUFFIX + ); + const identifier = uniqueNonEmpty([ + dataset.dataset_id, + ...dataset.registered_identifier, + ]); + + const jsonLd: SchemaDataset = { + "@context": "https://schema.org", + "@type": "Dataset", + description, + identifier, + includedInDataCatalog: { + "@type": "DataCatalog", + name: CATALOG_NAME, + url: browserURL, + }, + // Google's `isAccessibleForFree` is the inverse of "paid", not "unrestricted" — dbGaP gating doesn't change the value. + isAccessibleForFree: true, + name, + url: `${browserURL}/datasets/${dataset.dataset_id}`, + }; + + const sameAs = buildSameAs(dataset); + if (sameAs.length > 0) jsonLd.sameAs = sameAs; + + const keywords = buildKeywords(data); + if (keywords.length > 0) jsonLd.keywords = keywords; + + return jsonLd; +} + +/** + * Builds a keywords array by unioning biologically-meaningful fields from the + * dataset's aggregated activity/biosample/donor/file/library/diagnosis + * responses. + * @param data - AnVIL CMG dataset detail response. + * @returns Deduplicated keywords array. + */ +function buildKeywords(data: DatasetsResponse): string[] { + const values: (string | null | undefined)[] = []; + for (const activity of data.activities ?? []) { + values.push(...(activity.activity_type ?? [])); + values.push(...(activity.data_modality ?? [])); + } + for (const biosample of data.biosamples ?? []) { + values.push(...(biosample.anatomical_site ?? [])); + values.push(...(biosample.biosample_type ?? [])); + } + for (const donor of data.donors ?? []) { + values.push(...(donor.organism_type ?? [])); + values.push(...(donor.phenotypic_sex ?? [])); + values.push(...(donor.reported_ethnicity ?? [])); + } + for (const diagnosis of data.diagnoses ?? []) { + values.push(...(diagnosis.disease ?? [])); + } + for (const file of data.files ?? []) { + values.push(...(file.data_modality ?? [])); + values.push(...(file.file_format ?? [])); + } + for (const library of data.libraries ?? []) { + values.push(...(library.prep_material_name ?? [])); + } + return uniqueNonEmpty(values); +} + +/** + * Builds the sameAs array of external accession URLs. AnVIL datasets reference + * dbGaP study pages via their `registered_identifier` (phs accessions). + * @param dataset - AnVIL dataset entity. + * @returns Array of canonical dbGaP study URLs. + */ +function buildSameAs(dataset: DatasetEntity): string[] { + const urls: string[] = []; + for (const id of dataset.registered_identifier) { + if (!id) continue; + const trimmed = id.trim(); + if (!trimmed) continue; + urls.push(`${DBGAP_STUDY_URL}${trimmed}`); + } + return uniqueNonEmpty(urls); +} diff --git a/app/utils/schemaOrg/constants.ts b/app/utils/schemaOrg/constants.ts new file mode 100644 index 000000000..d3bfb45a7 --- /dev/null +++ b/app/utils/schemaOrg/constants.ts @@ -0,0 +1,14 @@ +/** + * Schema.org Dataset constants shared by consumer-specific JSON-LD builders. + */ + +/** + * Google Dataset Search description-length bounds. Descriptions outside this + * range may be rejected or downranked by Google's structured-data validator. + * + * See https://developers.google.com/search/docs/appearance/structured-data/dataset + */ +export const DESCRIPTION_LENGTH = { + MAX: 5000, + MIN: 50, +} as const; diff --git a/app/utils/schemaOrg/hcaProjectDataset.ts b/app/utils/schemaOrg/hcaProjectDataset.ts new file mode 100644 index 000000000..aa2bcd084 --- /dev/null +++ b/app/utils/schemaOrg/hcaProjectDataset.ts @@ -0,0 +1,24 @@ +import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; +import type { ProjectCatalogOptions } from "./projectDataset"; +import { buildProjectJsonLd } from "./projectDataset"; +import type { SchemaDataset } from "./types"; + +const CATALOG_NAME = "Human Cell Atlas Data Coordination Platform"; + +const OPTIONS: ProjectCatalogOptions = { + catalogName: CATALOG_NAME, + descriptionFallbackSuffix: `${CATALOG_NAME} project.`, +}; + +/** + * Builds a Schema.org Dataset JSON-LD object for an HCA DCP project. + * @param data - HCA DCP project detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. + */ +export function buildHcaProjectJsonLd( + data: ProjectsResponse, + browserURL: string +): SchemaDataset | undefined { + return buildProjectJsonLd(data, browserURL, OPTIONS); +} diff --git a/app/utils/schemaOrg/lungmapProjectDataset.ts b/app/utils/schemaOrg/lungmapProjectDataset.ts new file mode 100644 index 000000000..d07cba9d3 --- /dev/null +++ b/app/utils/schemaOrg/lungmapProjectDataset.ts @@ -0,0 +1,27 @@ +import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; +import type { ProjectCatalogOptions } from "./projectDataset"; +import { buildProjectJsonLd } from "./projectDataset"; +import type { SchemaDataset } from "./types"; + +const CATALOG_NAME = "LungMAP Data Explorer"; + +const OPTIONS: ProjectCatalogOptions = { + catalogName: CATALOG_NAME, + descriptionFallbackSuffix: `${CATALOG_NAME} project.`, +}; + +/** + * Builds a Schema.org Dataset JSON-LD object for a LungMAP project. LungMAP + * shares the HCA Azul backend, so the response shape matches HCA's + * `ProjectsResponse` and the shared `buildProjectJsonLd` core does the + * mapping; this wrapper just supplies LungMAP-specific catalog identity. + * @param data - LungMAP project detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. + */ +export function buildLungmapProjectJsonLd( + data: ProjectsResponse, + browserURL: string +): SchemaDataset | undefined { + return buildProjectJsonLd(data, browserURL, OPTIONS); +} diff --git a/app/utils/schemaOrg/projectDataset.ts b/app/utils/schemaOrg/projectDataset.ts new file mode 100644 index 000000000..8703a298e --- /dev/null +++ b/app/utils/schemaOrg/projectDataset.ts @@ -0,0 +1,224 @@ +/** + * Shared Schema.org Dataset builder for consumers that surface HCA-style + * `ProjectResponse` data (HCA DCP, LungMAP). Per-consumer files (e.g. + * `hcaProjectDataset.ts`, `lungmapProjectDataset.ts`) supply a + * `ProjectCatalogOptions` describing catalog identity and call + * `buildProjectJsonLd` to produce the JSON-LD payload. + */ + +import type { + AccessionResponse, + ContributorResponse, + PublicationResponse, +} from "../../apis/azul/hca-dcp/common/entities"; +import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; +import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; +import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; +import type { + SchemaDataset, + SchemaOrganization, + SchemaPerson, + SchemaScholarlyArticle, +} from "./types"; +import { buildDescription, uniqueNonEmpty } from "./utils"; + +/** + * Per-consumer catalog identity used to populate `includedInDataCatalog` and + * the description-padding fallback. Callers (e.g. HCA, LungMAP) supply this + * via thin wrappers so the shared builder stays consumer-agnostic. + */ +export interface ProjectCatalogOptions { + catalogName: string; + descriptionFallbackSuffix: string; +} + +/** + * Builds the citation array from project publications. Skips entries without a + * title. Prefers DOI for `sameAs`, falling back to the publication URL. + * @param publications - Project publications. + * @returns Array of schema.org ScholarlyArticle objects. + */ +function buildCitations( + publications: PublicationResponse[] +): SchemaScholarlyArticle[] { + const citations: SchemaScholarlyArticle[] = []; + for (const publication of publications ?? []) { + if (!publication.publicationTitle) continue; + const article: SchemaScholarlyArticle = { + "@type": "ScholarlyArticle", + headline: publication.publicationTitle, + name: publication.publicationTitle, + }; + if (publication.doi) { + article.sameAs = `https://doi.org/${publication.doi}`; + } else if (publication.publicationUrl) { + article.sameAs = publication.publicationUrl; + } + citations.push(article); + } + return citations; +} + +/** + * Builds the creator array from project contributors. Skips entries without a + * name. When the contributor has an institution, attaches it as an affiliation. + * @param contributors - Project contributors. + * @returns Array of schema.org Person objects. + */ +function buildCreators(contributors: ContributorResponse[]): SchemaPerson[] { + const creators: SchemaPerson[] = []; + for (const contributor of contributors ?? []) { + if (!contributor.contactName) continue; + const person: SchemaPerson = { + "@type": "Person", + name: normaliseContactName(contributor.contactName), + }; + if (contributor.institution) { + const affiliation: SchemaOrganization = { + "@type": "Organization", + name: contributor.institution, + }; + person.affiliation = affiliation; + } + creators.push(person); + } + return creators; +} + +/** + * Builds a keywords array by unioning biologically-meaningful fields from the + * project's aggregated donor/sample/specimen/protocol responses. + * @param data - Project detail response. + * @returns Deduplicated keywords array. + */ +function buildKeywords(data: ProjectsResponse): string[] { + const values: (string | null | undefined)[] = []; + for (const donor of data.donorOrganisms ?? []) { + values.push(...(donor.genusSpecies ?? [])); + values.push(...(donor.disease ?? [])); + } + for (const sample of data.samples ?? []) { + values.push(...(sample.organ ?? [])); + values.push(...(sample.organPart ?? [])); + values.push(...(sample.disease ?? [])); + values.push(...(sample.sampleEntityType ?? [])); + } + for (const specimen of data.specimens ?? []) { + values.push(...(specimen.organ ?? [])); + values.push(...(specimen.organPart ?? [])); + values.push(...(specimen.disease ?? [])); + } + for (const protocol of data.protocols ?? []) { + values.push(...(protocol.libraryConstructionApproach ?? [])); + values.push(...(protocol.instrumentManufacturerModel ?? [])); + } + return uniqueNonEmpty(values); +} + +/** + * Builds a Schema.org Dataset JSON-LD object from a project detail response. + * + * Returns `undefined` when the response does not carry a project we can + * describe, so the caller can skip rendering. + * @param data - Project detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @param options - Consumer-specific catalog identity. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. + */ +export function buildProjectJsonLd( + data: ProjectsResponse, + browserURL: string, + options: ProjectCatalogOptions +): SchemaDataset | undefined { + const project = data.projects?.[0]; + if (!project) return undefined; + + const name = project.projectTitle || project.projectShortname; + const description = buildDescription( + project.projectDescription, + name, + options.descriptionFallbackSuffix + ); + const identifier = uniqueNonEmpty([ + project.projectId, + ...project.accessions.flatMap((accession) => + splitAccessionIds(accession.accession) + ), + ]); + + const jsonLd: SchemaDataset = { + "@context": "https://schema.org", + "@type": "Dataset", + description, + identifier, + includedInDataCatalog: { + "@type": "DataCatalog", + name: options.catalogName, + url: browserURL, + }, + isAccessibleForFree: true, + name, + url: `${browserURL}/projects/${project.projectId}`, + }; + + const sameAs = buildSameAs(project.accessions); + if (sameAs.length > 0) jsonLd.sameAs = sameAs; + + const keywords = buildKeywords(data); + if (keywords.length > 0) jsonLd.keywords = keywords; + + const creator = buildCreators(project.contributors); + if (creator.length > 0) jsonLd.creator = creator; + + const citation = buildCitations(project.publications); + if (citation.length > 0) jsonLd.citation = citation; + + return jsonLd; +} + +/** + * Builds the sameAs array of external accession URLs via identifiers.org. + * Only includes accessions whose namespace maps to a known identifier prefix. + * @param accessions - Project accessions from the Azul response. + * @returns Array of canonical accession URLs. + */ +function buildSameAs(accessions: AccessionResponse[]): string[] { + const urls: string[] = []; + for (const { accession, namespace } of accessions) { + const prefix = + ACCESSION_CONFIGS_BY_RESPONSE_KEY.get(namespace)?.identifierOrgPrefix; + if (!prefix) continue; + for (const id of splitAccessionIds(accession)) { + const url = transformAccessionURL(id, prefix); + if (url) urls.push(url); + } + } + return uniqueNonEmpty(urls); +} + +/** + * Normalises an Azul contributor's contactName from "Last,First,Middle" to + * "First Middle Last" for use as a Schema.org Person.name value. + * @param contactName - Raw contactName from the Azul response. + * @returns Human-readable contributor name. + */ +function normaliseContactName(contactName: string): string { + const parts = contactName.split(",").map((part) => part.trim()); + if (parts.length < 2) return contactName; + const [last, ...rest] = parts; + return [...rest, last].filter(Boolean).join(" "); +} + +/** + * Splits an Azul accession string into individual accession IDs. Azul returns + * accessions as a semicolon-separated string when a project carries multiple + * IDs under the same namespace (mirrors the split done by `mapAccessions`). + * @param accession - Raw accession value from the Azul response. + * @returns Trimmed, non-empty accession IDs. + */ +function splitAccessionIds(accession: string): string[] { + return accession + .split(";") + .map((id) => id.trim()) + .filter(Boolean); +} diff --git a/app/utils/schemaOrg/types.ts b/app/utils/schemaOrg/types.ts new file mode 100644 index 000000000..19acb58e7 --- /dev/null +++ b/app/utils/schemaOrg/types.ts @@ -0,0 +1,74 @@ +/** + * Shared Schema.org Dataset types used by per-consumer JSON-LD builders + * (HCA DCP, AnVIL, LungMAP). Each consumer composes its own `SchemaDataset` + * from its source entity and renders it via the shared `JsonLd` component. + * + * See https://developers.google.com/search/docs/appearance/structured-data/dataset + * for Google's Dataset structured data guidelines. + */ + +/** + * Schema.org DataCatalog type. + */ +export interface SchemaDataCatalog { + "@type": "DataCatalog"; + name: string; + url: string; +} + +/** + * Schema.org DataDownload type. + */ +export interface SchemaDataDownload { + "@type": "DataDownload"; + contentUrl: string; + encodingFormat?: string; +} + +/** + * Schema.org Dataset JSON-LD structure. + */ +export interface SchemaDataset { + "@context": "https://schema.org"; + "@type": "Dataset"; + citation?: SchemaScholarlyArticle[]; + creator?: (SchemaPerson | SchemaOrganization)[]; + description: string; + distribution?: SchemaDataDownload[]; + identifier: string[]; + includedInDataCatalog: SchemaDataCatalog; + isAccessibleForFree: boolean; + keywords?: string[]; + measurementTechnique?: string[]; + name: string; + sameAs?: string[]; + url: string; +} + +/** + * Schema.org Organization type. + */ +export interface SchemaOrganization { + "@type": "Organization"; + name: string; +} + +/** + * Schema.org Person type. + */ +export interface SchemaPerson { + "@type": "Person"; + affiliation?: SchemaOrganization; + name: string; +} + +/** + * Schema.org ScholarlyArticle type. + */ +export interface SchemaScholarlyArticle { + "@type": "ScholarlyArticle"; + author?: SchemaPerson[]; + headline: string; + name: string; + sameAs?: string; +} diff --git a/app/utils/schemaOrg/utils.ts b/app/utils/schemaOrg/utils.ts new file mode 100644 index 000000000..d29358c49 --- /dev/null +++ b/app/utils/schemaOrg/utils.ts @@ -0,0 +1,86 @@ +import { DESCRIPTION_LENGTH } from "./constants"; + +/** + * Builds a Schema.org description string from a raw entity description, padding + * short or empty values with the entity name and a caller-supplied fallback + * suffix so the result satisfies Google's minimum description-length + * requirement (50 chars). + * @param sourceDescription - Raw description (may contain HTML, may be empty). + * @param name - Entity name used in the padded fallback. + * @param fallbackSuffix - Caller-owned suffix (e.g. catalog + entity kind) used + * to reliably push padded descriptions past the 50-character minimum. The + * caller controls phrasing and punctuation; the helper does not add a period. + * @returns HTML-stripped description, padded when short, truncated when long. + */ +export function buildDescription( + sourceDescription: string, + name: string, + fallbackSuffix: string +): string { + const stripped = stripHtmlTags(sourceDescription || ""); + if (stripped.length >= DESCRIPTION_LENGTH.MIN) { + return truncateDescription(stripped); + } + const padded = stripped + ? `${name} — ${stripped} — ${fallbackSuffix}` + : `${name} — ${fallbackSuffix}`; + return truncateDescription(padded); +} + +/** + * Escapes a JSON string for safe embedding inside an HTML `` or HTML entity injection. + * @param json - Serialised JSON to embed. + * @returns Escaped JSON safe for `dangerouslySetInnerHTML`. + */ +export function escapeJsonForHtml(json: string): string { + return json + .replace(//g, "\\u003e") + .replace(/&/g, "\\u0026"); +} + +/** + * Strips HTML tags and collapses whitespace from a description string. + * @param value - Source description (may contain HTML). + * @returns Plain text suitable for embedding in JSON-LD. + */ +export function stripHtmlTags(value: string): string { + return ( + value + // eslint-disable-next-line sonarjs/slow-regex -- `[^>]+` is bounded by the next `>` or end-of-input; no nested quantifiers or alternation, so backtracking is linear. + .replace(/<[^>]+>/g, " ") + .replace(/\s+/g, " ") + .trim() + ); +} + +/** + * Truncates a description to the Google Dataset Search maximum, appending an + * ellipsis when truncation occurs. + * @param description - Plain text description. + * @returns Truncated description. + */ +export function truncateDescription(description: string): string { + if (description.length <= DESCRIPTION_LENGTH.MAX) return description; + return description.slice(0, DESCRIPTION_LENGTH.MAX - 1) + "…"; +} + +/** + * De-duplicates and removes empty/null/undefined entries from a string array. + * @param values - Source array (may contain null, undefined, or duplicates). + * @returns Deduplicated array of non-empty strings, preserving first-seen order. + */ +export function uniqueNonEmpty( + values: (string | null | undefined)[] +): string[] { + const seen = new Set(); + const result: string[] = []; + for (const value of values) { + if (!value) continue; + if (seen.has(value)) continue; + seen.add(value); + result.push(value); + } + return result; +} diff --git a/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts b/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts index d6f50837c..7f5c4925e 100644 --- a/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts +++ b/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts @@ -72,7 +72,7 @@ export function mapAccessions( * @param identifierOrgPrefix - Identifier org prefix. * @returns formatted accession URL. */ -function transformAccessionURL( +export function transformAccessionURL( accessionId: string, identifierOrgPrefix: string ): string { diff --git a/app/views/EntityDetailView/components/JsonLd/jsonLd.tsx b/app/views/EntityDetailView/components/JsonLd/jsonLd.tsx new file mode 100644 index 000000000..adae845ac --- /dev/null +++ b/app/views/EntityDetailView/components/JsonLd/jsonLd.tsx @@ -0,0 +1,31 @@ +import Head from "next/head"; +import { JSX } from "react"; +import type { SchemaDataset } from "../../../../utils/schemaOrg/types"; +import { escapeJsonForHtml } from "../../../../utils/schemaOrg/utils"; + +interface JsonLdProps { + jsonLd: SchemaDataset; +} + +/** + * Renders a Schema.org Dataset JSON-LD `` sequences in entity descriptions). + * @param props - Component props. + * @param props.jsonLd - Schema.org Dataset payload built by a consumer-specific builder. + * @returns Head element with the JSON-LD script tag. + */ +export const JsonLd = ({ jsonLd }: JsonLdProps): JSX.Element => { + return ( + +