diff --git a/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts b/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts new file mode 100644 index 000000000..64d5dde9a --- /dev/null +++ b/__tests__/utils/schemaOrg/hcaProjectDataset.test.ts @@ -0,0 +1,289 @@ +import type { ProjectsResponse } from "../../../app/apis/azul/hca-dcp/common/responses"; +import { DESCRIPTION_LENGTH } from "../../../app/utils/schemaOrg/constants"; +import { buildHcaProjectJsonLd } from "../../../app/utils/schemaOrg/hcaProjectDataset"; + +const BROWSER_URL = "https://explore.data.humancellatlas.org"; + +/** + * Builds a minimal valid HCA project response with optional overrides for + * top-level (project) and aggregated (donor/sample/specimen/protocol) fields. + * @param overrides - Partial overrides applied to the base response. + * @returns A `ProjectsResponse` shape suitable for builder tests. + */ +function makeProjectsResponse( + overrides: Partial<ProjectsResponse> = {} +): ProjectsResponse { + return { + dates: [], + donorOrganisms: [], + entryId: "abc", + fileTypeSummaries: [], + projects: [ + { + accessible: true, + accessions: [], + bionetworkName: [], + contributedAnalyses: {}, + contributors: [], + dataUseRestriction: null, + duosId: null, + estimatedCellCount: null, + laboratory: [], + matrices: {}, + projectDescription: + "A study of cells across multiple human individuals examining inter-individual variation in gene expression.", + projectId: "uuid-1", + projectShortname: "Cell Study", + projectTitle: "Cells of the body", + publications: [], + supplementaryLinks: [], + tissueAtlas: [], + }, + ], + protocols: [], + samples: [], + specimens: [], + status: 200, + ...overrides, + } as unknown as ProjectsResponse; +} + +describe("buildHcaProjectJsonLd", () => { + it("returns undefined when no project is present", () => { + const response = makeProjectsResponse({ projects: [] }); + expect(buildHcaProjectJsonLd(response, BROWSER_URL)).toBeUndefined(); + }); + + it("populates required Schema.org Dataset 
fields", () => { + const result = buildHcaProjectJsonLd(makeProjectsResponse(), BROWSER_URL); + expect(result).toBeDefined(); + expect(result!["@context"]).toBe("https://schema.org"); + expect(result!["@type"]).toBe("Dataset"); + expect(result!.name).toBe("Cells of the body"); + expect(result!.description).toBe( + "A study of cells across multiple human individuals examining inter-individual variation in gene expression." + ); + expect(result!.url).toBe(`${BROWSER_URL}/projects/uuid-1`); + expect(result!.identifier).toEqual(["uuid-1"]); + expect(result!.isAccessibleForFree).toBe(true); + expect(result!.includedInDataCatalog).toEqual({ + "@type": "DataCatalog", + name: "Human Cell Atlas Data Coordination Platform", + url: BROWSER_URL, + }); + }); + + it("falls back to projectShortname when projectTitle is empty", () => { + const response = makeProjectsResponse(); + response.projects[0].projectTitle = ""; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.name).toBe("Cell Study"); + }); + + it("strips HTML tags from description", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = + "

<p>Single-cell RNA-seq data across many cells and donors and tissues.</p>

"; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Single-cell RNA-seq data across many cells and donors and tissues." + ); + }); + + it("pads short descriptions with name and catalog context to meet the 50-char minimum", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = "Short."; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Cells of the body — Short. — Human Cell Atlas Data Coordination Platform project." + ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("falls back to project name plus catalog context when description is empty", () => { + const response = makeProjectsResponse(); + response.projects[0].projectDescription = ""; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Cells of the body — Human Cell Atlas Data Coordination Platform project." 
+ ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("truncates descriptions over 5000 characters and appends an ellipsis", () => { + const longDescription = "a".repeat(DESCRIPTION_LENGTH.MAX + 200); + const response = makeProjectsResponse(); + response.projects[0].projectDescription = longDescription; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.description).toHaveLength(DESCRIPTION_LENGTH.MAX); + expect(result!.description.endsWith("…")).toBe(true); + }); + + it("includes accession ids in identifier and identifiers.org URLs in sameAs", () => { + const response = makeProjectsResponse(); + response.projects[0].accessions = [ + { accession: "GSE12345", namespace: "geo_series" }, + { accession: "PRJNA9999", namespace: "insdc_project" }, + { accession: "X", namespace: "unknown_namespace" }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.identifier).toEqual([ + "uuid-1", + "GSE12345", + "PRJNA9999", + "X", + ]); + expect(result!.sameAs).toEqual([ + "https://identifiers.org/geo:GSE12345", + "https://identifiers.org/ena.embl:PRJNA9999", + ]); + }); + + it("splits semicolon-separated accession ids in both identifier and sameAs", () => { + const response = makeProjectsResponse(); + response.projects[0].accessions = [ + { accession: "GSE12345; GSE67890", namespace: "geo_series" }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.identifier).toEqual(["uuid-1", "GSE12345", "GSE67890"]); + expect(result!.sameAs).toEqual([ + "https://identifiers.org/geo:GSE12345", + "https://identifiers.org/geo:GSE67890", + ]); + }); + + it("omits sameAs when no accessions map to a known namespace", () => { + const result = buildHcaProjectJsonLd(makeProjectsResponse(), BROWSER_URL); + expect(result!.sameAs).toBeUndefined(); + }); + + it("builds creators from contributors with affiliation", () => { + const response = 
makeProjectsResponse(); + response.projects[0].contributors = [ + { + contactName: "Smith,Alice,B", + email: null, + institution: "Example University", + }, + { + contactName: "", + email: null, + institution: "Should be skipped (no name)", + }, + { + contactName: "Jones,Bob", + email: null, + institution: "", + }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.creator).toEqual([ + { + "@type": "Person", + affiliation: { "@type": "Organization", name: "Example University" }, + name: "Alice B Smith", + }, + { "@type": "Person", name: "Bob Jones" }, + ]); + }); + + it("builds citations from publications using DOI then publicationUrl as sameAs", () => { + const response = makeProjectsResponse(); + response.projects[0].publications = [ + { + doi: "10.1000/example", + officialHcaPublication: true, + publicationTitle: "Cell Paper", + publicationUrl: "https://example.org/cell-paper", + }, + { + doi: null, + officialHcaPublication: false, + publicationTitle: "Other Paper", + publicationUrl: "https://example.org/other", + }, + { + doi: null, + officialHcaPublication: false, + publicationTitle: "", + publicationUrl: "https://example.org/no-title", + }, + ]; + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.citation).toEqual([ + { + "@type": "ScholarlyArticle", + headline: "Cell Paper", + name: "Cell Paper", + sameAs: "https://doi.org/10.1000/example", + }, + { + "@type": "ScholarlyArticle", + headline: "Other Paper", + name: "Other Paper", + sameAs: "https://example.org/other", + }, + ]); + }); + + it("builds deduplicated keywords from donor, sample, specimen, and protocol fields", () => { + const response = makeProjectsResponse({ + donorOrganisms: [ + { + biologicalSex: null, + developmentStage: [], + disease: ["normal"], + donorCount: 1, + genusSpecies: ["Homo sapiens"], + organismAge: null, + }, + ], + protocols: [ + { + libraryConstructionApproach: ["10x v2", "10x v3"], + }, + ], + samples: [ + { + 
disease: ["normal"], + id: ["s1"], + organ: ["brain"], + organPart: ["cortex"], + sampleEntityType: ["specimens"], + }, + ], + specimens: [ + { + disease: ["normal"], + id: ["s1"], + organ: ["brain"], + organPart: ["cortex"], + preservationMethod: [], + source: [], + }, + ], + } as unknown as Partial<ProjectsResponse>); + const result = buildHcaProjectJsonLd(response, BROWSER_URL); + expect(result!.keywords).toEqual([ + "Homo sapiens", + "normal", + "brain", + "cortex", + "specimens", + "10x v2", + "10x v3", + ]); + }); + + it("omits keywords, creator, citation, sameAs when sources are empty", () => { + const result = buildHcaProjectJsonLd(makeProjectsResponse(), BROWSER_URL); + expect(result!.keywords).toBeUndefined(); + expect(result!.creator).toBeUndefined(); + expect(result!.citation).toBeUndefined(); + expect(result!.sameAs).toBeUndefined(); + }); +}); diff --git a/app/utils/schemaOrg/constants.ts b/app/utils/schemaOrg/constants.ts new file mode 100644 index 000000000..d3bfb45a7 --- /dev/null +++ b/app/utils/schemaOrg/constants.ts @@ -0,0 +1,14 @@ +/** + * Schema.org Dataset constants shared by consumer-specific JSON-LD builders. + */ + +/** + * Google Dataset Search description-length bounds. Descriptions outside this + * range may be rejected or downranked by Google's structured-data validator. 
+ * + * See https://developers.google.com/search/docs/appearance/structured-data/dataset + */ +export const DESCRIPTION_LENGTH = { + MAX: 5000, + MIN: 50, +} as const; diff --git a/app/utils/schemaOrg/hcaProjectDataset.ts b/app/utils/schemaOrg/hcaProjectDataset.ts new file mode 100644 index 000000000..e4c7ea7a0 --- /dev/null +++ b/app/utils/schemaOrg/hcaProjectDataset.ts @@ -0,0 +1,224 @@ +import type { + AccessionResponse, + ContributorResponse, + PublicationResponse, +} from "../../apis/azul/hca-dcp/common/entities"; +import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; +import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; +import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; +import { DESCRIPTION_LENGTH } from "./constants"; +import type { + SchemaDataset, + SchemaOrganization, + SchemaPerson, + SchemaScholarlyArticle, +} from "./types"; +import { stripHtmlTags, truncateDescription, uniqueNonEmpty } from "./utils"; + +const CATALOG_NAME = "Human Cell Atlas Data Coordination Platform"; + +/** + * Builds the citation array from project publications. Skips entries without a + * title. Prefers DOI for `sameAs`, falling back to the publication URL. + * @param publications - HCA project publications. + * @returns Array of schema.org ScholarlyArticle objects. + */ +function buildCitations( + publications: PublicationResponse[] +): SchemaScholarlyArticle[] { + const citations: SchemaScholarlyArticle[] = []; + for (const publication of publications ?? 
[]) { + if (!publication.publicationTitle) continue; + const article: SchemaScholarlyArticle = { + "@type": "ScholarlyArticle", + headline: publication.publicationTitle, + name: publication.publicationTitle, + }; + if (publication.doi) { + article.sameAs = `https://doi.org/${publication.doi}`; + } else if (publication.publicationUrl) { + article.sameAs = publication.publicationUrl; + } + citations.push(article); + } + return citations; +} + +/** + * Builds the Schema.org description for a project, padding short or empty + * source descriptions with the project name and catalog context so the result + * satisfies Google's minimum description-length requirement (50 chars). + * @param sourceDescription - Raw projectDescription from the Azul response. + * @param name - Project name used as a padding fallback. + * @returns HTML-stripped description, padded if short, truncated if long. + */ +function buildDescription(sourceDescription: string, name: string): string { + const stripped = stripHtmlTags(sourceDescription || ""); + if (stripped.length >= DESCRIPTION_LENGTH.MIN) { + return truncateDescription(stripped); + } + // Padding includes the catalog name (~43 chars) to reliably push the + // result past the 50-char minimum even when name + stripped are short. + const padded = stripped + ? `${name} — ${stripped} — ${CATALOG_NAME} project.` + : `${name} — ${CATALOG_NAME} project.`; + return truncateDescription(padded); +} + +/** + * Builds the creator array from project contributors. Skips entries without a + * name. When the contributor has an institution, attaches it as an affiliation. + * @param contributors - HCA project contributors. + * @returns Array of schema.org Person objects. + */ +function buildCreators(contributors: ContributorResponse[]): SchemaPerson[] { + const creators: SchemaPerson[] = []; + for (const contributor of contributors ?? 
[]) { + if (!contributor.contactName) continue; + const person: SchemaPerson = { + "@type": "Person", + name: normaliseContactName(contributor.contactName), + }; + if (contributor.institution) { + const affiliation: SchemaOrganization = { + "@type": "Organization", + name: contributor.institution, + }; + person.affiliation = affiliation; + } + creators.push(person); + } + return creators; +} + +/** + * Builds a Schema.org Dataset JSON-LD object for an HCA DCP project. + * + * Returns `undefined` when the response does not carry a project we can + * describe (i.e. no project entity), so the caller can skip rendering. + * @param data - HCA DCP project detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. + */ +export function buildHcaProjectJsonLd( + data: ProjectsResponse, + browserURL: string +): SchemaDataset | undefined { + const project = data.projects?.[0]; + if (!project) return undefined; + + const name = project.projectTitle || project.projectShortname; + const description = buildDescription(project.projectDescription, name); + const identifier = uniqueNonEmpty([ + project.projectId, + ...project.accessions.flatMap((accession) => + splitAccessionIds(accession.accession) + ), + ]); + + const jsonLd: SchemaDataset = { + "@context": "https://schema.org", + "@type": "Dataset", + description, + identifier, + includedInDataCatalog: { + "@type": "DataCatalog", + name: CATALOG_NAME, + url: browserURL, + }, + isAccessibleForFree: true, + name, + url: `${browserURL}/projects/${project.projectId}`, + }; + + const sameAs = buildSameAs(project.accessions); + if (sameAs.length > 0) jsonLd.sameAs = sameAs; + + const keywords = buildKeywords(data); + if (keywords.length > 0) jsonLd.keywords = keywords; + + const creator = buildCreators(project.contributors); + if (creator.length > 0) jsonLd.creator = creator; + + const citation = 
buildCitations(project.publications); + if (citation.length > 0) jsonLd.citation = citation; + + return jsonLd; +} + +/** + * Builds a keywords array by unioning biologically-meaningful fields from the + * project's aggregated donor/sample/specimen/protocol responses. + * @param data - HCA project detail response. + * @returns Deduplicated keywords array. + */ +function buildKeywords(data: ProjectsResponse): string[] { + const values: (string | null | undefined)[] = []; + for (const donor of data.donorOrganisms ?? []) { + values.push(...(donor.genusSpecies ?? [])); + values.push(...(donor.disease ?? [])); + } + for (const sample of data.samples ?? []) { + values.push(...(sample.organ ?? [])); + values.push(...(sample.organPart ?? [])); + values.push(...(sample.disease ?? [])); + values.push(...(sample.sampleEntityType ?? [])); + } + for (const specimen of data.specimens ?? []) { + values.push(...(specimen.organ ?? [])); + values.push(...(specimen.organPart ?? [])); + values.push(...(specimen.disease ?? [])); + } + for (const protocol of data.protocols ?? []) { + values.push(...(protocol.libraryConstructionApproach ?? [])); + values.push(...(protocol.instrumentManufacturerModel ?? [])); + } + return uniqueNonEmpty(values); +} + +/** + * Builds the sameAs array of external accession URLs via identifiers.org. + * Only includes accessions whose namespace maps to a known identifier prefix. + * @param accessions - Project accessions from the Azul response. + * @returns Array of canonical accession URLs. 
+ */ +function buildSameAs(accessions: AccessionResponse[]): string[] { + const urls: string[] = []; + for (const { accession, namespace } of accessions) { + const prefix = + ACCESSION_CONFIGS_BY_RESPONSE_KEY.get(namespace)?.identifierOrgPrefix; + if (!prefix) continue; + for (const id of splitAccessionIds(accession)) { + const url = transformAccessionURL(id, prefix); + if (url) urls.push(url); + } + } + return uniqueNonEmpty(urls); +} + +/** + * Normalises an HCA contributor's contactName from "Last,First,Middle" to + * "First Middle Last" for use as a Schema.org Person.name value. + * @param contactName - Raw contactName from the Azul response. + * @returns Human-readable contributor name. + */ +function normaliseContactName(contactName: string): string { + const parts = contactName.split(",").map((part) => part.trim()); + if (parts.length < 2) return contactName; + const [last, ...rest] = parts; + return [...rest, last].filter(Boolean).join(" "); +} + +/** + * Splits an Azul accession string into individual accession IDs. Azul returns + * accessions as a semicolon-separated string when a project carries multiple + * IDs under the same namespace (mirrors the split done by `mapAccessions`). + * @param accession - Raw accession value from the Azul response. + * @returns Trimmed, non-empty accession IDs. + */ +function splitAccessionIds(accession: string): string[] { + return accession + .split(";") + .map((id) => id.trim()) + .filter(Boolean); +} diff --git a/app/utils/schemaOrg/types.ts b/app/utils/schemaOrg/types.ts new file mode 100644 index 000000000..19acb58e7 --- /dev/null +++ b/app/utils/schemaOrg/types.ts @@ -0,0 +1,74 @@ +/** + * Shared Schema.org Dataset types used by per-consumer JSON-LD builders + * (HCA DCP, AnVIL, LungMAP). Each consumer composes its own `SchemaDataset` + * from its source entity and renders it via the shared `JsonLd` component. 
+ * + * See https://developers.google.com/search/docs/appearance/structured-data/dataset + * for Google's Dataset structured data guidelines. + */ + +/** + * Schema.org DataCatalog type. + */ +export interface SchemaDataCatalog { + "@type": "DataCatalog"; + name: string; + url: string; +} + +/** + * Schema.org DataDownload type. + */ +export interface SchemaDataDownload { + "@type": "DataDownload"; + contentUrl: string; + encodingFormat?: string; +} + +/** + * Schema.org Dataset JSON-LD structure. + */ +export interface SchemaDataset { + "@context": "https://schema.org"; + "@type": "Dataset"; + citation?: SchemaScholarlyArticle[]; + creator?: (SchemaPerson | SchemaOrganization)[]; + description: string; + distribution?: SchemaDataDownload[]; + identifier: string[]; + includedInDataCatalog: SchemaDataCatalog; + isAccessibleForFree: boolean; + keywords?: string[]; + measurementTechnique?: string[]; + name: string; + sameAs?: string[]; + url: string; +} + +/** + * Schema.org Organization type. + */ +export interface SchemaOrganization { + "@type": "Organization"; + name: string; +} + +/** + * Schema.org Person type. + */ +export interface SchemaPerson { + "@type": "Person"; + affiliation?: SchemaOrganization; + name: string; +} + +/** + * Schema.org ScholarlyArticle type. + */ +export interface SchemaScholarlyArticle { + "@type": "ScholarlyArticle"; + author?: SchemaPerson[]; + headline: string; + name: string; + sameAs?: string; +} diff --git a/app/utils/schemaOrg/utils.ts b/app/utils/schemaOrg/utils.ts new file mode 100644 index 000000000..c7d0849dc --- /dev/null +++ b/app/utils/schemaOrg/utils.ts @@ -0,0 +1,59 @@ +import { DESCRIPTION_LENGTH } from "./constants"; + +/** + * Escapes a JSON string for safe embedding inside an HTML `<script>` tag, + * preventing `</script>` or HTML entity injection. + * @param json - Serialised JSON to embed. + * @returns Escaped JSON safe for `dangerouslySetInnerHTML`. 
+ */ +export function escapeJsonForHtml(json: string): string { + return json + .replace(/</g, "\\u003c") + .replace(/>/g, "\\u003e") + .replace(/&/g, "\\u0026"); +} + +/** + * Strips HTML tags and collapses whitespace from a description string. + * @param value - Source description (may contain HTML). + * @returns Plain text suitable for embedding in JSON-LD. + */ +export function stripHtmlTags(value: string): string { + return ( + value + // eslint-disable-next-line sonarjs/slow-regex -- `[^>]+` is bounded by the next `>` or end-of-input; no nested quantifiers or alternation, so backtracking is linear. + .replace(/<[^>]+>/g, " ") + .replace(/\s+/g, " ") + .trim() + ); +} + +/** + * Truncates a description to the Google Dataset Search maximum, appending an + * ellipsis when truncation occurs. + * @param description - Plain text description. + * @returns Truncated description. + */ +export function truncateDescription(description: string): string { + if (description.length <= DESCRIPTION_LENGTH.MAX) return description; + return description.slice(0, DESCRIPTION_LENGTH.MAX - 1) + "…"; +} + +/** + * De-duplicates and removes empty/null/undefined entries from a string array. + * @param values - Source array (may contain null, undefined, or duplicates). + * @returns Deduplicated array of non-empty strings, preserving first-seen order. 
+ */ +export function uniqueNonEmpty( + values: (string | null | undefined)[] +): string[] { + const seen = new Set<string>(); + const result: string[] = []; + for (const value of values) { + if (!value) continue; + if (seen.has(value)) continue; + seen.add(value); + result.push(value); + } + return result; +} diff --git a/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts b/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts index d6f50837c..7f5c4925e 100644 --- a/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts +++ b/app/viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper.ts @@ -72,7 +72,7 @@ export function mapAccessions( * @param identifierOrgPrefix - Identifier org prefix. * @returns formatted accession URL. */ -function transformAccessionURL( +export function transformAccessionURL( accessionId: string, identifierOrgPrefix: string ): string { diff --git a/app/views/EntityDetailView/components/JsonLd/jsonLd.tsx b/app/views/EntityDetailView/components/JsonLd/jsonLd.tsx new file mode 100644 index 000000000..adae845ac --- /dev/null +++ b/app/views/EntityDetailView/components/JsonLd/jsonLd.tsx @@ -0,0 +1,31 @@ +import Head from "next/head"; +import { JSX } from "react"; +import type { SchemaDataset } from "../../../../utils/schemaOrg/types"; +import { escapeJsonForHtml } from "../../../../utils/schemaOrg/utils"; + +interface JsonLdProps { + jsonLd: SchemaDataset; +} + +/** + * Renders a Schema.org Dataset JSON-LD `<script>` tag inside the document head. + * The serialised payload is escaped so it cannot break out of the script + * element (e.g. via `</script>` sequences in entity descriptions). + * @param props - Component props. + * @param props.jsonLd - Schema.org Dataset payload built by a consumer-specific builder. + * @returns Head element with the JSON-LD script tag. + */ +export const JsonLd = ({ jsonLd }: JsonLdProps): JSX.Element => { + return ( + 