From d2aa49efe060a24f0ba50f4694d2777097aba898 Mon Sep 17 00:00:00 2001 From: Fran McDade <18710366+frano-m@users.noreply.github.com> Date: Wed, 13 May 2026 22:07:06 +1000 Subject: [PATCH 1/2] feat: [anvil dx] add anvil datasets to google datasets catalog (#4807) --- .../utils/schemaOrg/anvilDataset.test.ts | 182 ++++++++++++++++++ app/utils/schemaOrg/anvilDataset.ts | 113 +++++++++++ app/utils/schemaOrg/hcaProjectDataset.ts | 31 +-- app/utils/schemaOrg/utils.ts | 27 +++ pages/[entityListType]/[...params].tsx | 28 +-- 5 files changed, 345 insertions(+), 36 deletions(-) create mode 100644 __tests__/utils/schemaOrg/anvilDataset.test.ts create mode 100644 app/utils/schemaOrg/anvilDataset.ts diff --git a/__tests__/utils/schemaOrg/anvilDataset.test.ts b/__tests__/utils/schemaOrg/anvilDataset.test.ts new file mode 100644 index 000000000..26ca01b67 --- /dev/null +++ b/__tests__/utils/schemaOrg/anvilDataset.test.ts @@ -0,0 +1,182 @@ +import type { DatasetsResponse } from "../../../app/apis/azul/anvil-cmg/common/responses"; +import { buildAnvilDatasetJsonLd } from "../../../app/utils/schemaOrg/anvilDataset"; +import { DESCRIPTION_LENGTH } from "../../../app/utils/schemaOrg/constants"; + +const BROWSER_URL = "https://explore.anvilproject.org"; + +/** + * Builds a minimal valid AnVIL datasets response with optional overrides for + * top-level (dataset) and aggregated (activity/biosample/donor/file/library/ + * diagnosis) fields. + * @param overrides - Partial overrides applied to the base response. + * @returns A `DatasetsResponse` shape suitable for builder tests. 
+ */ +function makeDatasetsResponse( + overrides: Partial = {} +): DatasetsResponse { + return { + activities: [], + biosamples: [], + datasets: [ + { + accessible: true, + consent_group: ["NRES"], + dataset_id: "uuid-1", + description: + "A multi-cohort study of rare disease across many donors and biosamples.", + duos_id: null, + registered_identifier: [], + title: "Rare disease dataset", + }, + ], + diagnoses: [], + donors: [], + entryId: "abc", + files: [], + libraries: [], + status: 200, + ...overrides, + } as unknown as DatasetsResponse; +} + +describe("buildAnvilDatasetJsonLd", () => { + it("returns undefined when no dataset is present", () => { + const response = makeDatasetsResponse({ datasets: [] }); + expect(buildAnvilDatasetJsonLd(response, BROWSER_URL)).toBeUndefined(); + }); + + it("populates required Schema.org Dataset fields", () => { + const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL); + expect(result).toBeDefined(); + expect(result!["@context"]).toBe("https://schema.org"); + expect(result!["@type"]).toBe("Dataset"); + expect(result!.name).toBe("Rare disease dataset"); + expect(result!.description).toBe( + "A multi-cohort study of rare disease across many donors and biosamples." + ); + expect(result!.url).toBe(`${BROWSER_URL}/datasets/uuid-1`); + expect(result!.identifier).toEqual(["uuid-1"]); + expect(result!.isAccessibleForFree).toBe(true); + expect(result!.includedInDataCatalog).toEqual({ + "@type": "DataCatalog", + name: "AnVIL Data Explorer", + url: BROWSER_URL, + }); + }); + + it("falls back to dataset_id when title is empty", () => { + const response = makeDatasetsResponse(); + response.datasets[0].title = ""; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.name).toBe("uuid-1"); + }); + + it("strips HTML tags from description", () => { + const response = makeDatasetsResponse(); + response.datasets[0].description = + "

Single-cell RNA-seq data across many donors and tissues.

"; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Single-cell RNA-seq data across many donors and tissues." + ); + }); + + it("truncates descriptions over 5000 characters and appends an ellipsis", () => { + const longDescription = "a".repeat(DESCRIPTION_LENGTH.MAX + 200); + const response = makeDatasetsResponse(); + response.datasets[0].description = longDescription; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toHaveLength(DESCRIPTION_LENGTH.MAX); + expect(result!.description.endsWith("…")).toBe(true); + }); + + it("pads short descriptions with name and catalog context to meet the 50-char minimum", () => { + const response = makeDatasetsResponse(); + response.datasets[0].description = "Short."; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Rare disease dataset — Short. — AnVIL Data Explorer dataset." + ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("falls back to dataset name plus catalog context when description is missing", () => { + const response = makeDatasetsResponse(); + response.datasets[0].description = undefined; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.description).toBe( + "Rare disease dataset — AnVIL Data Explorer dataset." 
+ ); + expect(result!.description.length).toBeGreaterThanOrEqual( + DESCRIPTION_LENGTH.MIN + ); + }); + + it("includes registered_identifier values in identifier and dbGaP study URLs in sameAs", () => { + const response = makeDatasetsResponse(); + response.datasets[0].registered_identifier = [ + "phs000123", + "phs000456", + null, + ]; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.identifier).toEqual(["uuid-1", "phs000123", "phs000456"]); + expect(result!.sameAs).toEqual([ + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000123", + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000456", + ]); + }); + + it("omits sameAs when there are no registered identifiers", () => { + const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL); + expect(result!.sameAs).toBeUndefined(); + }); + + it("builds deduplicated keywords from activity, biosample, donor, diagnosis, file, and library fields", () => { + const response = makeDatasetsResponse({ + activities: [ + { activity_type: ["sequencing"], data_modality: ["genomic"] }, + ], + biosamples: [{ anatomical_site: ["brain"], biosample_type: ["tissue"] }], + diagnoses: [{ disease: ["epilepsy"], phenotype: [] }], + donors: [ + { + organism_type: ["Homo sapiens"], + phenotypic_sex: ["female"], + reported_ethnicity: ["asian"], + }, + ], + files: [ + { + data_modality: ["genomic"], + file_format: ["fastq", "bam"], + file_id: "f1", + file_type: "sequencing", + }, + ], + libraries: [{ prep_material_name: ["DNA"] }], + } as unknown as Partial); + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.keywords).toEqual([ + "sequencing", + "genomic", + "brain", + "tissue", + "Homo sapiens", + "female", + "asian", + "epilepsy", + "fastq", + "bam", + "DNA", + ]); + }); + + it("omits keywords and sameAs when sources are empty", () => { + const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL); + 
expect(result!.keywords).toBeUndefined(); + expect(result!.sameAs).toBeUndefined(); + }); +}); diff --git a/app/utils/schemaOrg/anvilDataset.ts b/app/utils/schemaOrg/anvilDataset.ts new file mode 100644 index 000000000..520986327 --- /dev/null +++ b/app/utils/schemaOrg/anvilDataset.ts @@ -0,0 +1,113 @@ +import type { DatasetEntity } from "../../apis/azul/anvil-cmg/common/entities"; +import type { DatasetsResponse } from "../../apis/azul/anvil-cmg/common/responses"; +import type { SchemaDataset } from "./types"; +import { buildDescription, uniqueNonEmpty } from "./utils"; + +const CATALOG_NAME = "AnVIL Data Explorer"; +const DESCRIPTION_FALLBACK_SUFFIX = `${CATALOG_NAME} dataset.`; +const DBGAP_STUDY_URL = + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="; + +/** + * Builds a Schema.org Dataset JSON-LD object for an AnVIL CMG dataset. + * + * Returns `undefined` when the response does not carry a dataset we can + * describe (i.e. no dataset entity), so the caller can skip rendering. + * @param data - AnVIL CMG dataset detail response from Azul. + * @param browserURL - Site base URL used for canonical and catalog URLs. + * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. 
+ */ +export function buildAnvilDatasetJsonLd( + data: DatasetsResponse, + browserURL: string +): SchemaDataset | undefined { + const dataset = data.datasets?.[0]; + if (!dataset) return undefined; + + const name = dataset.title || dataset.dataset_id; + const description = buildDescription( + dataset.description || "", + name, + DESCRIPTION_FALLBACK_SUFFIX + ); + const identifier = uniqueNonEmpty([ + dataset.dataset_id, + ...dataset.registered_identifier, + ]); + + const jsonLd: SchemaDataset = { + "@context": "https://schema.org", + "@type": "Dataset", + description, + identifier, + includedInDataCatalog: { + "@type": "DataCatalog", + name: CATALOG_NAME, + url: browserURL, + }, + // Google's `isAccessibleForFree` is the inverse of "paid", not "unrestricted" — dbGaP gating doesn't change the value. + isAccessibleForFree: true, + name, + url: `${browserURL}/datasets/${dataset.dataset_id}`, + }; + + const sameAs = buildSameAs(dataset); + if (sameAs.length > 0) jsonLd.sameAs = sameAs; + + const keywords = buildKeywords(data); + if (keywords.length > 0) jsonLd.keywords = keywords; + + return jsonLd; +} + +/** + * Builds a keywords array by unioning biologically-meaningful fields from the + * dataset's aggregated activity/biosample/donor/file/library/diagnosis + * responses. + * @param data - AnVIL CMG dataset detail response. + * @returns Deduplicated keywords array. + */ +function buildKeywords(data: DatasetsResponse): string[] { + const values: (string | null | undefined)[] = []; + for (const activity of data.activities ?? []) { + values.push(...(activity.activity_type ?? [])); + values.push(...(activity.data_modality ?? [])); + } + for (const biosample of data.biosamples ?? []) { + values.push(...(biosample.anatomical_site ?? [])); + values.push(...(biosample.biosample_type ?? [])); + } + for (const donor of data.donors ?? []) { + values.push(...(donor.organism_type ?? [])); + values.push(...(donor.phenotypic_sex ?? 
[])); + values.push(...(donor.reported_ethnicity ?? [])); + } + for (const diagnosis of data.diagnoses ?? []) { + values.push(...(diagnosis.disease ?? [])); + } + for (const file of data.files ?? []) { + values.push(...(file.data_modality ?? [])); + values.push(...(file.file_format ?? [])); + } + for (const library of data.libraries ?? []) { + values.push(...(library.prep_material_name ?? [])); + } + return uniqueNonEmpty(values); +} + +/** + * Builds the sameAs array of external accession URLs. AnVIL datasets reference + * dbGaP study pages via their `registered_identifier` (phs accessions). + * @param dataset - AnVIL dataset entity. + * @returns Array of canonical dbGaP study URLs. + */ +function buildSameAs(dataset: DatasetEntity): string[] { + const urls: string[] = []; + for (const id of dataset.registered_identifier) { + if (!id) continue; + const trimmed = id.trim(); + if (!trimmed) continue; + urls.push(`${DBGAP_STUDY_URL}${trimmed}`); + } + return uniqueNonEmpty(urls); +} diff --git a/app/utils/schemaOrg/hcaProjectDataset.ts b/app/utils/schemaOrg/hcaProjectDataset.ts index e4c7ea7a0..d70fcd2f5 100644 --- a/app/utils/schemaOrg/hcaProjectDataset.ts +++ b/app/utils/schemaOrg/hcaProjectDataset.ts @@ -6,16 +6,16 @@ import type { import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; -import { DESCRIPTION_LENGTH } from "./constants"; import type { SchemaDataset, SchemaOrganization, SchemaPerson, SchemaScholarlyArticle, } from "./types"; -import { stripHtmlTags, truncateDescription, uniqueNonEmpty } from "./utils"; +import { buildDescription, uniqueNonEmpty } from "./utils"; const CATALOG_NAME = "Human Cell Atlas Data Coordination Platform"; +const DESCRIPTION_FALLBACK_SUFFIX = `${CATALOG_NAME} 
project.`; /** * Builds the citation array from project publications. Skips entries without a @@ -44,27 +44,6 @@ function buildCitations( return citations; } -/** - * Builds the Schema.org description for a project, padding short or empty - * source descriptions with the project name and catalog context so the result - * satisfies Google's minimum description-length requirement (50 chars). - * @param sourceDescription - Raw projectDescription from the Azul response. - * @param name - Project name used as a padding fallback. - * @returns HTML-stripped description, padded if short, truncated if long. - */ -function buildDescription(sourceDescription: string, name: string): string { - const stripped = stripHtmlTags(sourceDescription || ""); - if (stripped.length >= DESCRIPTION_LENGTH.MIN) { - return truncateDescription(stripped); - } - // Padding includes the catalog name (~43 chars) to reliably push the - // result past the 50-char minimum even when name + stripped are short. - const padded = stripped - ? `${name} — ${stripped} — ${CATALOG_NAME} project.` - : `${name} — ${CATALOG_NAME} project.`; - return truncateDescription(padded); -} - /** * Builds the creator array from project contributors. Skips entries without a * name. When the contributor has an institution, attaches it as an affiliation. 
@@ -108,7 +87,11 @@ export function buildHcaProjectJsonLd( if (!project) return undefined; const name = project.projectTitle || project.projectShortname; - const description = buildDescription(project.projectDescription, name); + const description = buildDescription( + project.projectDescription, + name, + DESCRIPTION_FALLBACK_SUFFIX + ); const identifier = uniqueNonEmpty([ project.projectId, ...project.accessions.flatMap((accession) => diff --git a/app/utils/schemaOrg/utils.ts b/app/utils/schemaOrg/utils.ts index c7d0849dc..d29358c49 100644 --- a/app/utils/schemaOrg/utils.ts +++ b/app/utils/schemaOrg/utils.ts @@ -1,5 +1,32 @@ import { DESCRIPTION_LENGTH } from "./constants"; +/** + * Builds a Schema.org description string from a raw entity description, padding + * short or empty values with the entity name and a caller-supplied fallback + * suffix so the result satisfies Google's minimum description-length + * requirement (50 chars). + * @param sourceDescription - Raw description (may contain HTML, may be empty). + * @param name - Entity name used in the padded fallback. + * @param fallbackSuffix - Caller-owned suffix (e.g. catalog + entity kind) used + * to reliably push padded descriptions past the 50-character minimum. The + * caller controls phrasing and punctuation; the helper does not add a period. + * @returns HTML-stripped description, padded when short, truncated when long. + */ +export function buildDescription( + sourceDescription: string, + name: string, + fallbackSuffix: string +): string { + const stripped = stripHtmlTags(sourceDescription || ""); + if (stripped.length >= DESCRIPTION_LENGTH.MIN) { + return truncateDescription(stripped); + } + const padded = stripped + ? `${name} — ${stripped} — ${fallbackSuffix}` + : `${name} — ${fallbackSuffix}`; + return truncateDescription(padded); +} + /** * Escapes a JSON string for safe embedding inside an HTML `` or HTML entity injection. 
diff --git a/pages/[entityListType]/[...params].tsx b/pages/[entityListType]/[...params].tsx index d95bbe166..a8bc9df1a 100644 --- a/pages/[entityListType]/[...params].tsx +++ b/pages/[entityListType]/[...params].tsx @@ -30,17 +30,18 @@ import { useRouter } from "next/router"; import { ParsedUrlQuery } from "querystring"; import { JSX } from "react"; import { EntityGuard } from "../../app/components/Detail/components/EntityGuard/entityGuard"; +import { buildAnvilDatasetJsonLd } from "../../app/utils/schemaOrg/anvilDataset"; import { buildHcaProjectJsonLd } from "../../app/utils/schemaOrg/hcaProjectDataset"; +import type { SchemaDataset } from "../../app/utils/schemaOrg/types"; import { readFile } from "../../app/utils/tsvParser"; import { JsonLd } from "../../app/views/EntityDetailView/components/JsonLd/jsonLd"; import { ROUTES } from "../../site-config/anvil-cmg/dev/export/routes"; -import { DatasetsResponse } from "../../app/apis/azul/anvil-cmg/common/responses"; +import type { DatasetsResponse } from "../../app/apis/azul/anvil-cmg/common/responses"; import { getConsentGroup, isNRESOrUnrestrictedAccess, } from "../../app/apis/azul/anvil-cmg/common/transformers"; -import type { ProjectsResponse } from "../../app/apis/azul/hca-dcp/common/responses"; import { isProductionEnvironment } from "../../app/config/utils"; const setOfProcessedIds = new Set(); @@ -88,7 +89,8 @@ const EntityDetailPage = (props: EntityDetailPageProps): JSX.Element => { if (isExportMethodView(query)) return ; return ( <> - {isHcaDcp && renderHcaProjectJsonLd(props)} + {isAnVIL && renderJsonLd(props, "datasets", buildAnvilDatasetJsonLd)} + {isHcaDcp && renderJsonLd(props, "projects", buildHcaProjectJsonLd)} ); @@ -531,19 +533,21 @@ async function processEntityProps( } /** - * Renders the HCA project JSON-LD when the page is a project detail route with - * data and a browser URL available. Returns null otherwise. 
+ * Renders a consumer-specific Schema.org Dataset JSON-LD script when the page + * matches the given entity list type and carries the data needed by the + * builder. Returns null otherwise. * @param props - Entity detail page props. + * @param entityListType - The entity list type this builder applies to. + * @param build - Consumer-specific builder that maps detail data to a Dataset. * @returns JsonLd element, or null when the page can't be described. */ -function renderHcaProjectJsonLd( - props: EntityDetailPageProps +function renderJsonLd( + props: EntityDetailPageProps, + entityListType: string, + build: (data: T, browserURL: string) => SchemaDataset | undefined ): JSX.Element | null { - if (props.entityListType !== "projects") return null; + if (props.entityListType !== entityListType) return null; if (!props.browserURL || !props.data) return null; - const jsonLd = buildHcaProjectJsonLd( - props.data as ProjectsResponse, - props.browserURL - ); + const jsonLd = build(props.data as T, props.browserURL); return jsonLd ? 
: null; } From 75955746a009e5f47394b91df7ebbe0d24af3268 Mon Sep 17 00:00:00 2001 From: Fran McDade <18710366+frano-m@users.noreply.github.com> Date: Thu, 14 May 2026 16:41:42 +1000 Subject: [PATCH 2/2] fix: address copilot feedback on anvil dataset jsonld (#4807) --- .../utils/schemaOrg/anvilDataset.test.ts | 40 ++++++++++++++++++- app/utils/schemaOrg/anvilDataset.ts | 9 ++++- app/utils/schemaOrg/constants.ts | 7 ++++ app/utils/schemaOrg/hcaProjectDataset.ts | 3 +- pages/[entityListType]/[...params].tsx | 9 ++++- 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/__tests__/utils/schemaOrg/anvilDataset.test.ts b/__tests__/utils/schemaOrg/anvilDataset.test.ts index 26ca01b67..1191be18e 100644 --- a/__tests__/utils/schemaOrg/anvilDataset.test.ts +++ b/__tests__/utils/schemaOrg/anvilDataset.test.ts @@ -1,6 +1,9 @@ import type { DatasetsResponse } from "../../../app/apis/azul/anvil-cmg/common/responses"; import { buildAnvilDatasetJsonLd } from "../../../app/utils/schemaOrg/anvilDataset"; -import { DESCRIPTION_LENGTH } from "../../../app/utils/schemaOrg/constants"; +import { + DESCRIPTION_LENGTH, + MAX_KEYWORDS, +} from "../../../app/utils/schemaOrg/constants"; const BROWSER_URL = "https://explore.anvilproject.org"; @@ -134,6 +137,21 @@ describe("buildAnvilDatasetJsonLd", () => { expect(result!.sameAs).toBeUndefined(); }); + it("skips registered_identifier values that aren't dbGaP phs accessions", () => { + const response = makeDatasetsResponse(); + response.datasets[0].registered_identifier = [ + "phs000123", + "not-a-phs", + "PHS000456", // wrong case — phs accessions are lowercase + "phs000789", + ]; + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.sameAs).toEqual([ + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000123", + "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000789", + ]); + }); + it("builds deduplicated keywords from activity, biosample, donor, diagnosis, 
file, and library fields", () => { const response = makeDatasetsResponse({ activities: [ @@ -179,4 +197,24 @@ describe("buildAnvilDatasetJsonLd", () => { expect(result!.keywords).toBeUndefined(); expect(result!.sameAs).toBeUndefined(); }); + + it("caps keywords at MAX_KEYWORDS to keep payload size predictable", () => { + const fileFormats = Array.from( + { length: MAX_KEYWORDS + 10 }, + (_, i) => `fmt-${i}` + ); + const response = makeDatasetsResponse({ + files: [ + { + data_modality: [], + file_format: fileFormats, + file_id: "f1", + file_type: "sequencing", + }, + ], + } as unknown as Partial); + const result = buildAnvilDatasetJsonLd(response, BROWSER_URL); + expect(result!.keywords).toHaveLength(MAX_KEYWORDS); + expect(result!.keywords).toEqual(fileFormats.slice(0, MAX_KEYWORDS)); + }); }); diff --git a/app/utils/schemaOrg/anvilDataset.ts b/app/utils/schemaOrg/anvilDataset.ts index 520986327..53dac44d7 100644 --- a/app/utils/schemaOrg/anvilDataset.ts +++ b/app/utils/schemaOrg/anvilDataset.ts @@ -1,5 +1,6 @@ import type { DatasetEntity } from "../../apis/azul/anvil-cmg/common/entities"; import type { DatasetsResponse } from "../../apis/azul/anvil-cmg/common/responses"; +import { MAX_KEYWORDS } from "./constants"; import type { SchemaDataset } from "./types"; import { buildDescription, uniqueNonEmpty } from "./utils"; @@ -7,6 +8,10 @@ const CATALOG_NAME = "AnVIL Data Explorer"; const DESCRIPTION_FALLBACK_SUFFIX = `${CATALOG_NAME} dataset.`; const DBGAP_STUDY_URL = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="; +// dbGaP study accession format (e.g. "phs001234"). We test values against this +// before constructing dbGaP study URLs so a non-dbGaP value in +// `registered_identifier` is skipped. Only the `phs<digits>` prefix is checked. +const DBGAP_ACCESSION_PATTERN = /^phs\d+/; /** * Builds a Schema.org Dataset JSON-LD object for an AnVIL CMG dataset.
@@ -92,7 +97,7 @@ function buildKeywords(data: DatasetsResponse): string[] { for (const library of data.libraries ?? []) { values.push(...(library.prep_material_name ?? [])); } - return uniqueNonEmpty(values); + return uniqueNonEmpty(values).slice(0, MAX_KEYWORDS); } /** @@ -106,7 +111,7 @@ function buildSameAs(dataset: DatasetEntity): string[] { for (const id of dataset.registered_identifier) { if (!id) continue; const trimmed = id.trim(); - if (!trimmed) continue; + if (!DBGAP_ACCESSION_PATTERN.test(trimmed)) continue; urls.push(`${DBGAP_STUDY_URL}${trimmed}`); } return uniqueNonEmpty(urls); diff --git a/app/utils/schemaOrg/constants.ts b/app/utils/schemaOrg/constants.ts index d3bfb45a7..3994d4db2 100644 --- a/app/utils/schemaOrg/constants.ts +++ b/app/utils/schemaOrg/constants.ts @@ -12,3 +12,10 @@ export const DESCRIPTION_LENGTH = { MAX: 5000, MIN: 50, } as const; + +/** + * Cap on the number of entries surfaced in a Dataset's `keywords` array, to + * keep JSON-LD payload size predictable. The capping approach follows NCPI's + * reference implementation, which caps citations at 5. + */ +export const MAX_KEYWORDS = 20; diff --git a/app/utils/schemaOrg/hcaProjectDataset.ts b/app/utils/schemaOrg/hcaProjectDataset.ts index d70fcd2f5..20b9ed879 100644 --- a/app/utils/schemaOrg/hcaProjectDataset.ts +++ b/app/utils/schemaOrg/hcaProjectDataset.ts @@ -6,6 +6,7 @@ import type { import type { ProjectsResponse } from "../../apis/azul/hca-dcp/common/responses"; import { transformAccessionURL } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/accessionMapper"; import { ACCESSION_CONFIGS_BY_RESPONSE_KEY } from "../../viewModelBuilders/azul/hca-dcp/common/accessionMapper/constants"; +import { MAX_KEYWORDS } from "./constants"; import type { SchemaDataset, SchemaOrganization, @@ -156,7 +157,7 @@ function buildKeywords(data: ProjectsResponse): string[] { values.push(...(protocol.libraryConstructionApproach ??
[])); values.push(...(protocol.instrumentManufacturerModel ?? [])); } - return uniqueNonEmpty(values); + return uniqueNonEmpty(values).slice(0, MAX_KEYWORDS); } /** diff --git a/pages/[entityListType]/[...params].tsx b/pages/[entityListType]/[...params].tsx index a8bc9df1a..9d4ff4eee 100644 --- a/pages/[entityListType]/[...params].tsx +++ b/pages/[entityListType]/[...params].tsx @@ -69,10 +69,15 @@ export interface EntityDetailPageProps extends AzulEntityStaticResponse { * @param props.entityListType - Entity list type. * @returns Entity detail view component. */ +// Exact appTitle match — substring detection would also hit "AnVIL Dataset +// Catalog", which shares the "AnVIL" prefix but has a different entity shape. +const APP_TITLE_ANVIL_CMG = "AnVIL Data Explorer"; +const APP_TITLE_HCA_DCP = "HCA Data Explorer"; + const EntityDetailPage = (props: EntityDetailPageProps): JSX.Element => { const { config: siteConfig } = useConfig(); - const isAnVIL = siteConfig.appTitle?.includes("AnVIL"); - const isHcaDcp = siteConfig.appTitle?.includes("HCA"); + const isAnVIL = siteConfig.appTitle === APP_TITLE_ANVIL_CMG; + const isHcaDcp = siteConfig.appTitle === APP_TITLE_HCA_DCP; const { query } = useRouter(); if (!props.entityListType) return <>; if (props.override) return ;