-
Notifications
You must be signed in to change notification settings - Fork 5
feat: [anvil dx] add anvil datasets to google datasets catalog (#4807) #4831
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
frano-m
wants to merge
2
commits into
main
Choose a base branch
from
fran/4807-anvil-dx-google-datasets-jsonld
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,220 @@ | ||
| import type { DatasetsResponse } from "../../../app/apis/azul/anvil-cmg/common/responses"; | ||
| import { buildAnvilDatasetJsonLd } from "../../../app/utils/schemaOrg/anvilDataset"; | ||
| import { | ||
| DESCRIPTION_LENGTH, | ||
| MAX_KEYWORDS, | ||
| } from "../../../app/utils/schemaOrg/constants"; | ||
|
|
||
| const BROWSER_URL = "https://explore.anvilproject.org"; | ||
|
|
||
| /** | ||
| * Builds a minimal valid AnVIL datasets response with optional overrides for | ||
| * top-level (dataset) and aggregated (activity/biosample/donor/file/library/ | ||
| * diagnosis) fields. | ||
| * @param overrides - Partial overrides applied to the base response. | ||
| * @returns A `DatasetsResponse` shape suitable for builder tests. | ||
| */ | ||
| function makeDatasetsResponse( | ||
| overrides: Partial<DatasetsResponse> = {} | ||
| ): DatasetsResponse { | ||
| return { | ||
| activities: [], | ||
| biosamples: [], | ||
| datasets: [ | ||
| { | ||
| accessible: true, | ||
| consent_group: ["NRES"], | ||
| dataset_id: "uuid-1", | ||
| description: | ||
| "A multi-cohort study of rare disease across many donors and biosamples.", | ||
| duos_id: null, | ||
| registered_identifier: [], | ||
| title: "Rare disease dataset", | ||
| }, | ||
| ], | ||
| diagnoses: [], | ||
| donors: [], | ||
| entryId: "abc", | ||
| files: [], | ||
| libraries: [], | ||
| status: 200, | ||
| ...overrides, | ||
| } as unknown as DatasetsResponse; | ||
| } | ||
|
|
||
// Suite for the Schema.org JSON-LD builder used on AnVIL dataset detail pages.
// Covers the required Dataset fields, description normalization (HTML
// stripping, min/max length handling), identifier/sameAs construction from
// dbGaP accessions, and keyword aggregation/deduplication/capping.
describe("buildAnvilDatasetJsonLd", () => {
  // No dataset entity in the response → nothing to describe; caller skips rendering.
  it("returns undefined when no dataset is present", () => {
    const response = makeDatasetsResponse({ datasets: [] });
    expect(buildAnvilDatasetJsonLd(response, BROWSER_URL)).toBeUndefined();
  });

  // Happy path: all required Schema.org Dataset properties present and correct.
  it("populates required Schema.org Dataset fields", () => {
    const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL);
    expect(result).toBeDefined();
    expect(result!["@context"]).toBe("https://schema.org");
    expect(result!["@type"]).toBe("Dataset");
    expect(result!.name).toBe("Rare disease dataset");
    expect(result!.description).toBe(
      "A multi-cohort study of rare disease across many donors and biosamples."
    );
    expect(result!.url).toBe(`${BROWSER_URL}/datasets/uuid-1`);
    expect(result!.identifier).toEqual(["uuid-1"]);
    expect(result!.isAccessibleForFree).toBe(true);
    expect(result!.includedInDataCatalog).toEqual({
      "@type": "DataCatalog",
      name: "AnVIL Data Explorer",
      url: BROWSER_URL,
    });
  });

  // Name must never be empty; the stable dataset ID is the fallback.
  it("falls back to dataset_id when title is empty", () => {
    const response = makeDatasetsResponse();
    response.datasets[0].title = "";
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.name).toBe("uuid-1");
  });

  // Descriptions may carry markup from upstream; JSON-LD wants plain text.
  it("strips HTML tags from description", () => {
    const response = makeDatasetsResponse();
    response.datasets[0].description =
      "<p>Single-cell <strong>RNA-seq</strong> data across many donors and tissues.</p>";
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.description).toBe(
      "Single-cell RNA-seq data across many donors and tissues."
    );
  });

  // Upper bound: truncated to DESCRIPTION_LENGTH.MAX with a trailing ellipsis.
  it("truncates descriptions over 5000 characters and appends an ellipsis", () => {
    const longDescription = "a".repeat(DESCRIPTION_LENGTH.MAX + 200);
    const response = makeDatasetsResponse();
    response.datasets[0].description = longDescription;
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.description).toHaveLength(DESCRIPTION_LENGTH.MAX);
    expect(result!.description.endsWith("…")).toBe(true);
  });

  // Lower bound: short descriptions are padded with the name and catalog
  // context so the result meets DESCRIPTION_LENGTH.MIN.
  it("pads short descriptions with name and catalog context to meet the 50-char minimum", () => {
    const response = makeDatasetsResponse();
    response.datasets[0].description = "Short.";
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.description).toBe(
      "Rare disease dataset — Short. — AnVIL Data Explorer dataset."
    );
    expect(result!.description.length).toBeGreaterThanOrEqual(
      DESCRIPTION_LENGTH.MIN
    );
  });

  // Missing description entirely: name + catalog suffix alone must suffice.
  it("falls back to dataset name plus catalog context when description is missing", () => {
    const response = makeDatasetsResponse();
    response.datasets[0].description = undefined;
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.description).toBe(
      "Rare disease dataset — AnVIL Data Explorer dataset."
    );
    expect(result!.description.length).toBeGreaterThanOrEqual(
      DESCRIPTION_LENGTH.MIN
    );
  });

  // phs accessions appear both as bare identifiers and as dbGaP study URLs;
  // null entries in registered_identifier are dropped.
  it("includes registered_identifier values in identifier and dbGaP study URLs in sameAs", () => {
    const response = makeDatasetsResponse();
    response.datasets[0].registered_identifier = [
      "phs000123",
      "phs000456",
      null,
    ];
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.identifier).toEqual(["uuid-1", "phs000123", "phs000456"]);
    expect(result!.sameAs).toEqual([
      "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000123",
      "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000456",
    ]);
  });

  // Optional property: omitted rather than emitted as an empty array.
  it("omits sameAs when there are no registered identifiers", () => {
    const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL);
    expect(result!.sameAs).toBeUndefined();
  });

  // Only values matching the lowercase phs accession pattern become links;
  // anything else would produce a malformed dbGaP URL.
  it("skips registered_identifier values that aren't dbGaP phs accessions", () => {
    const response = makeDatasetsResponse();
    response.datasets[0].registered_identifier = [
      "phs000123",
      "not-a-phs",
      "PHS000456", // wrong case — phs accessions are lowercase
      "phs000789",
    ];
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.sameAs).toEqual([
      "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000123",
      "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000789",
    ]);
  });

  // Keyword ordering follows the source iteration order (activities,
  // biosamples, donors, diagnoses, files, libraries); duplicates such as
  // "genomic" (activity + file) appear once.
  it("builds deduplicated keywords from activity, biosample, donor, diagnosis, file, and library fields", () => {
    const response = makeDatasetsResponse({
      activities: [
        { activity_type: ["sequencing"], data_modality: ["genomic"] },
      ],
      biosamples: [{ anatomical_site: ["brain"], biosample_type: ["tissue"] }],
      diagnoses: [{ disease: ["epilepsy"], phenotype: [] }],
      donors: [
        {
          organism_type: ["Homo sapiens"],
          phenotypic_sex: ["female"],
          reported_ethnicity: ["asian"],
        },
      ],
      files: [
        {
          data_modality: ["genomic"],
          file_format: ["fastq", "bam"],
          file_id: "f1",
          file_type: "sequencing",
        },
      ],
      libraries: [{ prep_material_name: ["DNA"] }],
    } as unknown as Partial<DatasetsResponse>);
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.keywords).toEqual([
      "sequencing",
      "genomic",
      "brain",
      "tissue",
      "Homo sapiens",
      "female",
      "asian",
      "epilepsy",
      "fastq",
      "bam",
      "DNA",
    ]);
  });

  // Optional properties stay absent when no source data exists.
  it("omits keywords and sameAs when sources are empty", () => {
    const result = buildAnvilDatasetJsonLd(makeDatasetsResponse(), BROWSER_URL);
    expect(result!.keywords).toBeUndefined();
    expect(result!.sameAs).toBeUndefined();
  });

  // Payload-size guard: the keyword list is hard-capped at MAX_KEYWORDS,
  // keeping the first values in iteration order.
  it("caps keywords at MAX_KEYWORDS to keep payload size predictable", () => {
    const fileFormats = Array.from(
      { length: MAX_KEYWORDS + 10 },
      (_, i) => `fmt-${i}`
    );
    const response = makeDatasetsResponse({
      files: [
        {
          data_modality: [],
          file_format: fileFormats,
          file_id: "f1",
          file_type: "sequencing",
        },
      ],
    } as unknown as Partial<DatasetsResponse>);
    const result = buildAnvilDatasetJsonLd(response, BROWSER_URL);
    expect(result!.keywords).toHaveLength(MAX_KEYWORDS);
    expect(result!.keywords).toEqual(fileFormats.slice(0, MAX_KEYWORDS));
  });
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| import type { DatasetEntity } from "../../apis/azul/anvil-cmg/common/entities"; | ||
| import type { DatasetsResponse } from "../../apis/azul/anvil-cmg/common/responses"; | ||
| import { MAX_KEYWORDS } from "./constants"; | ||
| import type { SchemaDataset } from "./types"; | ||
| import { buildDescription, uniqueNonEmpty } from "./utils"; | ||
|
|
||
| const CATALOG_NAME = "AnVIL Data Explorer"; | ||
| const DESCRIPTION_FALLBACK_SUFFIX = `${CATALOG_NAME} dataset.`; | ||
| const DBGAP_STUDY_URL = | ||
| "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="; | ||
| // dbGaP study accession format (e.g. "phs001234"). We validate against this | ||
| // before constructing identifiers.org / dbGaP study URLs so a non-dbGaP value | ||
| // in `registered_identifier` doesn't produce a malformed link. | ||
| const DBGAP_ACCESSION_PATTERN = /^phs\d+/; | ||
|
|
||
| /** | ||
| * Builds a Schema.org Dataset JSON-LD object for an AnVIL CMG dataset. | ||
| * | ||
| * Returns `undefined` when the response does not carry a dataset we can | ||
| * describe (i.e. no dataset entity), so the caller can skip rendering. | ||
| * @param data - AnVIL CMG dataset detail response from Azul. | ||
| * @param browserURL - Site base URL used for canonical and catalog URLs. | ||
| * @returns Schema.org Dataset JSON-LD object, or `undefined` if not buildable. | ||
| */ | ||
| export function buildAnvilDatasetJsonLd( | ||
| data: DatasetsResponse, | ||
| browserURL: string | ||
| ): SchemaDataset | undefined { | ||
| const dataset = data.datasets?.[0]; | ||
| if (!dataset) return undefined; | ||
|
|
||
| const name = dataset.title || dataset.dataset_id; | ||
| const description = buildDescription( | ||
| dataset.description || "", | ||
| name, | ||
| DESCRIPTION_FALLBACK_SUFFIX | ||
| ); | ||
| const identifier = uniqueNonEmpty([ | ||
| dataset.dataset_id, | ||
| ...dataset.registered_identifier, | ||
| ]); | ||
|
|
||
| const jsonLd: SchemaDataset = { | ||
| "@context": "https://schema.org", | ||
| "@type": "Dataset", | ||
| description, | ||
| identifier, | ||
| includedInDataCatalog: { | ||
| "@type": "DataCatalog", | ||
| name: CATALOG_NAME, | ||
| url: browserURL, | ||
| }, | ||
| // Google's `isAccessibleForFree` is the inverse of "paid", not "unrestricted" — dbGaP gating doesn't change the value. | ||
| isAccessibleForFree: true, | ||
| name, | ||
| url: `${browserURL}/datasets/${dataset.dataset_id}`, | ||
| }; | ||
|
|
||
| const sameAs = buildSameAs(dataset); | ||
| if (sameAs.length > 0) jsonLd.sameAs = sameAs; | ||
|
|
||
| const keywords = buildKeywords(data); | ||
| if (keywords.length > 0) jsonLd.keywords = keywords; | ||
|
|
||
| return jsonLd; | ||
| } | ||
|
|
||
| /** | ||
| * Builds a keywords array by unioning biologically-meaningful fields from the | ||
| * dataset's aggregated activity/biosample/donor/file/library/diagnosis | ||
| * responses. | ||
| * @param data - AnVIL CMG dataset detail response. | ||
| * @returns Deduplicated keywords array. | ||
| */ | ||
| function buildKeywords(data: DatasetsResponse): string[] { | ||
| const values: (string | null | undefined)[] = []; | ||
| for (const activity of data.activities ?? []) { | ||
| values.push(...(activity.activity_type ?? [])); | ||
| values.push(...(activity.data_modality ?? [])); | ||
| } | ||
| for (const biosample of data.biosamples ?? []) { | ||
| values.push(...(biosample.anatomical_site ?? [])); | ||
| values.push(...(biosample.biosample_type ?? [])); | ||
| } | ||
| for (const donor of data.donors ?? []) { | ||
| values.push(...(donor.organism_type ?? [])); | ||
| values.push(...(donor.phenotypic_sex ?? [])); | ||
| values.push(...(donor.reported_ethnicity ?? [])); | ||
| } | ||
| for (const diagnosis of data.diagnoses ?? []) { | ||
| values.push(...(diagnosis.disease ?? [])); | ||
| } | ||
| for (const file of data.files ?? []) { | ||
| values.push(...(file.data_modality ?? [])); | ||
| values.push(...(file.file_format ?? [])); | ||
| } | ||
| for (const library of data.libraries ?? []) { | ||
| values.push(...(library.prep_material_name ?? [])); | ||
| } | ||
| return uniqueNonEmpty(values).slice(0, MAX_KEYWORDS); | ||
| } | ||
|
|
||
| /** | ||
| * Builds the sameAs array of external accession URLs. AnVIL datasets reference | ||
| * dbGaP study pages via their `registered_identifier` (phs accessions). | ||
| * @param dataset - AnVIL dataset entity. | ||
| * @returns Array of canonical dbGaP study URLs. | ||
| */ | ||
| function buildSameAs(dataset: DatasetEntity): string[] { | ||
| const urls: string[] = []; | ||
| for (const id of dataset.registered_identifier) { | ||
| if (!id) continue; | ||
| const trimmed = id.trim(); | ||
| if (!DBGAP_ACCESSION_PATTERN.test(trimmed)) continue; | ||
| urls.push(`${DBGAP_STUDY_URL}${trimmed}`); | ||
| } | ||
| return uniqueNonEmpty(urls); | ||
| } | ||
|
frano-m marked this conversation as resolved.
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.