From ffd5c15ec7fda45739913ee72db692aa0cd66013 Mon Sep 17 00:00:00 2001
From: roll
Date: Tue, 4 Jun 2024 15:46:25 +0100
Subject: [PATCH 1/2] Added Data Package serializer

---
 osf/metadata/serializers/__init__.py    |   2 +
 osf/metadata/serializers/datapackage.py | 114 ++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 osf/metadata/serializers/datapackage.py

diff --git a/osf/metadata/serializers/__init__.py b/osf/metadata/serializers/__init__.py
index 0a2bf995e11..a04425072d6 100644
--- a/osf/metadata/serializers/__init__.py
+++ b/osf/metadata/serializers/__init__.py
@@ -7,6 +7,7 @@
 from osf import exceptions
 from osf.metadata import gather
 from .datacite import DataciteJsonMetadataSerializer, DataciteXmlMetadataSerializer
+from .datapackage import DatapackageSerializer
 from .google_dataset_json_ld import GoogleDatasetJsonLdSerializer
 from .turtle import TurtleMetadataSerializer
 
@@ -15,6 +16,7 @@
     'turtle': TurtleMetadataSerializer,
     'datacite-json': DataciteJsonMetadataSerializer,
     'datacite-xml': DataciteXmlMetadataSerializer,
+    'datapackage.json': DatapackageSerializer,
     'google-dataset-json-ld': GoogleDatasetJsonLdSerializer,
 }
 
diff --git a/osf/metadata/serializers/datapackage.py b/osf/metadata/serializers/datapackage.py
new file mode 100644
index 00000000000..c23d4a69ec5
--- /dev/null
+++ b/osf/metadata/serializers/datapackage.py
@@ -0,0 +1,114 @@
+import json
+from osf.metadata.serializers import _base
+from osf.metadata.serializers.datacite import DataciteJsonMetadataSerializer
+
+PROFILE_URL = "https://datapackage.org/profiles/2.0/datapackage.json"
+
+
+class DatapackageSerializer(_base.MetadataSerializer):
+    """Serialize OSF metadata as a Data Package (v2) JSON descriptor."""
+
+    mediatype = "application/json"  # type: ignore
+
+    def filename_for_itemid(self, itemid: str):
+        """Return the descriptor filename for `itemid`."""
+        return f"{itemid}.datapackage.json"
+
+    def serialize(self) -> str:
+        """Return the descriptor as indented JSON text with sorted keys."""
+        return json.dumps(
+            self.metadata_as_dict(),
+            indent=2,
+            sort_keys=True,
+        )
+
+    # NOTE: this mapping can be done by `dplib-py` on Python3.8+
+    def metadata_as_dict(self) -> dict:
+        """Map the DataCite JSON for this basket onto a Data Package dict.
+
+        `$schema` and an empty `resources` list are always present; other
+        properties are added only when the DataCite record yields them.
+        """
+        dataset = DataciteJsonMetadataSerializer(self.basket).metadata_as_dict()
+        package = {"$schema": PROFILE_URL, "resources": []}
+
+        # Id
+        for identifier in dataset.get("identifiers", []):
+            id_type = identifier.get("identifierType")
+            value = identifier.get("identifier")
+            if value and id_type == "DOI":
+                package["id"] = value
+                break
+
+        # Title
+        for title in dataset.get("titles", []):
+            title_type = title.get("titleType")
+            value = title.get("title")
+            if value and not title_type:
+                package["title"] = value
+                break
+
+        # Description
+        for description in dataset.get("descriptions", []):
+            description_type = description.get("descriptionType")
+            value = description.get("description")
+            if value and description_type == "Abstract":
+                package["description"] = value
+                break
+
+        # Homepage
+        for identifier in dataset.get("identifiers", []):
+            id_type = identifier.get("identifierType")
+            value = identifier.get("identifier")
+            if value and id_type == "URL":
+                package["homepage"] = value
+                break
+
+        # Version
+        version = dataset.get("version")
+        if version:
+            package["version"] = version
+
+        # Keywords
+        for subject in dataset.get("subjects", []):
+            value = subject.get("subject")
+            if value:
+                package.setdefault("keywords", []).append(value)
+
+        # Licenses
+        for right in dataset.get("rightsList", []):
+            license_info = {}
+            license_info["name"] = right.get("rightsIdentifier")
+            license_info["path"] = right.get("rightsUri")
+            license_info["title"] = right.get("rights")
+            license_info = {k: v for k, v in license_info.items() if v is not None}
+            if license_info:
+                package.setdefault("licenses", []).append(license_info)
+
+        # Contributors
+        creators = dataset.get("creators", [])
+        contributors = dataset.get("contributors", [])
+        for default_role, items in [("creator", creators), (None, contributors)]:
+            for item in items:
+                # Resolve the role per item so that one contributor's type
+                # never carries over to the next item.
+                role = item.get("contributorType", default_role)
+                contributor = {}
+                contributor["title"] = item.get("name")
+                contributor["givenName"] = item.get("givenName")
+                contributor["familyName"] = item.get("familyName")
+                if role:
+                    contributor["roles"] = [role]
+                for affiliation in item.get("affiliations", []):
+                    name = affiliation.get("name")
+                    if name:
+                        contributor["organization"] = name
+                        break
+                contributor = {k: v for k, v in contributor.items() if v is not None}
+                if contributor:
+                    package.setdefault("contributors", []).append(contributor)
+
+        # Resources
+        # TODO: is there a way to get actual file urls from the metadata?
+
+        return package

From a2e3b86bf51a21a282164079b21350d29acaf5ce Mon Sep 17 00:00:00 2001
From: roll
Date: Wed, 18 Sep 2024 14:32:13 +0100
Subject: [PATCH 2/2] Removed irrelevant TODO

---
 osf/metadata/serializers/datapackage.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/osf/metadata/serializers/datapackage.py b/osf/metadata/serializers/datapackage.py
index c23d4a69ec5..74e3f302d40 100644
--- a/osf/metadata/serializers/datapackage.py
+++ b/osf/metadata/serializers/datapackage.py
@@ -22,7 +22,6 @@ def serialize(self) -> str:
             sort_keys=True,
         )
 
-    # NOTE: this mapping can be done by `dplib-py` on Python3.8+
     def metadata_as_dict(self) -> dict:
         """Map the DataCite JSON for this basket onto a Data Package dict.
 
@@ -108,7 +107,4 @@ def metadata_as_dict(self) -> dict:
                 if contributor:
                     package.setdefault("contributors", []).append(contributor)
 
-        # Resources
-        # TODO: is there a way to get actual file urls from the metadata?
-
         return package