From 8f3e7481b2df7c63631623ba3344e316cdd89004 Mon Sep 17 00:00:00 2001 From: Bertrand Zuchuat Date: Thu, 24 Feb 2022 14:46:04 +0100 Subject: [PATCH] oai: fix error on document type processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Closes #789. Co-Authored-by: Bertrand Zuchuat Co-Authored-by: Johnny Mariéthoz --- sonar/config.py | 2 +- .../modules/documents/serializers/__init__.py | 13 +-- sonar/modules/documents/serializers/dc.py | 34 +++++-- .../documents/serializers/schemas/dc.py | 13 +-- tests/ui/documents/test_dc_schema.py | 90 ++++++++++--------- 5 files changed, 82 insertions(+), 70 deletions(-) diff --git a/sonar/config.py b/sonar/config.py index 3d5360e4..8bc3e85a 100644 --- a/sonar/config.py +++ b/sonar/config.py @@ -851,7 +851,7 @@ def _(x): 'oai_dc': { 'namespace': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'schema': 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', - 'serializer': 'sonar.modules.documents.serializers.oaipmh_oai_dc', + 'serializer': 'sonar.modules.documents.serializers.dc.sonar_dublin_core', } } #: Number of records to return per page in OAI-PMH results. 
diff --git a/sonar/modules/documents/serializers/__init__.py b/sonar/modules/documents/serializers/__init__.py index 5f07e625..3c801bd0 100644 --- a/sonar/modules/documents/serializers/__init__.py +++ b/sonar/modules/documents/serializers/__init__.py @@ -21,24 +21,20 @@ from datetime import datetime -from flask import current_app, request +from flask import request from invenio_records_rest.serializers.response import record_responsify, \ search_responsify from sonar.modules.collections.api import Record as CollectionRecord -from sonar.modules.documents.serializers.dc import SonarDublinCoreSerializer from sonar.modules.documents.serializers.google_scholar import \ SonarGoogleScholarSerializer from sonar.modules.documents.serializers.schemaorg import \ SonarSchemaOrgSerializer -from sonar.modules.documents.serializers.schemas.dc import DublinCoreV1 from sonar.modules.documents.serializers.schemas.google_scholar import \ GoogleScholarV1 from sonar.modules.documents.serializers.schemas.schemaorg import SchemaOrgV1 from sonar.modules.organisations.api import OrganisationRecord from sonar.modules.serializers import JSONSerializer as _JSONSerializer -from sonar.modules.subdivisions.api import Record as SubdivisionRecord -from sonar.modules.users.api import current_user_record from sonar.modules.utils import get_language_value from ..marshmallow import DocumentSchemaV1 @@ -84,8 +80,6 @@ def post_process_serialize_search(self, results, pid_fetcher): # =========== #: JSON serializer definition. json_v1 = JSONSerializer(DocumentSchemaV1) -#: Dublin Core serializer -dc_v1 = SonarDublinCoreSerializer(DublinCoreV1, replace_refs=True) #: schema.org serializer schemaorg_v1 = SonarSchemaOrgSerializer(SchemaOrgV1, replace_refs=True) #: google scholar serializer @@ -104,8 +98,3 @@ def post_process_serialize_search(self, results, pid_fetcher): 'json_v1_response', 'json_v1_search', ) - -# OAI-PMH record serializers. 
-# =========================== -#: OAI-PMH OAI Dublin Core record serializer. -oaipmh_oai_dc = dc_v1.serialize_oaipmh diff --git a/sonar/modules/documents/serializers/dc.py b/sonar/modules/documents/serializers/dc.py index 3a980a6a..12adce37 100644 --- a/sonar/modules/documents/serializers/dc.py +++ b/sonar/modules/documents/serializers/dc.py @@ -17,16 +17,34 @@ """Dublin Core serializer.""" -from invenio_records_rest.serializers.dc import DublinCoreSerializer +from dcxml import simpledc +from flask_resources.serializers import SerializerMixin +from sonar.modules.documents.serializers.schemas.dc import DublinCoreSchema -class SonarDublinCoreSerializer(DublinCoreSerializer): - """Marshmallow based DublinCore serializer for records.""" - def dump(self, obj, context=None): - """Serialize object with schema. +class SonarDublinCoreXMLSerializer(SerializerMixin): + """DublinCore serializer for records.""" - Mandatory to override this method, as invenio-records-rest does not - use the right way to dump objects (compatible with marshmallow 3.9). + def __init__(self, **options): + """Constructor.""" + self.schema_class = DublinCoreSchema + + def transform_record(self, obj): + """Transform record.""" + # TODO: Remove this hack after migrating to invenio resources + return self.schema_class().dump(dict(metadata=obj)) + + def serialize_object_xml(self, obj): + """Serialize a single record and persistent identifier to etree. 
+ + :param obj: Record instance """ - return self.schema_class(context=context).dump(obj) + json = self.transform_record(obj["_source"]) + return simpledc.dump_etree(json) + + +def sonar_dublin_core(pid, record): + """Get DublinCore XML for OAI-PMH.""" + return SonarDublinCoreXMLSerializer()\ + .serialize_object_xml(record) diff --git a/sonar/modules/documents/serializers/schemas/dc.py b/sonar/modules/documents/serializers/schemas/dc.py index 0d9f150e..a6df019c 100644 --- a/sonar/modules/documents/serializers/schemas/dc.py +++ b/sonar/modules/documents/serializers/schemas/dc.py @@ -28,7 +28,7 @@ from .base_schema import BaseSchema -class DublinCoreV1(BaseSchema): +class DublinCoreSchema(BaseSchema): """Schema for records v1 in JSON.""" contributors = fields.Method('get_contributors') @@ -249,11 +249,12 @@ def get_titles(self, obj): def get_types(self, obj): """Get types.""" - if obj['metadata'].get('documentType'): - return [ - 'http://purl.org/coar/resource_type/{type}'.format( - type=obj['metadata']['documentType'].split(':')[1]) - ] + if obj['metadata'].get('documentType', ''): + types = obj['metadata'].get('documentType', '').split(':') + if len(types) == 1: + return [f'{types[0]}'] + if len(types) == 2: + return [f'http://purl.org/coar/resource_type/{types[1]}'] return [] diff --git a/tests/ui/documents/test_dc_schema.py b/tests/ui/documents/test_dc_schema.py index ede4065e..eccf2bc9 100644 --- a/tests/ui/documents/test_dc_schema.py +++ b/tests/ui/documents/test_dc_schema.py @@ -22,7 +22,7 @@ import pytest from sonar.modules.documents.api import DocumentRecord -from sonar.modules.documents.serializers import dc_v1 +from sonar.modules.documents.serializers.dc import SonarDublinCoreXMLSerializer @pytest.fixture() @@ -98,11 +98,11 @@ def contributors(): def test_contributors(minimal_document, contributors): """Test contributors serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = 
SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['contributors'] == [] minimal_document.update({'contribution': contributors}) - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['contributors'] == [ 'Contributor 1', 'Contributor 2 (999 : 2010 : Sion)', @@ -114,18 +114,18 @@ def test_contributors(minimal_document, contributors): def test_creators(minimal_document, contributors): """Test creators serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['contributors'] == [] minimal_document.update({'contribution': contributors}) - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['creators'] == [ 'Creator 1', 'Creator 2 (123 : 2019 : Martigny)' ] def test_dates(app, minimal_document, embargo_date): - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['dates'] == [] minimal_document.update({ @@ -142,17 +142,17 @@ def test_dates(app, minimal_document, embargo_date): 'startDate': '2020-01-01' }] }) - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result =SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['dates'] == ['2019', '2020-01-01'] minimal_document.pop('provisionActivity', None) minimal_document.files['test.pdf'] = BytesIO(b'File content') - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['dates'] == [] minimal_document.files['test.pdf']['type'] = 'file' - result = 
dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['dates'] == [] iso_embargo_date = embargo_date.isoformat() @@ -162,14 +162,14 @@ def test_dates(app, minimal_document, embargo_date): minimal_document.files['test.pdf']['access'] = 'coar:c_f1cf' minimal_document.files['test.pdf']['restricted'] = 'full' minimal_document.files['test.pdf']['embargo_date'] = iso_embargo_date - result = dc_v1.transform_record(minimal_document['pid'], - minimal_document) + result = SonarDublinCoreXMLSerializer()\ + .transform_record(minimal_document) assert result['dates'] == [ f'info:eu-repo/date/embargoEnd/{iso_embargo_date}'] def test_descriptions(minimal_document): - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['descriptions'] == [] minimal_document['abstracts'] = [{ @@ -177,43 +177,43 @@ def test_descriptions(minimal_document): }, { 'value': 'Description 2' }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['descriptions'] == ['Description 1', 'Description 2'] def test_formats(minimal_document): - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['formats'] == [] minimal_document.files['test.pdf'] = BytesIO(b'File content') - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['formats'] == [] minimal_document.files['test.pdf'] = BytesIO(b'File content') minimal_document.files['test.pdf']['type'] = 'file' - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = 
SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['formats'] == ['application/pdf'] def test_identifiers(minimal_document): """Test identifiers serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['identifiers'] == ['http://localhost/global/documents/1000'] def test_languages(minimal_document): """Test languages serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['languages'] == [] minimal_document['language'] = [{'value': 'eng'}, {'value': 'fre'}] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['languages'] == ['eng', 'fre'] def test_publishers(minimal_document): """Test publishers serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['publishers'] == [] minimal_document['provisionActivity'] = [{ @@ -246,13 +246,13 @@ def test_publishers(minimal_document): }] }] }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['publishers'] == ['Publisher 1'] def test_relations(minimal_document): """Test relations serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['relations'] == [] minimal_document['otherEdition'] = [{ @@ -264,7 +264,7 @@ def test_relations(minimal_document): 'electronicLocator': 'https://some.url.2' } }] - result = dc_v1.transform_record(minimal_document['pid'], 
minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['relations'] == ['https://some.url.1', 'https://some.url.2'] minimal_document.pop('otherEdition', None) @@ -298,7 +298,7 @@ def test_relations(minimal_document): 'type': 'bf:Urn', 'value': '1.2.3.4' }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['relations'] == [ 'info:eu-repo/semantics/altIdentifier/ark/13030/tf5p30086k', 'info:eu-repo/semantics/altIdentifier/doi/10.1186/2041-1480-3-9', @@ -312,18 +312,18 @@ def test_relations(minimal_document): def test_rights(app, minimal_document, embargo_date): """Test rights serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['rights'] == [] minimal_document['usageAndAccessPolicy'] = {'license': 'CC BY-NC-SA'} - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['rights'] == ['CC BY-NC-SA'] minimal_document['usageAndAccessPolicy'] = { 'license': 'License undefined', 'label': 'Custom license' } - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['rights'] == ['License undefined, Custom license'] minimal_document.pop('usageAndAccessPolicy', None) @@ -332,27 +332,27 @@ def test_rights(app, minimal_document, embargo_date): minimal_document.files['test.pdf'] = BytesIO(b'File content') minimal_document.files['test.pdf']['type'] = 'file' - result = dc_v1.transform_record(minimal_document['pid'], - minimal_document) + result = SonarDublinCoreXMLSerializer()\ + .transform_record(minimal_document) assert result['rights'] == 
['info:eu-repo/semantics/openAccess'] minimal_document.files['test.pdf']['access'] = 'coar:c_16ec' minimal_document.files['test.pdf']['restricted'] = 'full' - result = dc_v1.transform_record(minimal_document['pid'], - minimal_document) + result = SonarDublinCoreXMLSerializer()\ + .transform_record(minimal_document) assert result['rights'] == ['info:eu-repo/semantics/restrictedAccess'] minimal_document.files['test.pdf']['access'] = 'coar:c_f1cf' minimal_document.files['test.pdf'][ 'embargo_date'] = embargo_date.isoformat() - result = dc_v1.transform_record(minimal_document['pid'], - minimal_document) + result = SonarDublinCoreXMLSerializer()\ + .transform_record(minimal_document) assert result['rights'] == ['info:eu-repo/semantics/embargoedAccess'] def test_sources(minimal_document): """Test sources serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['sources'] == [] minimal_document['partOf'] = [{ @@ -376,7 +376,7 @@ def test_sources(minimal_document): 'numberingPages': '135-139', 'numberingIssue': '12' }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['sources'] == [ 'Document 1, 2020', 'Document 2, 2020, vol. 6, no. 12, p. 
135-139', @@ -386,7 +386,7 @@ def test_sources(minimal_document): def test_subjects(minimal_document): """Test subjects serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['subjects'] == [] minimal_document['subjects'] = [{ @@ -400,7 +400,7 @@ def test_subjects(minimal_document): 'value': ['Sujet 1', 'Sujet 2'] } }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['subjects'] == [ 'Subject 1', 'Subject 2', 'Sujet 1', 'Sujet 2' ] @@ -413,7 +413,7 @@ def test_subjects(minimal_document): 'type': 'bf:ClassificationDdc', 'classificationPortion': 'Portion' }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['subjects'] == [ 'info:eu-repo/classification/udc/54', 'info:eu-repo/classification/ddc/Portion' @@ -422,7 +422,7 @@ def test_subjects(minimal_document): def test_titles(minimal_document): """Test titles serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['titles'] == ['Title of the document'] minimal_document['title'] = [{ @@ -436,7 +436,7 @@ def test_titles(minimal_document): 'value': 'Title 2' }] }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['titles'] == ['Title 1'] minimal_document['title'] = [{ @@ -449,15 +449,19 @@ def test_titles(minimal_document): 'value': 'Subtitle 1' }] }] - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert 
result['titles'] == ['Title 1 : Subtitle 1'] def test_types(minimal_document): """Test types serialization.""" - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['types'] == [] + minimal_document['documentType'] = 'advanced_studies_thesis' + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) + assert result['types'] == ['advanced_studies_thesis'] + minimal_document['documentType'] = 'coar:c_2f33' - result = dc_v1.transform_record(minimal_document['pid'], minimal_document) + result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['types'] == ['http://purl.org/coar/resource_type/c_2f33']