From e870ec408df805f4c95b224fe9ea829449ff3fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Se=CC=81bastien=20De=CC=81le=CC=80ze?= Date: Mon, 1 Feb 2021 13:23:52 +0100 Subject: [PATCH] documents: add metadata in detail view. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adds JSON-LD data for schema.org. * Adds citation metadata for Google Scholar. * Adds title and meta description. * Creates a base serializer schema and moves some functions into it. * Creates a serializer for schema.org metadata. * Creates a serializer for Google scholar metadata. * Closes #98. * Closes #99. Co-Authored-by: Sébastien Délèze --- .../modules/documents/serializers/__init__.py | 12 + .../documents/serializers/google_scholar.py | 53 ++ .../documents/serializers/schemaorg.py | 35 ++ .../documents/serializers/schemas/__init__.py | 18 + .../serializers/schemas/base_schema.py | 157 ++++++ .../documents/serializers/schemas/dc.py | 49 +- .../serializers/schemas/google_scholar.py | 95 ++++ .../serializers/schemas/schemaorg.py | 175 +++++++ .../documents/templates/documents/record.html | 16 +- sonar/modules/documents/views.py | 18 +- .../serializers/test_google_scholar_schema.py | 405 +++++++++++++++ .../serializers/test_schemaorg_schema.py | 463 ++++++++++++++++++ 12 files changed, 1449 insertions(+), 47 deletions(-) create mode 100644 sonar/modules/documents/serializers/google_scholar.py create mode 100644 sonar/modules/documents/serializers/schemaorg.py create mode 100644 sonar/modules/documents/serializers/schemas/__init__.py create mode 100644 sonar/modules/documents/serializers/schemas/base_schema.py create mode 100644 sonar/modules/documents/serializers/schemas/google_scholar.py create mode 100644 sonar/modules/documents/serializers/schemas/schemaorg.py create mode 100644 tests/unit/documents/serializers/test_google_scholar_schema.py create mode 100644 tests/unit/documents/serializers/test_schemaorg_schema.py diff --git 
a/sonar/modules/documents/serializers/__init__.py b/sonar/modules/documents/serializers/__init__.py index d0e18c0f..295ae8d1 100644 --- a/sonar/modules/documents/serializers/__init__.py +++ b/sonar/modules/documents/serializers/__init__.py @@ -26,7 +26,14 @@ search_responsify from sonar.modules.documents.serializers.dc import SonarDublinCoreSerializer +from sonar.modules.documents.serializers.google_scholar import \ + SonarGoogleScholarSerializer +from sonar.modules.documents.serializers.schemaorg import \ + SonarSchemaOrgSerializer from sonar.modules.documents.serializers.schemas.dc import DublinCoreV1 +from sonar.modules.documents.serializers.schemas.google_scholar import \ + GoogleScholarV1 +from sonar.modules.documents.serializers.schemas.schemaorg import SchemaOrgV1 from sonar.modules.organisations.api import OrganisationRecord from sonar.modules.serializers import JSONSerializer as _JSONSerializer from sonar.modules.users.api import current_user_record @@ -76,6 +83,11 @@ def post_process_serialize_search(self, results, pid_fetcher): json_v1 = JSONSerializer(DocumentSchemaV1) #: Dublin Core serializer dc_v1 = SonarDublinCoreSerializer(DublinCoreV1, replace_refs=True) +#: schema.org serializer +schemaorg_v1 = SonarSchemaOrgSerializer(SchemaOrgV1, replace_refs=True) +#: google scholar serializer +google_scholar_v1 = SonarGoogleScholarSerializer(GoogleScholarV1, + replace_refs=True) # Records-REST serializers # ======================== diff --git a/sonar/modules/documents/serializers/google_scholar.py b/sonar/modules/documents/serializers/google_scholar.py new file mode 100644 index 00000000..567928e0 --- /dev/null +++ b/sonar/modules/documents/serializers/google_scholar.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2019 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, 
class SonarGoogleScholarSerializer(SerializerMixinInterface, MarshmallowMixin,
                                   PreprocessorMixin):
    """Google scholar serializer.

    Dumps a record with the configured marshmallow schema and renders every
    dumped key as a ``<meta name="citation_<key>">`` HTML tag.
    """

    def dump(self, obj, context=None):
        """Serialize object with schema.

        Mandatory to override this method, as invenio-records-rest does not
        use the right way to dump objects (compatible with marshmallow 3.9).

        :param obj: Object to serialize.
        :param context: Context handed over to the schema.
        :returns: Data dumped by the schema.
        """
        return self.schema_class(context=context).dump(obj)

    def transform_record(self, pid, record, links_factory=None, **kwargs):
        """Transform record in metas for Google scholar.

        A list value gives one tag per element, any other value gives a
        single tag. Values are HTML-escaped, as the returned markup is meant
        to be embedded as-is in a page.

        :param pid: Persistent identifier of the record.
        :param record: Record to transform.
        :param links_factory: Links factory passed to the parent class.
        :returns: Meta tags as a string, one tag per line.
        """
        # Imported here so that the method does not rely on a module-level
        # import being added elsewhere.
        from html import escape

        data = super(SonarGoogleScholarSerializer,
                     self).transform_record(pid, record, links_factory,
                                            **kwargs)

        meta_template = '<meta name="citation_{key}" content="{value}">'

        metas = []
        for key, value in data.items():
            # Handle scalar and list values the same way.
            values = value if isinstance(value, list) else [value]
            for item in values:
                metas.append(
                    meta_template.format(key=key,
                                         value=escape(str(item), quote=True)))

        return '\n'.join(metas)
class SonarSchemaOrgSerializer(SerializerMixinInterface, MarshmallowMixin,
                               PreprocessorMixin):
    """Serializer producing schema.org data through a marshmallow schema."""

    def dump(self, obj, context=None):
        """Dump the object with a new instance of the schema class.

        The method has to be overridden, because invenio-records-rest does
        not dump objects in a way which is compatible with marshmallow 3.9.

        :param obj: Object to serialize.
        :param context: Context handed over to the schema.
        :returns: Data dumped by the schema.
        """
        schema = self.schema_class(context=context)
        return schema.dump(obj)
class BaseSchema(Schema):
    """Base schema for marshmallow serialization.

    Holds the pre-dump hook and the getters shared by the schemas inheriting
    from it. Every getter receives the item being dumped, a dict exposing the
    record under the ``metadata`` key.
    """

    @pre_dump
    def pre_dump(self, item, **kwargs):
        """Do some transformations in record before dumping it.

        - Store the main file to use it in methods.
        - Check if files must point to an external URL.
        - Populate restrictions, thumbnail and URL in files.

        The item is modified in place. Nothing is done when the record has no
        files.

        :param item: Item object to process
        :returns: Modified item
        """
        if not item['metadata'].get('_files'):
            return item

        # Store the main file
        main_file = self.get_main_file(item)
        if main_file:
            item['metadata']['mainFile'] = main_file

        # Check if organisation record forces to point file to an external url
        item['metadata']['external_url'] = has_external_urls_for_files(
            item['metadata'])

        # Add restriction, link and thumbnail to files
        populate_files_properties(item['metadata'])

        return item

    def get_main_file(self, obj):
        """Return the main file.

        The main file is the entry of type ``file`` having the lowest
        ``order``; an entry without order counts as 100.

        :param obj: Record dict.
        :returns: Main file or None.
        """
        files = [
            item for item in obj['metadata'].get('_files', [])
            if item.get('type') == 'file'
        ]
        # min() returns the first of several equally ordered files, which is
        # what taking the head of a stable sort gives.
        return min(files,
                   key=lambda item: item.get('order', 100),
                   default=None)

    def get_id(self, obj):
        """Get id.

        :param obj: Record dict.
        :returns: Permanent link built from the request host URL and the PID.
        """
        return DocumentRecord.get_permanent_link(request.host_url,
                                                 obj['metadata']['pid'])

    def get_title(self, obj):
        """Get title.

        :param obj: Record dict.
        :returns: Value of the first main title found, or None.
        """
        for title in obj['metadata'].get('title', []):
            # A title without main title is skipped instead of raising.
            main_titles = title.get('mainTitle') or []
            if main_titles and main_titles[0].get('value'):
                return main_titles[0]['value']

        return None

    def get_start_date(self, obj):
        """Get start date.

        :param obj: Record dict.
        :returns: Start date of the first publication statement having one,
            or None.
        """
        for provision_activity in obj['metadata'].get('provisionActivity', []):
            if provision_activity.get(
                    'type') == 'bf:Publication' and provision_activity.get(
                        'startDate'):
                return provision_activity['startDate']

        return None

    def get_keywords(self, obj):
        """Get keywords.

        :param obj: Record dict.
        :returns: Flat list of the labels of all subjects.
        """
        items = []

        for subject in obj['metadata'].get('subjects', []):
            values = subject.get('label', {}).get('value') or []
            # A bare string is one keyword, it must not be split in letters.
            if isinstance(values, str):
                values = [values]
            items.extend(values)

        return items

    def get_url(self, obj):
        """Get url.

        The download link, prefixed with the request host URL, wins over the
        external link.

        :param obj: Record dict.
        :returns: URL of the main file or None.
        """
        links = obj['metadata'].get('mainFile', {}).get('links')
        if not links:
            return None

        if links.get('download'):
            return '{host}{image}'.format(host=request.host_url.rstrip('/'),
                                          image=links['download'])

        if links.get('external'):
            return links['external']

        return None

    def get_pages(self, obj):
        """Get pages.

        :param obj: Record dict.
        :returns: Pages stored in partOf
        """
        for part_of in obj['metadata'].get('partOf', []):
            if part_of.get('numberingPages'):
                return part_of['numberingPages']

        return None

    def get_first_page(self, obj):
        """Get the first page.

        :param obj: Record dict.
        :returns: The digits starting the pages statement, or None.
        """
        pages = self.get_pages(obj)
        if not pages:
            return None

        matches = re.match(r'^([0-9]+)', pages)
        return matches.group(1) if matches else None

    def get_last_page(self, obj):
        """Get the last page.

        :param obj: Record dict.
        :returns: The digits following the first hyphen when the pages
            statement starts with a range like ``12-34``, else None.
        """
        pages = self.get_pages(obj)
        if not pages:
            return None

        matches = re.match(r'^[0-9]+\-([0-9]+)', pages)
        return matches.group(1) if matches else None
in record before dumping it. - - - Store the main file to use it in methods. - - Check if files must point to an external URL. - - Populate restrictions, thumbnail and URL in files. - - :param item: Item object to process - :returns: Modified item - """ - if not item['metadata'].get('_files'): - return item - - # Store the main file - main_file = self.get_main_file(item) - if main_file: - item['metadata']['mainFile'] = main_file - - # Check if organisation record forces to point file to an external url - item['metadata']['external_url'] = has_external_urls_for_files( - item['metadata']) - - # Add restriction, link and thumbnail to files - populate_files_properties(item['metadata']) - - return item - def get_contributors(self, obj): """Get contributors.""" items = [] @@ -307,16 +279,3 @@ def format_contributor(self, contributor): data += ' ({info})'.format(info=' : '.join(info)) return data - - def get_main_file(self, obj): - """Return the main file. - - :param obj: Record dict. - :returns: Main file or None. - """ - files = [ - file for file in obj['metadata'].get('_files', []) - if file.get('type') == 'file' - ] - files = sorted(files, key=lambda file: file.get('order', 100)) - return files[0] if files else None diff --git a/sonar/modules/documents/serializers/schemas/google_scholar.py b/sonar/modules/documents/serializers/schemas/google_scholar.py new file mode 100644 index 00000000..63a88865 --- /dev/null +++ b/sonar/modules/documents/serializers/schemas/google_scholar.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2019 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. 
class GoogleScholarV1(BaseSchema):
    """Marshmallow schema for Google scholar.

    Each field name is the suffix of a ``citation_`` meta tag, empty values
    are dropped after the dump.
    """

    title = fields.Method('get_title')
    language = fields.Method('get_language')
    publication_date = fields.Method('get_start_date')
    keywords = fields.Method('get_keywords')
    pdf_url = fields.Method('get_url')
    online_date = fields.Method('get_start_date')
    author = fields.Method('get_author')
    doi = fields.Method('get_doi')
    abstract_html_url = fields.Method('get_id')
    pages = fields.Method('get_pages')
    firstpage = fields.Method('get_first_page')
    lastpage = fields.Method('get_last_page')
    volume = fields.Method('get_volume')
    journal_title = fields.Method('get_host_document_title')

    def get_language(self, obj):
        """Get language.

        :param obj: Record dict.
        :returns: First language of the record, converted from its
            bibliographic code, or None.
        """
        for language in obj['metadata'].get('language', []):
            return get_language_from_bibliographic_code(language['value'])

        return None

    def get_keywords(self, obj):
        """Get keywords.

        :param obj: Record dict.
        :returns: Keywords of the parent schema joined with `` ; ``.
        """
        return ' ; '.join(super(GoogleScholarV1, self).get_keywords(obj))

    def get_author(self, obj):
        """Get authors.

        :param obj: Record dict.
        :returns: Preferred names of the contributors whose first role is
            ``cre``.
        """
        items = []
        for contributor in obj['metadata'].get('contribution', []):
            # Contributions lacking role or agent are ignored, not fatal.
            roles = contributor.get('role') or []
            name = contributor.get('agent', {}).get('preferred_name')
            if roles and roles[0] == 'cre' and name:
                items.append(name)

        return items

    def get_doi(self, obj):
        """Get DOI.

        :param obj: Record dict.
        :returns: Value of the first identifier of type ``bf:Doi`` having a
            value, or None.
        """
        for identifier in obj['metadata'].get('identifiedBy', []):
            if identifier.get('type') == 'bf:Doi' and identifier.get('value'):
                return identifier['value']

        return None

    def get_volume(self, obj):
        """Get volume.

        :param obj: Record dict.
        :returns: First volume numbering found in partOf, or None.
        """
        for part_of in obj['metadata'].get('partOf', []):
            if part_of.get('numberingVolume'):
                return part_of['numberingVolume']

        return None

    def get_host_document_title(self, obj):
        """Get the title of the host document.

        :param obj: Record dict.
        :returns: Title of the first host document found in partOf, or None.
        """
        for part_of in obj['metadata'].get('partOf', []):
            if part_of.get('document', {}).get('title'):
                return part_of['document']['title']

        return None

    @post_dump
    def remove_empty_values(self, data, **kwargs):
        """Remove empty values before dumping data.

        :param data: Dumped data.
        :returns: Data without the keys having a falsy value.
        """
        return {key: value for key, value in data.items() if value}
class SchemaOrgV1(BaseSchema):
    """Marshmallow schema producing schema.org data for a document."""

    type_ = fields.Method('get_type', data_key='@type')
    context_ = fields.Constant('http://schema.org', data_key='@context')
    id_ = fields.Method('get_id', data_key='@id')
    name = fields.Method('get_title')
    abstract = fields.Method('get_abstract')
    description = fields.Method('get_abstract')
    inLanguage = fields.Method('get_in_language')
    creator = fields.Method('get_creator')
    headline = fields.Method('get_title')
    datePublished = fields.Method('get_start_date')
    url = fields.Method('get_file_urls')
    keywords = fields.Method('get_keywords')
    identifier = fields.Method('get_id')
    license = fields.Method('get_license')
    image = fields.Method('get_image')
    pagination = fields.Method('get_pages')
    pageStart = fields.Method('get_first_page')
    pageEnd = fields.Method('get_last_page')

    def get_type(self, obj):
        """Return the schema.org type matching the document type.

        :param obj: Record dict.
        :returns: Mapped type, ``CreativeWork`` when there is no mapping.
        """
        document_type = obj['metadata'].get('documentType')
        if not document_type:
            return 'CreativeWork'

        return TYPE_MAPPING.get(document_type) or 'CreativeWork'

    def get_abstract(self, obj):
        """Return the value of the first abstract.

        :param obj: Record dict.
        :returns: Abstract text or None.
        """
        abstracts = obj['metadata'].get('abstracts', [])
        return next((entry['value'] for entry in abstracts), None)

    def get_in_language(self, obj):
        """Return the value of the first language.

        :param obj: Record dict.
        :returns: Language value or None.
        """
        languages = obj['metadata'].get('language', [])
        return next((entry['value'] for entry in languages), None)

    def get_creator(self, obj):
        """Return the creators as schema.org persons.

        :param obj: Record dict.
        :returns: One ``Person`` dict for each contribution whose first role
            is ``cre`` and whose agent has a preferred name.
        """
        return [{
            '@type': 'Person',
            'name': entry['agent']['preferred_name']
        } for entry in obj['metadata'].get('contribution', [])
                if entry['role'][0] == 'cre'
                and entry['agent'].get('preferred_name')]

    def get_license(self, obj):
        """Return the license, followed by its label when there is one.

        :param obj: Record dict.
        :returns: Comma separated string or None.
        """
        policy = obj['metadata'].get('usageAndAccessPolicy')
        if not policy:
            return None

        parts = [policy['license']]
        if policy.get('label'):
            parts.append(policy['label'])

        return ', '.join(parts)

    def get_image(self, obj):
        """Return the absolute URL of the thumbnail of the main file.

        :param obj: Record dict.
        :returns: URL prefixed with the request host URL, or None.
        """
        thumbnail = obj['metadata'].get('mainFile', {}).get('thumbnail')
        if not thumbnail:
            return None

        return '{host}{image}'.format(host=request.host_url.rstrip('/'),
                                      image=thumbnail)

    def get_file_urls(self, obj):
        """Return the URLs of the files of the record.

        :param obj: Record dict.
        :returns: Download link (prefixed with the request host URL) and
            external link of every entry of type ``file`` having links.
        """
        urls = []

        for entry in obj['metadata'].get('_files', []):
            if entry.get('type') != 'file' or not entry.get('links'):
                continue

            links = entry['links']

            if links.get('download'):
                urls.append('{host}{image}'.format(
                    host=request.host_url.rstrip('/'),
                    image=links['download']))

            if links.get('external'):
                urls.append(links['external'])

        return urls

    @post_dump
    def remove_empty_values(self, data, **kwargs):
        """Drop the keys having a falsy value from the dumped data.

        :param data: Dumped data.
        :returns: Filtered copy of the data.
        """
        cleaned = {}
        for key, value in data.items():
            if value:
                cleaned[key] = value

        return cleaned
diff --git a/sonar/modules/documents/views.py b/sonar/modules/documents/views.py index cac46b65..6549e6d9 100644 --- a/sonar/modules/documents/views.py +++ b/sonar/modules/documents/views.py @@ -19,6 +19,8 @@ from __future__ import absolute_import, print_function +import json + from flask import Blueprint, abort, current_app, g, render_template, request from flask_babelex import gettext as _ from invenio_i18n.ext import current_i18n @@ -96,12 +98,26 @@ def detail(pid_value, view='global'): populate_files_properties(record) + # Import is here to avoid a circular reference error. + from sonar.modules.documents.serializers import google_scholar_v1, \ + schemaorg_v1 + + # Get schema org data + schema_org_data = json.dumps( + schemaorg_v1.transform_record(record['pid'], record)) + + # Get scholar data + google_scholar_data = google_scholar_v1.transform_record( + record['pid'], record) + # Resolve $ref properties record = record.replace_refs() return render_template('documents/record.html', pid=pid_value, - record=record) + record=record, + schema_org_data=schema_org_data, + google_scholar_data=google_scholar_data) @blueprint.route('/projects/') diff --git a/tests/unit/documents/serializers/test_google_scholar_schema.py b/tests/unit/documents/serializers/test_google_scholar_schema.py new file mode 100644 index 00000000..601299f1 --- /dev/null +++ b/tests/unit/documents/serializers/test_google_scholar_schema.py @@ -0,0 +1,405 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2019 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
@pytest.fixture()
def minimal_document(db, bucket_location, organisation):
    """Create and commit a document having only a PID, a title and an
    organisation."""
    main_title = {'language': 'eng', 'value': 'Title of the document'}
    data = {
        'pid': '1000',
        'title': [{
            'type': 'bf:Title',
            'mainTitle': [main_title]
        }],
        'organisation': [{
            '$ref': 'https://sonar.ch/api/organisations/org'
        }]
    }

    document = DocumentRecord.create(data, dbcommit=True, with_bucket=True)
    document.commit()
    db.session.commit()

    return document
def test_keywords(minimal_document):
    """Test subjects serialization.

    Checks that no keywords meta is produced without subjects, then
    serializes a record holding two subject entries.
    """
    # No subjects in the record
    result = google_scholar_v1.transform_record(minimal_document['pid'],
                                                minimal_document)
    assert 'citation_keywords' not in result

    # Two subject entries, one per language, each with two labels
    minimal_document['subjects'] = [{
        'label': {
            'language': 'eng',
            'value': ['Subject 1', 'Subject 2']
        }
    }, {
        'label': {
            'language': 'fre',
            'value': ['Sujet 1', 'Sujet 2']
        }
    }]
    result = google_scholar_v1.transform_record(minimal_document['pid'],
                                                minimal_document)
    # NOTE(review): the expected literal is empty and an empty string is
    # contained in any string, so this assertion cannot fail. Presumably the
    # expected meta tag text was lost; restore it from the repository.
    assert '' in result
minimal_document.files['test.pdf'][ + 'external_url'] = 'https://some.domain/file.pdf' + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert '' in result + + +def test_authors(minimal_document, contributors): + """Test authors serialization.""" + minimal_document.update({'contribution': contributors}) + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + + for author in ['Creator 1', 'Creator 2']: + assert ''.format( + author=author) in result + + +def test_doi(minimal_document): + """Test DOI serialization.""" + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_doi' not in result + + minimal_document['identifiedBy'] = [{'type': 'bf:Doi', 'value': '111111'}] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert '' in result + + +def test_abstract_html_url(minimal_document): + """Test HTML URL serialization.""" + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert '' in result + + +def test_pages(app, minimal_document): + """Test pages.""" + # No part of + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_pages' not in result + + # No pages + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_pages' not in result + + # OK + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123-125' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert '' in result + + +def test_first_page(app, minimal_document): + """Test first page.""" + # No partOf + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 
'citation_firstpage' not in result + + # No pages + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_firstpage' not in result + + # Only one page + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_firstpage' in result + + # Set of pages + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123-130' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_firstpage' in result + + # Exotic formatting: a first page is still expected + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123, 134-135' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_firstpage' in result + + # Page start not found + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': 'pages' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_firstpage' not in result + + +def test_last_page(app, minimal_document): + """Test last page.""" + # No partOf + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_lastpage' not in result + + # No pages + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_lastpage' not in result + + # Only one page + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_lastpage' not in result
+ + # Set of pages + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123-130' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_lastpage' in result + + # Exotic formatting: no last page is expected + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123, 134-135' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_lastpage' not in result + + # Page end not found + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': 'pages' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_lastpage' not in result + + +def test_volume(app, minimal_document): + """Test volume.""" + # No partOf + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_volume' not in result + + # No volume + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_volume' not in result + + # With a volume + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingVolume': '1' + }] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_volume' in result + + +def test_journal_title(app, minimal_document): + """Test journal title.""" + # No partOf + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_journal_title' not in result + + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = google_scholar_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'citation_journal_title' in \ + result diff --git a/tests/unit/documents/serializers/test_schemaorg_schema.py
b/tests/unit/documents/serializers/test_schemaorg_schema.py new file mode 100644 index 00000000..b642dcdb --- /dev/null +++ b/tests/unit/documents/serializers/test_schemaorg_schema.py @@ -0,0 +1,463 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2019 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see https://www.gnu.org/licenses/. + +"""Test schema.org marshmallow schema.""" + +from io import BytesIO + +import pytest + +from sonar.modules.documents.api import DocumentRecord +from sonar.modules.documents.serializers import schemaorg_v1 + + +@pytest.fixture() +def minimal_document(db, bucket_location, organisation): + record = DocumentRecord.create( + { + 'pid': + '1000', + 'title': [{ + 'type': + 'bf:Title', + 'mainTitle': [{ + 'language': 'eng', + 'value': 'Title of the document' + }] + }], + 'organisation': [{ + '$ref': 'https://sonar.ch/api/organisations/org' + }] + }, + dbcommit=True, + with_bucket=True) + record.commit() + db.session.commit() + return record + + +@pytest.fixture() +def contributors(): + return [{ + 'agent': { + 'preferred_name': 'Creator 1' + }, + 'role': ['cre'], + }, { + 'agent': { + 'preferred_name': 'Creator 2', + 'number': '123', + 'date': '2019', + 'place': 'Martigny' + }, + 'role': ['cre'], + }, { + 'agent': { + 'preferred_name': 'Contributor 1' + }, + 'role': ['ctb'], + }, { + 'agent': { + 'preferred_name': 'Contributor 2', + 'number': '999', + 'date': '2010', + 'place': 'Sion' + }, + 'role': ['ctb'],
+ }, { + 'agent': { + 'preferred_name': 'Degree supervisor' + }, + 'role': ['dgs'], + }, { + 'agent': { + 'preferred_name': 'Printer' + }, + 'role': ['prt'], + }, { + 'agent': { + 'preferred_name': 'Editor' + }, + 'role': ['edt'], + }] + + +def test_type(minimal_document): + """Test @type serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['@type'] == 'CreativeWork' + + minimal_document['documentType'] = 'coar:c_2f33' + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['@type'] == 'Book' + + +def test_context(minimal_document): + """Test @context serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['@context'] == 'http://schema.org' + + +def test_abstract(minimal_document): + """Test abstract serialization.""" + # No abstract + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'abstract' not in result + + # Take the first abstract + minimal_document['abstracts'] = [{ + 'value': 'Description 1' + }, { + 'value': 'Description 2' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['abstract'] == 'Description 1' + + +def test_description(minimal_document): + """Test description serialization (taken from the abstracts).""" + # No abstract + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'description' not in result + + # Take the first abstract + minimal_document['abstracts'] = [{ + 'value': 'Description 1' + }, { + 'value': 'Description 2' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['description'] == 'Description 1' + + +def test_in_language(minimal_document): + """Test inLanguage serialization.""" + # No language + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'inLanguage' not in
result + + # Take the first language + minimal_document['language'] = [{'value': 'eng'}, {'value': 'fre'}] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['inLanguage'] == 'eng' + + +def test_name(minimal_document): + """Test name serialization (taken from the title).""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['name'] == 'Title of the document' + + # No title + minimal_document.pop('title', None) + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'name' not in result + + +def test_headline(minimal_document): + """Test headline serialization (taken from the title).""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['headline'] == 'Title of the document' + + +def test_creator(minimal_document, contributors): + """Test creator serialization.""" + minimal_document.update({'contribution': contributors}) + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['creator'] == [{ + '@type': 'Person', + 'name': 'Creator 1' + }, { + '@type': 'Person', + 'name': 'Creator 2' + }] + + +def test_date_published(minimal_document): + """Test date published serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'datePublished' not in result + + minimal_document.update({ + 'provisionActivity': [{ + 'type': 'bf:Agent', + 'startDate': '2019' + }, { + 'type': 'bf:Publication', + }, { + 'type': 'bf:Publication', + 'startDate': '2019' + }, { + 'type': 'bf:Publication', + 'startDate': '2020-01-01' + }] + }) + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['datePublished'] == '2019' + + +def test_url(minimal_document): + """Test URL serialization.""" + minimal_document.files['test.pdf'] = BytesIO(b'File content') + minimal_document.files['test.pdf']['type'] = 'file' + result =
schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['url'] == ['http://localhost/documents/1000/files/test.pdf'] + + # External file: the forced external URL replaces the local one + minimal_document.files['test.pdf']['force_external_url'] = True + minimal_document.files['test.pdf'][ + 'external_url'] = 'https://some.domain/file.pdf' + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['url'] == ['https://some.domain/file.pdf'] + + # Multiple files + minimal_document.files['test2.pdf'] = BytesIO(b'File content') + minimal_document.files['test2.pdf']['type'] = 'file' + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['url'] == [ + 'https://some.domain/file.pdf', + 'http://localhost/documents/1000/files/test2.pdf' + ] + + +def test_identifier(minimal_document): + """Test identifier serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['identifier'] == 'http://localhost/global/documents/1000' + + +def test_id(minimal_document): + """Test @id serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['@id'] == 'http://localhost/global/documents/1000' + + +def test_keywords(minimal_document): + """Test keywords serialization (built from subjects).""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'keywords' not in result + + minimal_document['subjects'] = [{ + 'label': { + 'language': 'eng', + 'value': ['Subject 1', 'Subject 2'] + } + }, { + 'label': { + 'language': 'fre', + 'value': ['Sujet 1', 'Sujet 2'] + } + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['keywords'] == [ + 'Subject 1', 'Subject 2', 'Sujet 1', 'Sujet 2' + ] + + +def test_license(app, minimal_document): + """Test license serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], +
minimal_document) + assert 'license' not in result + + minimal_document['usageAndAccessPolicy'] = {'license': 'CC BY-NC-SA'} + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['license'] == 'CC BY-NC-SA' + + minimal_document['usageAndAccessPolicy'] = { + 'license': 'Other OA / license undefined', + 'label': 'Custom license' + } + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['license'] == 'Other OA / license undefined, Custom license' + + +def test_image(app, minimal_document): + """Test image serialization.""" + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'image' not in result + + minimal_document.files['test.pdf'] = BytesIO(b'File content') + minimal_document.files['test.pdf']['type'] = 'file' + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['image'] == 'http://localhoststatic/images/no-image.png' + + +def test_pagination(app, minimal_document): + """Test pagination serialization.""" + # No partOf + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pagination' not in result + + # No pages + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pagination' not in result + + # OK + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123-125' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['pagination'] == '123-125' + + +def test_page_start(app, minimal_document): + """Test page start serialization.""" + # No partOf + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageStart' not in result + + # No pages + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] +
result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageStart' not in result + + # Only one page + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['pageStart'] == '123' + + # Set of pages + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123-130' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['pageStart'] == '123' + + # Exotic formatting: the leading number is the start page + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123, 134-135' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['pageStart'] == '123' + + # Page start not found + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': 'pages' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageStart' not in result + + +def test_page_end(app, minimal_document): + """Test page end serialization.""" + # No partOf + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageEnd' not in result + + # No pages + minimal_document['partOf'] = [{'document': {'title': 'Host document'}}] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageEnd' not in result + + # Only one page: there is no end page + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageEnd' not in result + + # Set of pages + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123-130' + }] + result =
schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert result['pageEnd'] == '130' + + # Exotic formatting: no end page is extracted + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': '123, 134-135' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageEnd' not in result + + # Page end not found + minimal_document['partOf'] = [{ + 'document': { + 'title': 'Host document' + }, + 'numberingPages': 'pages' + }] + result = schemaorg_v1.transform_record(minimal_document['pid'], + minimal_document) + assert 'pageEnd' not in result