diff --git a/.travis.yml b/.travis.yml
index e56ce53d..44bb2248 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -55,7 +55,7 @@ python:
before_install:
- sudo apt-get update
- - sudo apt-get install -y libxml2 libxml2-dev libxmlsec1 libxmlsec1-dev
+ - sudo apt-get install -y libxml2 libxml2-dev libxmlsec1 libxmlsec1-dev xpdf
# Stop default travis services
- "sudo service mysql stop"
- "sudo service postgresql stop"
diff --git a/Dockerfile.base b/Dockerfile.base
index 1127bbe6..8edd2a95 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -24,7 +24,7 @@
FROM inveniosoftware/centos7-python:3.6
-RUN yum -y install libxml2-devel xmlsec1-devel xmlsec1-openssl-devel libtool-ltdl-devel
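+# xpdf is added for the pdf_extractor module (pdftotext-based full-text extraction).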
+RUN yum -y install libxml2-devel xmlsec1-devel xmlsec1-openssl-devel libtool-ltdl-devel xpdf
COPY Pipfile Pipfile.lock ./
RUN pipenv install --deploy --system
diff --git a/MANIFEST.in b/MANIFEST.in
index 39f07bbb..bad5c261 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -50,10 +50,14 @@ recursive-include docs *.txt
recursive-include docs Makefile
recursive-include sonar *.gitkeep
recursive-include sonar *.po *.pot *.mo
-recursive-include sonar *.json *.html *.js *.scss
+recursive-include sonar *.json *.html *.js *.scss *.css
recursive-include sonar *.png *.jpg *.svg
recursive-include docker *.cfg *.conf *.crt *.ini *.key *.pem *.sh
recursive-include tests *.py
+recursive-include tests *.doc
+recursive-include tests *.json
+recursive-include tests *.pdf
+recursive-include tests *.xml
# added by check_manifest.py
include *.html
diff --git a/Pipfile b/Pipfile
index a2a169a3..3503ef88 100644
--- a/Pipfile
+++ b/Pipfile
@@ -16,6 +16,7 @@ lxml = ">=3.5.0,<4.2.6"
orcid = "*"
python-slugify = "*"
python3-saml = "*"
+xmltodict = "*"
[dev-packages]
Flask-Debugtoolbar = ">=0.10.1"
@@ -33,6 +34,7 @@ pytest-mock = ">=1.6.0"
pytest-pep8 = ">=1.0.6"
pytest-random-order = ">=0.5.4"
pytest-runner = ">=3.0.0,<5"
+docutils = "==0.15"
[requires]
python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
index 3eb2625a..05bb12ed 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "01d78308741a5e6ec55acbe92856baf2c168f0291e92e3f6d87867e70f942708"
+ "sha256": "b9570bb6f346e39478da92fb88039c81ba4a011f8b11df3a750f888c94a6ece7"
},
"pipfile-spec": 6,
"requires": {
@@ -81,11 +81,11 @@
},
"beautifulsoup4": {
"hashes": [
- "sha256:034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858",
- "sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
- "sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
+ "sha256:05668158c7b85b791c5abde53e50265e16f98ad601c402ba44d70f96c4159612",
+ "sha256:25288c9e176f354bf277c0a10aa96c782a6a18a17122dba2e8cec4a97e03343b",
+ "sha256:f040590be10520f2ea4c2ae8c3dae441c7cfff5308ec9d58a0ec0c1b8f81d469"
],
- "version": "==4.7.1"
+ "version": "==4.8.0"
},
"billiard": {
"hashes": [
@@ -332,10 +332,10 @@
},
"flask-menu": {
"hashes": [
- "sha256:02adee99456f70fcf4472c5ce04a65a083bce78d39144e4daa38e79a123da7e8",
- "sha256:d54f604b2d93d1d7bb95c78d9188b5855b54af399f8cd9e64b2feabd138608bf"
+ "sha256:7374c3265c34a3fbb1ab5f1df6385f3b10fc0b05c142fd2f39217c9cece4df29",
+ "sha256:c30f767af3c008d3157a86533d20ea2bc7b73f5b5820ddca773584674f26517b"
],
- "version": "==0.7.0"
+ "version": "==0.7.1"
},
"flask-oauthlib": {
"hashes": [
@@ -475,10 +475,10 @@
},
"invenio-app": {
"hashes": [
- "sha256:617b8e80e313e46db3d6476fbd4848423115b8d4cf25505f86928c4c94fdbb4a",
- "sha256:b8e084aa57f9a4f2c6439872339d73364ece15be14f75a5ebc6f869eac4d1610"
+ "sha256:a77aee57118d06909d2187a3e25f3d0a299189e06bb43b4d7404a689119ae75a",
+ "sha256:df15a9ef65758f82f75f8b392456793c2fc36e9984d15113c852a7d8fd2c52dd"
],
- "version": "==1.1.0"
+ "version": "==1.1.1"
},
"invenio-assets": {
"hashes": [
@@ -597,10 +597,10 @@
},
"invenio-records": {
"hashes": [
- "sha256:7228135f6b399e4d9070ec4feded814af58982f6cdb98ea236f9a3ccbb68e650",
- "sha256:a1226e813f4d2592ef8165018b03b1995e12d38d6f3d3086782bc26b98793756"
+ "sha256:6b6704cf3c9e9243ab025114fcf9f0b4fcf517d0ab3ff72db8131c255abe832a",
+ "sha256:87edf393d4004a353f0befc805a0e6b275c68babc2b4d92de73040f8063d6216"
],
- "version": "==1.1.0"
+ "version": "==1.1.1"
},
"invenio-records-rest": {
"hashes": [
@@ -642,10 +642,10 @@
},
"invenio-theme": {
"hashes": [
- "sha256:5296db888ea33d36f4f989696171481f30d1f7901bb3734af94bfc7169211eac",
- "sha256:c1b2c6715a527d5aae94698dbe0040a4551a515dd7f8adddb4e9a190c02d7275"
+ "sha256:b98224b54fd94615d6588d3606c73b3cefa3963cafb8d859bd7c715036ba556a",
+ "sha256:d642c08df6a8af099188a48043aedfe8a44971ebaf67bc8ae21cf1a96eae2916"
],
- "version": "==1.1.3"
+ "version": "==1.1.4"
},
"invenio-userprofiles": {
"hashes": [
@@ -691,10 +691,10 @@
},
"jedi": {
"hashes": [
- "sha256:49ccb782651bb6f7009810d17a3316f8867dde31654c750506970742e18b553d",
- "sha256:79d0f6595f3846dffcbe667cc6dc821b96e5baa8add125176c31a3917eb19d58"
+ "sha256:53c850f1a7d3cfcd306cc513e2450a54bdf5cacd7604b74e42dd1f0758eaaf36",
+ "sha256:e07457174ef7cb2342ff94fa56484fe41cec7ef69b0059f01d3f812379cb6f7c"
],
- "version": "==0.14.0"
+ "version": "==0.14.1"
},
"jinja2": {
"hashes": [
@@ -803,9 +803,9 @@
},
"mako": {
"hashes": [
- "sha256:95ee720cc3453063788515d55bd7ce4a2a77b7b209e4ac70ec5c86091eb02541"
+ "sha256:f5a642d8c5699269ab62a68b296ff990767eb120f51e2e8f3d6afb16bdb57f4b"
],
- "version": "==1.0.13"
+ "version": "==1.0.14"
},
"markupsafe": {
"hashes": [
@@ -887,10 +887,10 @@
},
"parso": {
"hashes": [
- "sha256:5052bb33be034cba784193e74b1cde6ebf29ae8b8c1e4ad94df0c4209bfc4826",
- "sha256:db5881df1643bf3e66c097bfd8935cf03eae73f4cb61ae4433c9ea4fb6613446"
+ "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc",
+ "sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c"
],
- "version": "==0.5.0"
+ "version": "==0.5.1"
},
"passlib": {
"hashes": [
@@ -1141,9 +1141,9 @@
},
"sqlalchemy": {
"hashes": [
- "sha256:c30925d60af95443458ebd7525daf791f55762b106049ae71e18f8dd58084c2f"
+ "sha256:217e7fc52199a05851eee9b6a0883190743c4fb9c8ac4313ccfceaffd852b0ff"
],
- "version": "==1.3.5"
+ "version": "==1.3.6"
},
"sqlalchemy-continuum": {
"hashes": [
@@ -1156,9 +1156,9 @@
"encrypted"
],
"hashes": [
- "sha256:0ebd4d176a5786233db9f2e92040476fcff8b1b426fdbbb7ee4f478280ee9166"
+ "sha256:c037bec2fe2a73b2dfda7b079b02644f20c477fdf6fc4ff0c2141a65ddbee0ee"
],
- "version": "==0.34.0"
+ "version": "==0.34.1"
},
"text-unidecode": {
"hashes": [
@@ -1239,11 +1239,11 @@
},
"webargs": {
"hashes": [
- "sha256:6b81ce44572d4f345104aa41c734fdc01165f054a061a8ebb1b46e89851e1170",
- "sha256:713bd63440ee078ce48ca953d254d51e5f1a6fa0c76fb521fc596306c78d95a5",
- "sha256:e2394ea7e422c1e795681cee5e8b1c6083bab7db6d7a380841130cbbae173d29"
+ "sha256:132216236980316da205a4cfb571913109a07a2e014bcc2313b72d0b83dce507",
+ "sha256:538c9f333f1f7ce06a1eb14b3daf640351057907be71b58d2b5a23c7d6d026be",
+ "sha256:63cecd4dc79f504c31c33a8470624f79f54c1b35a23141cc52c5a3fa37dc674b"
],
- "version": "==5.3.2"
+ "version": "==5.4.0"
},
"webassets": {
"hashes": [
@@ -1260,17 +1260,17 @@
},
"werkzeug": {
"hashes": [
- "sha256:865856ebb55c4dcd0630cdd8f3331a1847a819dda7e8c750d3db6f2aa6c0209c",
- "sha256:a0b915f0815982fb2a09161cb8f31708052d0951c3ba433ccc5e1aa276507ca6"
+ "sha256:87ae4e5b5366da2347eb3116c0e6c681a0e939a33b2805e2c0cbd282664932c4",
+ "sha256:a13b74dd3c45f758d4ebdb224be8f1ab8ef58b3c0ffc1783a8c7d9f4f50227e6"
],
- "version": "==0.15.4"
+ "version": "==0.15.5"
},
"whichcraft": {
"hashes": [
- "sha256:7533870f751901a0ce43c93cc9850186e9eba7fe58c924dfb435968ba9c9fa4e",
- "sha256:fecddd531f237ffc5db8b215409afb18fa30300699064cca4817521b4fc81815"
+ "sha256:0acf1d3aebb5ab16243edbf818024ce21938f06150563041665a23b33eb11dd8",
+ "sha256:d54caa14cc3f7b1d2276f8753fd05f1dc5a554df6f83a36c5c2a551e81de2498"
],
- "version": "==0.5.2"
+ "version": "==0.6.0"
},
"wtforms": {
"hashes": [
@@ -1297,6 +1297,14 @@
],
"version": "==1.3.3"
},
+ "xmltodict": {
+ "hashes": [
+ "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21",
+ "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051"
+ ],
+ "index": "pypi",
+ "version": "==0.12.0"
+ },
"zipp": {
"hashes": [
"sha256:4970c3758f4e89a7857a973b1e2a5d75bcdc47794442f2e2dd4fe8e0466e809a",
@@ -1416,18 +1424,18 @@
},
"docutils": {
"hashes": [
- "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6",
- "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274",
- "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6"
+ "sha256:54a349c622ff31c91cbec43b0b512f113b5b24daf00e2ea530bb1bd9aac14849",
+ "sha256:d2ddba74835cb090a1b627d3de4e7835c628d07ee461f7b4480f51af2fe4d448"
],
- "version": "==0.14"
+ "index": "pypi",
+ "version": "==0.15"
},
"execnet": {
"hashes": [
- "sha256:027ee5d961afa01e97b90d6ccc34b4ed976702bc58e7f092b3c513ea288cb6d2",
- "sha256:752a3786f17416d491f833a29217dda3ea4a471fc5269c492eebcee8cc4772d3"
+ "sha256:64dcdc248d007060f6f6500e7c79a4f87ee692063e3ec51e9bebf30ef2ea21d7",
+ "sha256:dfd10a5663f94d1235e6fbee86bc53e2f89b6f15e031e7e6d9e4bb345bcd7e52"
],
- "version": "==1.6.0"
+ "version": "==1.6.1"
},
"flask": {
"hashes": [
@@ -1537,10 +1545,10 @@
},
"more-itertools": {
"hashes": [
- "sha256:3ad685ff8512bf6dc5a8b82ebf73543999b657eded8c11803d9ba6b648986f4d",
- "sha256:8bb43d1f51ecef60d81854af61a3a880555a14643691cc4b64a6ee269c78f09a"
+ "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832",
+ "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4"
],
- "version": "==7.1.0"
+ "version": "==7.2.0"
},
"packaging": {
"hashes": [
@@ -1586,10 +1594,10 @@
},
"pyparsing": {
"hashes": [
- "sha256:1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a",
- "sha256:9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03"
+ "sha256:43c5486cefefa536c9aab528881c992328f020eefe4f6d06332449c365218580",
+ "sha256:d6c5ffe9d0305b9b977f7a642d36b9370954d1da7ada4c62393382cbadad4265"
],
- "version": "==2.4.0"
+ "version": "==2.4.1.1"
},
"pytest": {
"hashes": [
@@ -1766,10 +1774,10 @@
},
"werkzeug": {
"hashes": [
- "sha256:865856ebb55c4dcd0630cdd8f3331a1847a819dda7e8c750d3db6f2aa6c0209c",
- "sha256:a0b915f0815982fb2a09161cb8f31708052d0951c3ba433ccc5e1aa276507ca6"
+ "sha256:87ae4e5b5366da2347eb3116c0e6c681a0e939a33b2805e2c0cbd282664932c4",
+ "sha256:a13b74dd3c45f758d4ebdb224be8f1ab8ef58b3c0ffc1783a8c7d9f4f50227e6"
],
- "version": "==0.15.4"
+ "version": "==0.15.5"
},
"zipp": {
"hashes": [
diff --git a/docker-compose.full.yml b/docker-compose.full.yml
index 731f5970..0142502a 100644
--- a/docker-compose.full.yml
+++ b/docker-compose.full.yml
@@ -141,5 +141,9 @@ services:
extends:
file: docker-services.yml
service: es
+  grobid:
+    extends:
+      file: docker-services.yml
+      service: grobid
volumes:
static_data:
diff --git a/docker-compose.yml b/docker-compose.yml
index 0a790946..06b339df 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -46,3 +46,7 @@ services:
extends:
file: docker-services.yml
service: es
+  grobid:
+    extends:
+      file: docker-services.yml
+      service: grobid
diff --git a/docker-services.yml b/docker-services.yml
index 2ec306b6..85f20594 100644
--- a/docker-services.yml
+++ b/docker-services.yml
@@ -98,3 +98,8 @@ services:
command: --broker=amqp://guest:guest@mq:5672/ --broker_api=http://guest:guest@mq:15672/api/
ports:
- "5555:5555"
+  grobid:
+    image: lfoppiano/grobid:0.5.5
+    ports:
+      - "8070:8070"
+      - "8071:8071"
diff --git a/docker/nginx/Dockerfile b/docker/nginx/Dockerfile
index dd0a42e5..651ee785 100644
--- a/docker/nginx/Dockerfile
+++ b/docker/nginx/Dockerfile
@@ -21,7 +21,8 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y \
libxml2 \
libxml2-dev \
libxmlsec1 \
- libxmlsec1-dev
+ libxmlsec1-dev \
+ xpdf
COPY nginx.conf /etc/nginx/nginx.conf
COPY conf.d/* /etc/nginx/conf.d/
diff --git a/setup.py b/setup.py
index 92da8b5d..e8b69ff5 100644
--- a/setup.py
+++ b/setup.py
@@ -61,6 +61,11 @@
'documents = sonar.modules.documents.views:blueprint',
'shibboleth_authenticator = \
sonar.modules.shibboleth_authenticator.views.client:blueprint',
+ 'pdf_extractor = \
+ sonar.modules.pdf_extractor.views.client:blueprint'
+ ],
+ 'invenio_base.api_blueprints': [
+ 'pdf_extractor = sonar.modules.pdf_extractor.views.api:blueprint'
],
'invenio_assets.webpack': [
'sonar_theme = sonar.theme.webpack:theme',
@@ -71,6 +76,7 @@
'sonar_documents = sonar.modules.documents.config',
'shibboleth_authenticator = \
sonar.modules.shibboleth_authenticator.config',
+ 'pdf_extractor = sonar.modules.pdf_extractor.config',
],
'invenio_i18n.translations': [
'messages = sonar',
diff --git a/sonar/modules/pdf_extractor/__init__.py b/sonar/modules/pdf_extractor/__init__.py
new file mode 100644
index 00000000..2179bdea
--- /dev/null
+++ b/sonar/modules/pdf_extractor/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""PDF extractor extension."""
+
+from __future__ import absolute_import, print_function
+
+from .ext import PDFExtractor
+
+__all__ = ('PDFExtractor', )
diff --git a/sonar/modules/pdf_extractor/config.py b/sonar/modules/pdf_extractor/config.py
new file mode 100644
index 00000000..2b2e25b5
--- /dev/null
+++ b/sonar/modules/pdf_extractor/config.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""SONAR documents configuration."""
+
+PDF_EXTRACTOR_GROBID_SERVER = 'localhost'
+"""Grobid server."""
+
+PDF_EXTRACTOR_GROBID_PORT = 8070
+"""Grobid port."""
diff --git a/sonar/modules/pdf_extractor/ext.py b/sonar/modules/pdf_extractor/ext.py
new file mode 100644
index 00000000..fc164467
--- /dev/null
+++ b/sonar/modules/pdf_extractor/ext.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""PDF Extractor extension."""
+
+from __future__ import absolute_import, print_function
+
+from . import config
+
+
+class PDFExtractor():
+    """PDF Extractor extension."""
+
+    def __init__(self, app=None):
+        """Extension initialization."""
+        if app:
+            self.init_app(app)
+
+    def init_app(self, app):
+        """Flask application initialization."""
+        self.init_config(app)
+        app.extensions['pdf_extractor'] = self
+
+    @staticmethod
+    def init_config(app):
+        """Initialize configuration.
+
+        Override configuration variables with the values in this package.
+        """
+        for k in dir(config):
+            if k.startswith('PDF_EXTRACTOR'):
+                app.config.setdefault(k, getattr(config, k))
diff --git a/sonar/modules/pdf_extractor/pdf_extractor.py b/sonar/modules/pdf_extractor/pdf_extractor.py
new file mode 100644
index 00000000..174afd1d
--- /dev/null
+++ b/sonar/modules/pdf_extractor/pdf_extractor.py
@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""PDF extractor class."""
+
+import os
+import tempfile
+import xml.etree.ElementTree as ET
+from io import StringIO
+
+import requests
+import xmltodict
+from flask import current_app
+
+
+class PDFExtractor:
+    """PDF extractor class."""
+
+    api_url = ''
+
+    def __init__(self):
+        """Init PDF extractor."""
+        self._load_config()
+
+    def _load_config(self):
+        """Load configuration from extension."""
+        self.api_url = "http://{server}:{port}/api".format(
+            server=current_app.config.get('PDF_EXTRACTOR_GROBID_SERVER'),
+            port=current_app.config.get('PDF_EXTRACTOR_GROBID_PORT'))
+
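+        # Fail fast when the GROBID service (the grobid entry in
+        # docker-services.yml) cannot be reached.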
+        if not self.api_is_alive():
+            raise ConnectionRefusedError
+
+    def api_is_alive(self):
+        """Test if api is up.
+
+        :returns: (bool) Return whether the grobid service is up or not
+        """
+        try:
+            response, status = self.do_request('isalive', 'get')
+        except Exception:
+            return False
+
+        if status != 200:
+            return False
+
+        return bool(response)
+
+    def do_request(self, endpoint, request_type='get', files=None):
+        """Do request on Grobid api.
+
+        :param endpoint: (str) Endpoint of API to query
+        :param request_type: (str) Request type (get or post)
+        :param files: (dict) files to post (Multipart-encoded files)
+        :returns: (tuple) Tuple containing response text and status
+        """
+        url = self.api_url + '/' + endpoint
+
+        if request_type.lower() not in ['get', 'post']:
+            raise ValueError
+
+        if request_type.lower() == 'get':
+            response = requests.get(url)
+            return response.content, response.status_code
+
+        if request_type.lower() == 'post':
+            response = requests.post(url, files=files)
+            return response.text, response.status_code
+
+    def process(self, input_file, output_file=None, dict_output=True):
+        """Process metadata extraction from file.
+
+        :param input_file: (str) Path to PDF file.
+        :param output_file: (str) Output file where to dump extraction.
+        :param dict_output: (bool) Extraction will be formatted in JSON.
+        :returns: (str|dict|None) Metadata extraction, if output file is not
+            None, data will be put into file
+        """
+        output = self.extract_metadata(input_file)
+
+        # Dump xml output into given file
+        if output_file:
+            with open(output_file, 'w') as file:
+                file.write(output)
+            return None
+
+        # Return output as xml
+        if not dict_output:
+            return output
+
+        # Transform xml to dictionary
+        return self.parse_tei_xml(output)
+
+    def process_raw(self, pdf_content, output_file=None, dict_output=True):
+        """Metadata extraction from raw content.
+
+        :param pdf_content: (str) PDF content.
+        :param output_file: (str) Output file where to dump extraction.
+        :param dict_output: (bool) Extraction will be formatted in JSON.
+        :returns: (str|json) Metadata extraction
+        """
+        temp = tempfile.NamedTemporaryFile(mode='w+b', suffix=".pdf")
+        temp.write(pdf_content)
+
+        return self.process(temp.name,
+                            output_file=output_file,
+                            dict_output=dict_output)
+
+    def extract_metadata(self, file):
+        """Process metadata extraction.
+
+        :param file: (str) Path to PDF file.
+        :returns: (str) Extraction metadata as TEI XML
+        """
+        if not os.path.isfile(file):
+            raise ValueError('Input file does not exist')
+
+        if not file.lower().endswith('.pdf'):
+            raise ValueError('Input file is not a valid PDF file')
+
+        response, status = self.do_request('processHeaderDocument',
+                                           'post',
+                                           files={
+                                               'input':
+                                               (file, open(file, 'rb'),
+                                                'application/pdf'),
+                                               'consolidateHeader':
+                                               '1'
+                                           })
+
+        if status != 200:
+            raise Exception('Metadata extraction failed')
+
+        return response
+
+    @staticmethod
+    def parse_tei_xml(xml):
+        """Parse xml content."""
+        iterator = ET.iterparse(StringIO(xml))
+        for _, element in iterator:
+            if '}' in element.tag:
+                element.tag = element.tag.split('}', 1)[1]
+        root = iterator.root
+
+        # Convert the cleaned element tree into a dictionary
+        result = xmltodict.parse(ET.tostring(root, encoding='unicode'))
+        result = result['TEI']
+
+        return result
diff --git a/sonar/modules/pdf_extractor/static/pdf_extractor/test.css b/sonar/modules/pdf_extractor/static/pdf_extractor/test.css
new file mode 100644
index 00000000..c172ae6a
--- /dev/null
+++ b/sonar/modules/pdf_extractor/static/pdf_extractor/test.css
@@ -0,0 +1,39 @@
+/*
+ * Swiss Open Access Repository
+ * Copyright (C) 2019 RERO
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+pre {
+ white-space: pre-wrap;
+}
+
+.string {
+ color: green;
+}
+
+.number {
+ color: darkorange;
+}
+
+.boolean {
+ color: blue;
+}
+
+.null {
+ color: magenta;
+}
+
+.key {
+ color: red;
+}
\ No newline at end of file
diff --git a/sonar/modules/pdf_extractor/static/pdf_extractor/test.js b/sonar/modules/pdf_extractor/static/pdf_extractor/test.js
new file mode 100644
index 00000000..338bf750
--- /dev/null
+++ b/sonar/modules/pdf_extractor/static/pdf_extractor/test.js
@@ -0,0 +1,78 @@
+/*
+ * Swiss Open Access Repository
+ * Copyright (C) 2019 RERO
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+function syntaxHighlight(json) {
+ json = json.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+ return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
+ var cls = 'number';
+ if (/^"/.test(match)) {
+ if (/:$/.test(match)) {
+ cls = 'key';
+ } else {
+ cls = 'string';
+ }
+ } else if (/true|false/.test(match)) {
+ cls = 'boolean';
+ } else if (/null/.test(match)) {
+ cls = 'null';
+ }
+ return '<span class="' + cls + '">' + match + '</span>';
+ });
+}
+
+$(document).ready(function () {
+ var context = 'metadata'
+
+ $('#metadata').click(function() {
+ context = 'metadata'
+ })
+
+ $('#fulltext').click(function() {
+ context = 'full-text'
+ })
+
+ $('#pdfForm').submit(function () {
+ var file_data = $('#file').prop('files')[0];
+ var form_data = new FormData();
+ form_data.append('file', file_data);
+
+ $('#loading').removeClass('d-none')
+
+ $.ajax({
+ url: '/api/pdf-extractor/' + context,
+ dataType: 'json',
+ cache: false,
+ contentType: false,
+ processData: false,
+ data: form_data,
+ type: 'post',
+ success: function (data) {
+ var json = JSON.stringify(data, null, 2)
+
+ $('#loading').addClass('d-none')
+ $('#error').addClass('d-none')
+ $('#result').removeClass('d-none').html(syntaxHighlight(json))
+ },
+ error: function (data) {
+ $('#loading').addClass('d-none')
+ $('#result').addClass('d-none')
+ $('#error').removeClass('d-none').text(data.responseJSON.error)
+ }
+ });
+
+ return false;
+ })
+})
\ No newline at end of file
diff --git a/sonar/modules/pdf_extractor/templates/test.html b/sonar/modules/pdf_extractor/templates/test.html
new file mode 100644
index 00000000..c8515a23
--- /dev/null
+++ b/sonar/modules/pdf_extractor/templates/test.html
@@ -0,0 +1,40 @@
+{# -*- coding: utf-8 -*-
+ Swiss Open Access Repository
+ Copyright (C) 2019 RERO
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, version 3 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+#}
+
+{%- extends config.RECORDS_UI_BASE_TEMPLATE %}
+
+{%- block page_body %}
+
+PDF metadata extraction
+
+{%- endblock %}
+
+{%- block javascript %}
+{{ super() }}
+
+{%- endblock javascript %}
+
+{% block css %}
+{{ super() }}
+
+{% endblock %}
\ No newline at end of file
diff --git a/sonar/modules/pdf_extractor/views/__init__.py b/sonar/modules/pdf_extractor/views/__init__.py
new file mode 100644
index 00000000..55ea9305
--- /dev/null
+++ b/sonar/modules/pdf_extractor/views/__init__.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""PDF extractor views."""
diff --git a/sonar/modules/pdf_extractor/views/api.py b/sonar/modules/pdf_extractor/views/api.py
new file mode 100644
index 00000000..3ca1f285
--- /dev/null
+++ b/sonar/modules/pdf_extractor/views/api.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""API Views."""
+
+from __future__ import absolute_import, print_function
+
+import re
+import subprocess
+import tempfile
+
+from flask import Blueprint, jsonify, request
+
+from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
+
+blueprint = Blueprint('pdf_extractor',
+                      __name__,
+                      template_folder='templates',
+                      static_folder='static',
+                      url_prefix='/pdf-extractor')
+
+
+@blueprint.route('/metadata', methods=['POST'])
+def metadata():
+    """Extract PDF metadata and return as a json object."""
+    try:
+        if 'file' not in request.files:
+            raise Exception('File not found')
+
+        # Get the file posted
+        pdf_file = request.files['file']
+
+        pdf_extractor = PDFExtractor()
+
+        # Extract metadata from PDF
+        return jsonify(pdf_extractor.process_raw(pdf_file.read()))
+    except Exception as exception:
+        return jsonify(dict(error=str(exception))), 400
+
+
+@blueprint.route('/full-text', methods=['POST'])
+def full_text():
+    """Extract the full text of a PDF and return it as a json object."""
+    try:
+        if 'file' not in request.files:
+            raise Exception('File not found')
+
+        # Get the file posted
+        pdf_file = request.files['file']
+
+        # Temporary file path
+        file = tempfile.gettempdir() + '/' + pdf_file.filename
+
+        # Temporarily store file content
+        pdf_file.save(file)
+
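+        # xpdf is installed (Dockerfile.base, .travis.yml) so that the
+        # pdftotext command is available; '-' writes the extracted text
+        # to stdout.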
+        # Process pdf text extraction
+        text = subprocess.check_output(
+            'pdftotext -enc UTF-8 {file} -'.format(file=file), shell=True)
+        text = text.decode('utf_8')
+
+        # Replace line breaks and form feeds with spaces
+        text = re.sub('[\r\n\f]+', ' ', text)
+
+        return jsonify(text=text)
+    except Exception as exception:
+        return jsonify(dict(error=str(exception))), 400
diff --git a/sonar/modules/pdf_extractor/views/client.py b/sonar/modules/pdf_extractor/views/client.py
new file mode 100644
index 00000000..bbe8c0b1
--- /dev/null
+++ b/sonar/modules/pdf_extractor/views/client.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""API Views."""
+
+from __future__ import absolute_import, print_function
+
+from flask import Blueprint, render_template
+from flask_login import login_required
+
+blueprint = Blueprint(
+    'pdf',
+    __name__,
+    static_folder='../static',
+    template_folder='../templates',
+    url_prefix='/pdf-extractor'
+)
+
+
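+# This view only renders a page for manually trying the extractor; the actual
+# extraction endpoints are exposed by the API blueprint (views/api.py).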
+@blueprint.route('/test', methods=['GET'])
+@login_required
+def test():
+ """Test upload file and extracting metadata."""
+ return render_template('test.html')
diff --git a/tests/conftest.py b/tests/conftest.py
index e20226cf..d447ba29 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,8 @@
"""Common pytest fixtures and plugins."""
import pytest
+from flask import url_for
+from flask_security.utils import encrypt_password
@pytest.fixture(scope='module', autouse=True)
@@ -40,3 +42,24 @@ def app_config(app_config):
)))
return app_config
+
+
+@pytest.fixture()
+def create_user(app):
+    """Create user in database."""
+    datastore = app.extensions['security'].datastore
+    datastore.create_user(email='john.doe@test.com',
+                          password=encrypt_password('123456'),
+                          active=True)
+    datastore.commit()
+
+
+@pytest.fixture()
+def logged_user_client(create_user, client):
+    """Log in user."""
+    response = client.post(url_for('security.login'),
+                           data=dict(email='john.doe@test.com',
+                                     password='123456'))
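+    # Flask-Security redirects (302) on successful login.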
+    assert response.status_code == 302
+
+    return client
diff --git a/tests/ui/pdf_extractor/conftest.py b/tests/ui/pdf_extractor/conftest.py
new file mode 100644
index 00000000..d142cca5
--- /dev/null
+++ b/tests/ui/pdf_extractor/conftest.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Pytest fixtures and plugins for PDF extractor tests."""
+
+from __future__ import absolute_import, print_function
+
+import os
+
+import pytest
+import requests
+
+
+@pytest.fixture(scope='function')
+def mock_grobid_response(monkeypatch):
+    """Mock a grobid response for full text extraction."""
+    with open(
+            os.path.dirname(os.path.abspath(__file__)) + '/data/postprint.xml',
+            'r') as file:
+        output = file.read()
+
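+    # Mimic the attributes of requests.Response that PDFExtractor.do_request
+    # reads for POST requests: status_code and text.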
+    class MockResponse:
+        """Mock response."""
+
+        status_code = 200
+        text = output
+
+    monkeypatch.setattr(requests, 'post', lambda *args, **kwargs: MockResponse)
+
+
+@pytest.fixture(scope='function')
+def mock_grobid_error_response(monkeypatch):
+    """Mock a grobid response with a failed status code."""
+    class MockResponse:
+        """Mock response."""
+
+        status_code = 503
+        text = ''
+
+    monkeypatch.setattr(requests, 'post', lambda *args, **kwargs: MockResponse)
+
+
+@pytest.fixture(scope='module')
+def pdf_file():
+    """Return test PDF file path."""
+    return os.path.dirname(os.path.abspath(__file__)) + '/data/postprint.pdf'
+
+
+@pytest.fixture(scope='module')
+def xml_file():
+    """Return test XML output file path."""
+    return os.path.dirname(os.path.abspath(__file__)) + '/data/postprint.xml'
diff --git a/tests/ui/pdf_extractor/data/postprint.pdf b/tests/ui/pdf_extractor/data/postprint.pdf
new file mode 100755
index 00000000..38572d1a
Binary files /dev/null and b/tests/ui/pdf_extractor/data/postprint.pdf differ
diff --git a/tests/ui/pdf_extractor/data/postprint.xml b/tests/ui/pdf_extractor/data/postprint.xml
new file mode 100644
index 00000000..f6011343
--- /dev/null
+++ b/tests/ui/pdf_extractor/data/postprint.xml
@@ -0,0 +1,94 @@
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+ High-harmonic generation in quantum spin systems
+
+
+
+
+ 2019
+
+
+
+
+
+ ShintaroTakayoshi
+
+ Max Planck Institute for the Physics of Complex Systems
+
+ 01187
+ Dresden
+ Germany
+
+
+
+ Department of Quantum Matter Physics
+ University of Geneva
+
+ 1211
+ Geneva
+ Switzerland
+
+
+
+
+ YutaMurakami
+
+ Department of Physics
+ University of Fribourg
+
+ 1700
+ Fribourg
+ Switzerland
+
+
+
+
+ PhilippWerner
+
+ Department of Physics
+ University of Fribourg
+
+ 1700
+ Fribourg
+ Switzerland
+
+
+
+ High-harmonic generation in quantum spin systems
+
+
+ PHYSICAL REVIEW B
+
+ 99
+ 184303
+ 2019
+
+
+ 10.1103/PhysRevB.99.184303
+ (Received 22 January 2019; revised manuscript received 8 April 2019; published 20 May 2019)
+ Editors' Suggestion
+
+
+
+
+
+
We theoretically study the high-harmonic generation (HHG) in one-dimensional spin systems. While in electronic systems the driving by ac electric fields produces radiation from the dynamics of excited charges, we consider here the situation where spin systems excited by a magnetic field pulse generate radiation via a time-dependent magnetization. Specifically, we study the magnetic dipole radiation in two types of ferromagnetic spin chain models, the Ising model with static longitudinal field and the XXZ model, and reveal the structure of the spin HHG and its relation to spin excitations. For weak laser amplitude, a peak structure appears which can be explained by time-dependent perturbation theory. With increasing amplitude, plateaus with well-defined cutoff energies emerge. In the Ising model with longitudinal field, the thresholds of the multiple plateaus in the radiation spectra can be explained by the annihilation of multiple magnons. In the XXZ model, which retains the Z 2 symmetry, the laser magnetic field can induce a phase transition of the ground state when it exceeds a critical value, which results in a drastic change of the spin excitation character. As a consequence, the first cutoff energy in the HHG spectrum changes from a single-magnon to a two-magnon energy at this transition. Our results demonstrate the possibility of generating high-harmonic radiation from magnetically ordered materials and the usefulness of high-harmonic signals for extracting information on the spin excitation spectrum.
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/ui/pdf_extractor/data/preprint.pdf b/tests/ui/pdf_extractor/data/preprint.pdf
new file mode 100755
index 00000000..1e9ff7dd
Binary files /dev/null and b/tests/ui/pdf_extractor/data/preprint.pdf differ
diff --git a/tests/ui/pdf_extractor/data/preprint.xml b/tests/ui/pdf_extractor/data/preprint.xml
new file mode 100644
index 00000000..30ad4654
--- /dev/null
+++ b/tests/ui/pdf_extractor/data/preprint.xml
@@ -0,0 +1,112 @@
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+ Contextualized Ranking of Entity Types Based on Knowledge Graphs
+
+
+
+
+
+
+
+
+
+ AlbertoTonon
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+
+ MicheleCatasta
+
+ EPFL
+
+ Lausanne
+ Switzerland
+
+
+
+
+ RomanProkofyev
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+
+ GianlucaDemartini
+
+ Information School
+ University of Sheffield
+
+ UK
+
+
+
+
+ KarlAberer
+
+ EPFL
+
+ Lausanne
+ Switzerland
+
+
+
+
+ PhilippeCudré-Mauroux
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+ Contextualized Ranking of Entity Types Based on Knowledge Graphs
+
+
+
+
+
+
+
+
+
+
+
+
+ Entity typing
+ Ranking
+ Context
+ Crowdsourcing
+ Knowledge Graphs
+
+
+
+
A large fraction of online queries target entities. For this reason, Search Engine Result Pages (SERPs) increasingly contain information about the searched entities such as pictures, short summaries, related entities, and factual information. A key facet that is often displayed on the SERPs and that is instrumental for many applications is the entity type. However, an entity is usually not associated to a single generic type in the background knowledge graph but rather to a set of more specific types, which may be relevant or not given the document context. For example, one can find on the Linked Open Data cloud the fact that Tom Hanks is a person, an actor, and a person from Concord, California. All these types are correct but some may be too general to be interesting (e.g., person), while other may be interesting but already known to the user (e.g., actor), or may be irrelevant given the current browsing context (e.g., person from Concord, California). In this paper, we define the new task of ranking entity types given an entity and its context. We propose and evaluate new methods to find the most relevant entity type based on collection statistics and on the knowledge graph structure interconnecting entities and types. An extensive experimental evaluation over several document collections at different levels of granularity (e.g., sentences, paragraphs) and different type hierarchies (including DBpedia, Freebase, and schema.org) shows that hierarchy-based approaches provide more accurate results when picking entity types to be displayed to the end-user.
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/ui/pdf_extractor/data/preprint_conf.pdf b/tests/ui/pdf_extractor/data/preprint_conf.pdf
new file mode 100755
index 00000000..d2048e2b
Binary files /dev/null and b/tests/ui/pdf_extractor/data/preprint_conf.pdf differ
diff --git a/tests/ui/pdf_extractor/data/preprint_conf.xml b/tests/ui/pdf_extractor/data/preprint_conf.xml
new file mode 100644
index 00000000..fa152cae
--- /dev/null
+++ b/tests/ui/pdf_extractor/data/preprint_conf.xml
@@ -0,0 +1,92 @@
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+ SwissLink: High-Precision, Context-Free Entity Linking Exploiting Unambiguous Labels
+
+
+
+
+
+
+
+
+
+ RomanProkofyev
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+
+ MichaelLuggen
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+
+ DjellelEddineDifallah
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+
+ PhilippeCudré-Mauroux
+
+ eXascale Infolab
+ University of Fribourg
+
+ Switzerland
+
+
+
+ SwissLink: High-Precision, Context-Free Entity Linking Exploiting Unambiguous Labels
+
+
+
+
+
+
+ 10.1145/nnnnnnn.nnnnnnn
+ ACM Reference format: Roman Prokofyev, Michael Luggen, Djellel Eddine Difallah, and Philippe Cudré-Mauroux. 2017. SwissLink: High-Precision, Context-Free Entity Link-ing Exploiting Unambiguous Labels. In Proceedings of SEMANTiCS, , 2017, 8 pages. https://doi.org/10.1145/nnnnnnn.nnnnnnn
+
+
+
+
+
+
+ Entity Linking
+ Manual annotations
+ Machine learning
+
+
+
+
Webpages are an abundant source of textual information with manually annotated entity links, and are often used as a source of training data for a wide variety of machine learning NLP tasks. However, manual annotations such as those found on Wikipedia are sparse, noisy, and biased towards popular entities. Existing entity linking systems deal with those issues by relying on simple statistics extracted from the data. While such statistics can effectively deal with noisy annotations, they introduce bias towards head entities and are ineffective for long tail (e.g., unpopular) entities. In this work, we first analyze statistical properties linked to manual annotations by studying a large annotated corpus composed of all English Wikipedia webpages, in addition to all pages from the Com-monCrawl containing English Wikipedia annotations. We then propose and evaluate a series of entity linking approaches, with the explicit goal of creating highly-accurate (precision > 95%) and broad annotated corpuses for machine learning tasks. Our results show that our best approach achieves maximal-precision at usable recall levels, and outperforms both state-of-the-art entity-linking systems and human annotators.
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/ui/pdf_extractor/data/test.doc b/tests/ui/pdf_extractor/data/test.doc
new file mode 100755
index 00000000..e69de29b
diff --git a/tests/ui/pdf_extractor/data/thesis.pdf b/tests/ui/pdf_extractor/data/thesis.pdf
new file mode 100755
index 00000000..ceb75186
Binary files /dev/null and b/tests/ui/pdf_extractor/data/thesis.pdf differ
diff --git a/tests/ui/pdf_extractor/data/thesis.xml b/tests/ui/pdf_extractor/data/thesis.xml
new file mode 100644
index 00000000..1c7931f1
--- /dev/null
+++ b/tests/ui/pdf_extractor/data/thesis.xml
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+ Three essays in labor economics
+
+
+
+
+
+
+
+
+
+ ElenaGentili
+
+ Faculty of Economics Università della Svizzera Italiana
+ External member
+ Queen Mary University of London
+
+ London, UK)
+
+
+
+
+ ProfFrancescoFasani
+
+ Faculty of Economics Università della Svizzera Italiana
+ External member
+ Queen Mary University of London
+
+ London, UK)
+
+
+
+ Three essays in labor economics
+
+
+
+
+
+
+ Thesis submitted for the degree of Doctor of Philosophy in Economics Lugano, January 2019
+
+
+
+
+
+
+ Doctoral committee: Prof Fabrizio Mazzonna
+ Università della Svizzera Italiana (Lugano
+ CH)
+ Supervisor Prof Giovanni Pica
+ Università della Svizzera Italiana (Lugano
+ CH)
+ Internal memeber
+
+
+
+
+
+
+
+
diff --git a/tests/ui/pdf_extractor/test_pdf_extractor.py b/tests/ui/pdf_extractor/test_pdf_extractor.py
new file mode 100644
index 00000000..66a9fbcf
--- /dev/null
+++ b/tests/ui/pdf_extractor/test_pdf_extractor.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test PDF extractor class."""
+
+import os
+import tempfile
+
+import pytest
+
+from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
+
+
+def test_load_config(app, monkeypatch):
+    """Test configuration loading."""
+    monkeypatch.setattr(PDFExtractor, 'api_is_alive', lambda *args: False)
+    with pytest.raises(ConnectionRefusedError):
+        PDFExtractor()
+
+
+def test_process(app, mock_grobid_response, pdf_file):
+    """Test process method."""
+    pdf_extractor = PDFExtractor()
+
+    # Test output as XML
+    output = pdf_extractor.process(pdf_file, dict_output=False)
+    assert output.startswith('<?xml')
+
+    # Test output as dictionary
+    output = pdf_extractor.process(pdf_file, dict_output=True)
+    assert 'teiHeader' in output
+
+    # Test output as XML in a file
+    with tempfile.NamedTemporaryFile(mode='w+b', suffix=".pdf") as temp_file:
+        pdf_extractor.process(pdf_file, output_file=temp_file.name)
+
+        with open(temp_file.name, 'r') as output_file:
+            output = output_file.read()
+            assert output.startswith('<?xml')
+
+
+def test_process_raw(app, mock_grobid_response, pdf_file):
+    """Test process with raw content."""
+    with open(pdf_file, 'rb') as file:
+        content = file.read()
+
+    pdf_extractor = PDFExtractor()
+    output = pdf_extractor.process_raw(content)
+    assert 'teiHeader' in output
+
+
+def test_extract_metadata(app, mock_grobid_response, pdf_file):
+    """Test metadata extraction."""
+    pdf_extractor = PDFExtractor()
+
+    # Test valid extraction
+    output = pdf_extractor.extract_metadata(pdf_file)
+    assert output.startswith('<?xml')
+
+    # Test non-existing file
+    with pytest.raises(ValueError) as exception:
+        pdf_extractor.extract_metadata('not_existing_file.pdf')
+    assert str(exception.value) == 'Input file does not exist'
+
+    # Test non valid pdf
+    input_file = os.path.dirname(os.path.abspath(__file__)) + '/data/test.doc'
+    with pytest.raises(ValueError) as exception:
+        pdf_extractor.extract_metadata(input_file)
+    assert str(exception.value) == 'Input file is not a valid PDF file'
+
+
+def test_extract_metadata_api_error_response(app, mock_grobid_error_response,
+                                              pdf_file):
+    """Test metadata extraction with error on API."""
+    pdf_extractor = PDFExtractor()
+
+    # Test error on api during extraction
+    with pytest.raises(Exception) as exception:
+        pdf_extractor.extract_metadata(pdf_file)
+    assert str(exception.value) == 'Metadata extraction failed'
+
+
+def test_api_is_alive(app, monkeypatch):
+    """Test if api is alive."""
+    pdf_extractor = PDFExtractor()
+
+    # Test API is up
+    assert pdf_extractor.api_is_alive()
+
+    # Test API is down
+    monkeypatch.setattr(PDFExtractor, 'do_request', lambda *args: ('', 503))
+    assert not pdf_extractor.api_is_alive()
+
+    # Test API raising an exception
+    monkeypatch.setattr(PDFExtractor, 'do_request', lambda *args: Exception)
+    assert not pdf_extractor.api_is_alive()
+
+
+def test_do_request(app, mock_grobid_response, pdf_file):
+    """Test request to API."""
+    pdf_extractor = PDFExtractor()
+    # Test valid call
+    assert pdf_extractor.do_request('isalive', 'get') == (b'true', 200)
+
+    # Test non-existing endpoint
+    assert pdf_extractor.do_request('unexisting', 'get')[1] == 404
+
+    # Test invalid request type
+    with pytest.raises(ValueError):
+        pdf_extractor.do_request('isalive', 'invalid')
+
+    # Test post request
+    assert pdf_extractor.do_request(
+        'processFulltextDocument',
+        'post',
+        files={'input': (pdf_file, open(pdf_file,
+                                        'rb'), 'application/pdf')})[1] == 200
diff --git a/tests/ui/pdf_extractor/test_pdf_extractor_ext.py b/tests/ui/pdf_extractor/test_pdf_extractor_ext.py
new file mode 100644
index 00000000..50c7e0ff
--- /dev/null
+++ b/tests/ui/pdf_extractor/test_pdf_extractor_ext.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test PDF extractor extension."""
+
+from sonar.modules.pdf_extractor.ext import PDFExtractor
+
+
+def test_ext(app):
+ """Test PDF extractor extension."""
+ ext = PDFExtractor(app)
+ assert isinstance(ext, PDFExtractor)
diff --git a/tests/ui/pdf_extractor/test_pdf_extractor_views_api.py b/tests/ui/pdf_extractor/test_pdf_extractor_views_api.py
new file mode 100644
index 00000000..dfc17f0d
--- /dev/null
+++ b/tests/ui/pdf_extractor/test_pdf_extractor_views_api.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test api views for PDF extractor."""
+
+import json
+from io import BytesIO
+
+from invenio_app.factory import create_api
+
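+# These tests target endpoints registered as API blueprints (see setup.py), so
+# the Invenio API application factory is used to build the test application.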
+create_app = create_api
+
+
+def test_metadata(client, pdf_file, mock_grobid_response):
+    """Test metadata extraction."""
+    response = client.post('/pdf-extractor/metadata')
+    assert response.status_code == 400
+
+    with open(pdf_file, 'rb') as file:
+        content = file.read()
+
+    data = dict(file=(BytesIO(content), 'test.pdf'))
+
+    response = client.post('/pdf-extractor/metadata',
+                           data=data,
+                           content_type='multipart/form-data')
+    assert response.status_code == 200
+    result = json.loads(response.data)
+    assert 'teiHeader' in result
+
+    title = result['teiHeader']['fileDesc']['titleStmt']['title']['#text']
+    assert title[:10] == 'High-harmo'
+
+
+def test_full_text(client, pdf_file):
+    """Test full text extraction."""
+    response = client.post('/pdf-extractor/full-text')
+    assert response.status_code == 400
+
+    with open(pdf_file, 'rb') as file:
+        content = file.read()
+
+    data = dict(file=(BytesIO(content), 'test.pdf'))
+
+    response = client.post('/pdf-extractor/full-text',
+                           data=data,
+                           content_type='multipart/form-data')
+    assert response.status_code == 200
+    result = json.loads(response.data)
+    assert 'text' in result
diff --git a/tests/ui/pdf_extractor/test_pdf_extractor_views_client.py b/tests/ui/pdf_extractor/test_pdf_extractor_views_client.py
new file mode 100644
index 00000000..b423e84c
--- /dev/null
+++ b/tests/ui/pdf_extractor/test_pdf_extractor_views_client.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+#
+# Swiss Open Access Repository
+# Copyright (C) 2019 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Test client views for PDF extractor."""
+
+
+def test_test(logged_user_client):
+ """Test the test page."""
+ response = logged_user_client.get('/pdf-extractor/test')
+ assert response.status_code == 200
+ assert 'PDF metadata extraction' in str(response.data)