deposit: extract metadata from PDF
* Extracts metadata from the PDF using GROBID and stores it in the file's metadata.
* Changes property types for journal data.
* Installs pycountry.
* Closes #87.

Co-Authored-by: Sébastien Délèze <[email protected]>
Sébastien Délèze committed Dec 20, 2019
1 parent 1b3f1fd commit 5fd915d
Showing 8 changed files with 399 additions and 13 deletions.
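
Taken together, the changes below wire the GROBID-based PDF extractor into the deposit workflow. A minimal sketch of the resulting flow (names taken from the diff; it assumes a GROBID service is reachable behind PDFExtractor):

from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
from sonar.modules.pdf_extractor.utils import format_extracted_data


def extract_pdf_metadata(pdf_bytes):
    """Run GROBID on raw PDF bytes and normalize its TEI output."""
    pdf_extractor = PDFExtractor()
    # process_raw() is expected to return the TEI result parsed into a
    # dict (xmltodict is already a dependency); format_extracted_data()
    # flattens it into title / languages / authors / abstract / journal.
    return format_extracted_data(pdf_extractor.process_raw(pdf_bytes))
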
1 change: 1 addition & 0 deletions Pipfile
@@ -18,6 +18,7 @@ python-slugify = "*"
python3-saml = "*"
xmltodict = "*"
marshmallow = "<=3.0.0b6"
pycountry = "*"

[dev-packages]
Flask-Debugtoolbar = ">=0.10.1"
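
pycountry is added to translate the two-letter language codes reported by GROBID into the three-letter bibliographic codes the repository stores. A small illustration of the mapping used in sonar/modules/pdf_extractor/utils.py below:

import pycountry

# GROBID reports ISO 639-1 codes such as 'de'; the repository stores
# ISO 639-2/B (bibliographic) codes such as 'ger'.
language = pycountry.languages.get(alpha_2='de')
# Languages whose bibliographic (639-2/B) and terminology (639-2/T) codes
# differ carry a 'bibliographic' attribute; otherwise alpha_3 is used.
code = language.bibliographic if hasattr(language, 'bibliographic') \
    else language.alpha_3
assert code == 'ger'
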
9 changes: 8 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default.

36 changes: 31 additions & 5 deletions sonar/modules/deposits/api.py
@@ -51,11 +51,7 @@ class DepositRecord(SonarRecord):
schema = 'deposit'

@classmethod
-def create(cls,
-           data,
-           id_=None,
-           dbcommit=False,
-           with_bucket=True,
+def create(cls, data, id_=None, dbcommit=False, with_bucket=True,
**kwargs):
"""Create deposit record."""
record = super(DepositRecord, cls).create(data,
@@ -64,3 +60,33 @@ def create(cls,
with_bucket=with_bucket,
**kwargs)
return record

def populate_with_pdf_metadata(self, pdf_metadata, default_title=None):
"""Update data for record."""
self['metadata'] = {}

if 'title' in pdf_metadata:
self['metadata']['title'] = pdf_metadata['title']
else:
self['metadata']['title'] = default_title

if 'languages' in pdf_metadata:
self['metadata']['languages'] = pdf_metadata['languages']

if 'authors' in pdf_metadata:
if 'contributors' not in self:
self['contributors'] = []

for author in pdf_metadata['authors']:
self['contributors'].append({'name': author['name']})

if 'abstract' in pdf_metadata:
if 'abstracts' not in self['metadata']:
self['metadata']['abstracts'] = []

self['metadata']['abstracts'].append(pdf_metadata['abstract'])

if 'journal' in pdf_metadata:
self['metadata']['journal'] = pdf_metadata['journal']

return self
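
A hypothetical usage of the new method, mirroring the call that is still commented out in rest.py below (deposit stands for an existing DepositRecord; the metadata values are illustrative):

pdf_metadata = {
    'title': 'Calibrated Ice Thickness Estimate for All Glaciers in Austria',
    'languages': ['eng'],
    'authors': [{'name': 'Kay Helfricht'}],
}
deposit.populate_with_pdf_metadata(pdf_metadata, default_title='Deposit #1')
# The title comes from the PDF when present, the default otherwise;
# authors are appended to 'contributors' outside of 'metadata'.
assert deposit['metadata']['title'] == pdf_metadata['title']
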
(JSON schema for deposits; file header not rendered)
@@ -46,7 +46,7 @@
},
"metadata": {
"type": "object",
"required": ["document_type", "languages", "title"],
"required": ["languages", "title"],
"propertiesOrder": [ "document_type", "languages", "title", "publication_date", "journal", "abstracts", "etc" ],
"properties": {
"document_type": {
@@ -125,12 +125,12 @@
"volume": {
"title": "Volume",
"description": "Volume of the journal.",
"type": "integer"
"type": "string"
},
"number": {
"title": "Number",
"description": "Number of the journal.",
"type": "integer"
"type": "string"
},
"pages": {
"title": "Pages",
@@ -154,6 +154,7 @@
},
"etc": {
"title": "ETC.",
"default": "",
"type": "string"
}
}
@@ -276,4 +277,4 @@
}
}
}
-}
\ No newline at end of file
+}
19 changes: 16 additions & 3 deletions sonar/modules/deposits/rest.py
@@ -27,6 +27,8 @@
from invenio_rest import ContentNegotiatedMethodView

from sonar.modules.deposits.api import DepositRecord
from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
from sonar.modules.pdf_extractor.utils import format_extracted_data


class FilesResource(ContentNegotiatedMethodView):
@@ -63,16 +65,28 @@ def post(pid=None):
# deposit.files[text_key]['file_type'] = 'full-text'
# deposit.commit()

+file_content = BytesIO(request.get_data())
+
# Store document
-deposit.files[key] = BytesIO(request.get_data())
+deposit.files[key] = file_content
deposit.files[key]['label'] = re.search(r'(.*)\..*$', key).group(1)
deposit.files[key]['embargo'] = False
deposit.files[key]['embargoDate'] = None
deposit.files[key]['expect'] = False
deposit.files[key]['category'] = request.args['type']
deposit.files[key]['file_type'] = 'file'
deposit.commit()

# Extract data from pdf and populate deposit
if request.args['type'] == 'main':
pdf_extractor = PDFExtractor()
pdf_metadata = format_extracted_data(
pdf_extractor.process_raw(request.get_data()))

# deposit.populate_with_pdf_metadata(
# pdf_metadata, "Deposit #{pid}".format(pid=pid))
deposit.files[key]['pdf_metadata'] = pdf_metadata

deposit.commit()
db.session.commit()

return make_response(jsonify(deposit.files[key].dumps()))
@@ -106,7 +120,6 @@ def put(pid=None, key=None):
files_view = FilesResource.as_view('files')
file_view = FileResource.as_view('file')


blueprint = Blueprint('deposits', __name__, url_prefix='/deposits/<pid>/')
blueprint.add_url_rule('/custom-files/<key>', view_func=file_view)
blueprint.add_url_rule('/custom-files', view_func=files_view)
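
For reference, a client-side sketch of uploading a main file through this blueprint. The host, the requests dependency, and passing the file key as a query parameter are assumptions (the hunk above does not show where key comes from); the route and the type argument come from the code above, and type=main is what triggers the GROBID extraction:

import requests

with open('article.pdf', 'rb') as pdf_file:
    response = requests.post(
        'http://localhost:5000/deposits/1/custom-files',
        # 'main' is the category that triggers metadata extraction
        params={'key': 'article.pdf', 'type': 'main'},
        data=pdf_file.read())

# The response dumps the stored file, including the extracted metadata.
print(response.json().get('pdf_metadata', {}).get('title'))
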
73 changes: 73 additions & 0 deletions sonar/modules/pdf_extractor/utils.py
@@ -21,6 +21,8 @@
import subprocess
import tempfile

import pycountry


def extract_text_from_content(content):
"""Extract full-text from content which will be stored in a temporary file.
@@ -44,3 +46,74 @@ def extract_text_from_file(file):
text = re.sub('[\r\n\f]+', ' ', text)

return text


def format_extracted_data(data):
"""Format the extracted metadata from PDF."""
formatted_data = {}
if '#text' in data['teiHeader']['fileDesc']['titleStmt']['title']:
formatted_data['title'] = data['teiHeader']['fileDesc']['titleStmt'][
'title']['#text']

if data['text']['@xml:lang']:
language = pycountry.languages.get(alpha_2=data['text']['@xml:lang'])
if language:
if hasattr(language, 'bibliographic'):
formatted_data['languages'] = [language.bibliographic]
else:
formatted_data['languages'] = [language.alpha_3]

if 'analytic' in data['teiHeader']['fileDesc']['sourceDesc'][
'biblStruct'] and data['teiHeader']['fileDesc']['sourceDesc'][
'biblStruct']['analytic'] and 'author' in data['teiHeader'][
'fileDesc']['sourceDesc']['biblStruct']['analytic']:
authors = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'analytic']['author']
if not isinstance(authors, list):
authors = [authors]

formatted_data['authors'] = []
for author in authors:
if 'persName' in author:
new_author = {}

if 'surname' in author['persName']:
new_author['name'] = author['persName']['surname']

if not isinstance(author['persName']['forename'], list):
author['persName']['forename'] = [
author['persName']['forename']
]

for forename in author['persName']['forename']:
new_author[
'name'] = forename['#text'] + ' ' + new_author['name']

formatted_data['authors'].append(new_author)

if data['teiHeader']['fileDesc']['sourceDesc']['biblStruct']['monogr'][
'imprint']:
imprint = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'monogr']['imprint']
if 'publisher' in imprint:
formatted_data['journal'] = {'name': imprint['publisher']}

if not isinstance(imprint['biblScope'], list):
imprint['biblScope'] = [imprint['biblScope']]

for item in imprint['biblScope']:
if item['@unit'] in ['page', 'volume', 'number']:
key = item['@unit']
if key == 'page':
key = 'pages'

formatted_data['journal'][
key] = item['#text'] if '#text' in item else item[
'@from'] + '-' + item['@to']

if 'abstract' in data['teiHeader']['profileDesc'] and data['teiHeader'][
'profileDesc']['abstract']:
formatted_data['abstract'] = data['teiHeader']['profileDesc'][
'abstract']['p']

return formatted_data
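
To make the mapping concrete, here is a minimal TEI-like dict (its shape follows tests/ui/pdf_extractor/data/extracted_data.json below, with illustrative values) and the result format_extracted_data produces for it:

tei = {
    'teiHeader': {
        'fileDesc': {
            'titleStmt': {'title': {'#text': 'A title'}},
            'sourceDesc': {'biblStruct': {
                'analytic': {'author': {'persName': {
                    'forename': {'#text': 'Ada'}, 'surname': 'Lovelace'}}},
                'monogr': {'imprint': {
                    'publisher': 'Frontiers Media SA',
                    'biblScope': {'@unit': 'volume', '#text': '7'}}}}},
        },
        'profileDesc': {'abstract': {'p': 'An abstract.'}},
    },
    'text': {'@xml:lang': 'en'},
}

assert format_extracted_data(tei) == {
    'title': 'A title',
    'languages': ['eng'],  # 'en' has no separate 639-2/B code, so alpha_3
    'authors': [{'name': 'Ada Lovelace'}],
    'journal': {'name': 'Frontiers Media SA', 'volume': '7'},
    'abstract': 'An abstract.',
}
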
205 changes: 205 additions & 0 deletions tests/ui/pdf_extractor/data/extracted_data.json
@@ -0,0 +1,205 @@
{
"@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
"@xsi:schemaLocation": "http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd",
"teiHeader": {
"@xml:lang": "en",
"encodingDesc": {
"appInfo": {
"application": {
"@ident": "GROBID",
"@version": "0.5.5",
"@when": "2019-12-11T07:17+0000",
"ref": {
"#text": "GROBID - A machine learning software for extracting information from scholarly documents",
"@target": "https://github.com/kermitt2/grobid"
}
}
}
},
"fileDesc": {
"publicationStmt": {
"availability": {
"@status": "unknown",
"p": "Copyright Frontiers Media SA"
},
"date": {
"#text": "April 2019",
"@type": "published",
"@when": "2019-04"
},
"publisher": "Frontiers Media SA"
},
"sourceDesc": {
"biblStruct": {
"analytic": {
"author": [
{
"persName": {
"forename": {
"#text": "Kay",
"@type": "first"
},
"surname": "Helfricht"
}
},
{
"persName": {
"forename": {
"#text": "Matthias",
"@type": "first"
},
"surname": "Huss"
}
},
{
"persName": {
"forename": {
"#text": "Andrea",
"@type": "first"
},
"surname": "Fischer"
}
},
{
"persName": {
"forename": {
"#text": "Jan-Christoph",
"@type": "first"
},
"surname": "Otto"
}
},
{
"affiliation": {
"@key": "aff0",
"address": {
"country": {
"#text": "Germany",
"@key": "DE"
},
"settlement": "Nuremberg"
},
"orgName": {
"#text": "University of Erlangen",
"@type": "institution"
}
}
},
{
"affiliation": {
"@key": "aff1",
"address": {
"country": {
"#text": "United States",
"@key": "US"
}
},
"orgName": {
"#text": "University of Alaska System",
"@type": "institution"
}
}
},
{
"affiliation": {
"@key": "aff2",
"address": {
"country": {
"#text": "Germany",
"@key": "DE"
}
},
"orgName": {
"#text": "University of Erlangen-Nuremberg",
"@type": "institution"
}
}
}
],
"title": {
"#text": "Calibrated Ice Thickness Estimate for All Glaciers in Austria",
"@level": "a",
"@type": "main"
}
},
"idno": {
"#text": "10.3389/feart.2019.00068",
"@type": "DOI"
},
"monogr": {
"idno": {
"#text": "2296-6463",
"@type": "eISSN"
},
"imprint": {
"biblScope": [
{
"#text": "7",
"@unit": "volume"
},
{
"#text": "68",
"@unit": "page"
}
],
"date": {
"#text": "April 2019",
"@type": "published",
"@when": "2019-04"
},
"publisher": "Frontiers Media SA"
},
"title": [
{
"#text": "Frontiers in Earth Science",
"@level": "j",
"@type": "main"
},
{
"#text": "Front. Earth Sci.",
"@level": "j",
"@type": "abbrev"
}
]
},
"note": [
{
"#text": "Specialty section: This article was submitted to Cryospheric Sciences, a section of the journal Frontiers in Earth Science Received: 22 May 2018 Accepted: 19 March 2019",
"@type": "submission"
},
"ORIGINAL RESEARCH Edited by: Reviewed by: *Correspondence: Citation: Helfricht K, Huss M, Fischer A and Otto J-C (2019) Calibrated Ice Thickness Estimate for All Glaciers in Austria. Front. Earth Sci. 7:68."
]
}
},
"titleStmt": {
"title": {
"#text": "Calibrated Ice Thickness Estimate for All Glaciers in Austria",
"@level": "a",
"@type": "main"
}
}
},
"profileDesc": {
"abstract": {
"p": "Knowledge on ice thickness distribution and total ice volume is a prerequisite for computing future glacier change for both glaciological and hydrological applications. Various ice thickness estimation methods have been developed but regional differences in fundamental model parameters are substantial. Parameters calibrated with measured data at specific points in time and space can vary when glacier geometry and dynamics change. This study contributes to a better understanding of accuracies and limitations of modeled ice thicknesses by taking advantage of a comprehensive data set of in-situ ice thickness measurements from 58 glaciers in the Austrian Alps and observed glacier geometries of three Austrian glacier inventories (GI) between 1969 and 2006. The field data are used to calibrate an established ice thickness model to calculate an improved ice thickness data set for the Austrian Alps. A cross-validation between modeled and measured point ice thickness indicates a model uncertainty of 25-31% of the measured point ice thickness. The comparison of the modeled and measured average glacier ice thickness revealed an underestimation of 5% with a mean standard deviation of 15% for the glaciers with calibration data. The apparent mass balance gradient, the primary model parameter accounting for the effects of surface mass balance distribution as well as ice flux, substantially decreases over time and has to be adjusted for each temporal increment to correctly reproduce observed ice thickness. This reflects the general stagnation of glaciers in Austria. Using the calibrated parameter set, 93% of the observed ice thickness change on a glacier-specific scale could be captured for the periods between the GI. We applied optimized apparent mass balance gradients to all glaciers of the latest Austrian glacier inventory and found a volume of 15.9 km 3 for the year 2006. The ten largest glaciers account for 25% of area and 35% of total ice volume. An estimate based on mass balance measurements from nine glaciers indicates an additional volume loss of 3.5 ± 0.4 km 3 (i.e., 22 ± 2.5%) until 2016. Relative changes in area and volume were largest at glaciers smaller than 1 km 2 , and relative volume changes appear to be higher than relative area changes for all considered time periods."
},
"textClass": {
"keywords": {
"term": [
"glacier",
"ice thickness measurements",
"glacier inventory",
"glacier modeling",
"climate change",
"ice cover",
"glacier surface elevation change",
"glacier mass balance"
]
}
}
}
},
"text": {
"@xml:lang": "en"
}
}
60 changes: 60 additions & 0 deletions tests/ui/pdf_extractor/test_pdf_extractor_utils.py
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2019 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Test PDF extractor utils."""

import json
import os

from sonar.modules.pdf_extractor.utils import format_extracted_data


def test_format_extracted_data(app):
"""Test format extracted data."""
# format_extracted_data({})
json_file = os.path.dirname(
os.path.abspath(__file__)) + '/data/extracted_data.json'

with open(json_file, 'rb') as file:
# Test standard extraction
extracted_data = json.load(file)
formatted_data = format_extracted_data(extracted_data)
assert 'title' in formatted_data
assert formatted_data['title'] == 'Calibrated Ice Thickness Estimate' \
' for All Glaciers in Austria'

# Test authors
extracted_data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'analytic']['author'] = extracted_data['teiHeader']['fileDesc'][
'sourceDesc']['biblStruct']['analytic']['author'][0]

formatted_data = format_extracted_data(extracted_data)
assert len(formatted_data['authors']) == 1

# Test languages
extracted_data['text']['@xml:lang'] = 'de'
formatted_data = format_extracted_data(extracted_data)
assert formatted_data['languages'][0] == 'ger'

# Test imprint
extracted_data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'monogr']['imprint']['biblScope'] = extracted_data['teiHeader'][
'fileDesc']['sourceDesc']['biblStruct']['monogr']['imprint'][
'biblScope'][0]
formatted_data = format_extracted_data(extracted_data)
assert formatted_data['journal']['name'] == 'Frontiers Media SA'
assert formatted_data['journal']['volume'] == '7'
