-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Extracts metadata from PDF files using GROBID and stores it in the file's metadata. * Changes property types for journal data. * Installs pycountry. * Closes #87. Co-Authored-by: Sébastien Délèze <[email protected]>
Sébastien Délèze
committed
Dec 20, 2019
1 parent
1b3f1fd
commit 5fd915d
Showing
8 changed files
with
399 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
{ | ||
"@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", | ||
"@xsi:schemaLocation": "http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd", | ||
"teiHeader": { | ||
"@xml:lang": "en", | ||
"encodingDesc": { | ||
"appInfo": { | ||
"application": { | ||
"@ident": "GROBID", | ||
"@version": "0.5.5", | ||
"@when": "2019-12-11T07:17+0000", | ||
"ref": { | ||
"#text": "GROBID - A machine learning software for extracting information from scholarly documents", | ||
"@target": "https://github.com/kermitt2/grobid" | ||
} | ||
} | ||
} | ||
}, | ||
"fileDesc": { | ||
"publicationStmt": { | ||
"availability": { | ||
"@status": "unknown", | ||
"p": "Copyright Frontiers Media SA" | ||
}, | ||
"date": { | ||
"#text": "April 2019", | ||
"@type": "published", | ||
"@when": "2019-04" | ||
}, | ||
"publisher": "Frontiers Media SA" | ||
}, | ||
"sourceDesc": { | ||
"biblStruct": { | ||
"analytic": { | ||
"author": [ | ||
{ | ||
"persName": { | ||
"forename": { | ||
"#text": "Kay", | ||
"@type": "first" | ||
}, | ||
"surname": "Helfricht" | ||
} | ||
}, | ||
{ | ||
"persName": { | ||
"forename": { | ||
"#text": "Matthias", | ||
"@type": "first" | ||
}, | ||
"surname": "Huss" | ||
} | ||
}, | ||
{ | ||
"persName": { | ||
"forename": { | ||
"#text": "Andrea", | ||
"@type": "first" | ||
}, | ||
"surname": "Fischer" | ||
} | ||
}, | ||
{ | ||
"persName": { | ||
"forename": { | ||
"#text": "Jan-Christoph", | ||
"@type": "first" | ||
}, | ||
"surname": "Otto" | ||
} | ||
}, | ||
{ | ||
"affiliation": { | ||
"@key": "aff0", | ||
"address": { | ||
"country": { | ||
"#text": "Germany", | ||
"@key": "DE" | ||
}, | ||
"settlement": "Nuremberg" | ||
}, | ||
"orgName": { | ||
"#text": "University of Erlangen", | ||
"@type": "institution" | ||
} | ||
} | ||
}, | ||
{ | ||
"affiliation": { | ||
"@key": "aff1", | ||
"address": { | ||
"country": { | ||
"#text": "United States", | ||
"@key": "US" | ||
} | ||
}, | ||
"orgName": { | ||
"#text": "University of Alaska System", | ||
"@type": "institution" | ||
} | ||
} | ||
}, | ||
{ | ||
"affiliation": { | ||
"@key": "aff2", | ||
"address": { | ||
"country": { | ||
"#text": "Germany", | ||
"@key": "DE" | ||
} | ||
}, | ||
"orgName": { | ||
"#text": "University of Erlangen-Nuremberg", | ||
"@type": "institution" | ||
} | ||
} | ||
} | ||
], | ||
"title": { | ||
"#text": "Calibrated Ice Thickness Estimate for All Glaciers in Austria", | ||
"@level": "a", | ||
"@type": "main" | ||
} | ||
}, | ||
"idno": { | ||
"#text": "10.3389/feart.2019.00068", | ||
"@type": "DOI" | ||
}, | ||
"monogr": { | ||
"idno": { | ||
"#text": "2296-6463", | ||
"@type": "eISSN" | ||
}, | ||
"imprint": { | ||
"biblScope": [ | ||
{ | ||
"#text": "7", | ||
"@unit": "volume" | ||
}, | ||
{ | ||
"#text": "68", | ||
"@unit": "page" | ||
} | ||
], | ||
"date": { | ||
"#text": "April 2019", | ||
"@type": "published", | ||
"@when": "2019-04" | ||
}, | ||
"publisher": "Frontiers Media SA" | ||
}, | ||
"title": [ | ||
{ | ||
"#text": "Frontiers in Earth Science", | ||
"@level": "j", | ||
"@type": "main" | ||
}, | ||
{ | ||
"#text": "Front. Earth Sci.", | ||
"@level": "j", | ||
"@type": "abbrev" | ||
} | ||
] | ||
}, | ||
"note": [ | ||
{ | ||
"#text": "Specialty section: This article was submitted to Cryospheric Sciences, a section of the journal Frontiers in Earth Science Received: 22 May 2018 Accepted: 19 March 2019", | ||
"@type": "submission" | ||
}, | ||
"ORIGINAL RESEARCH Edited by: Reviewed by: *Correspondence: Citation: Helfricht K, Huss M, Fischer A and Otto J-C (2019) Calibrated Ice Thickness Estimate for All Glaciers in Austria. Front. Earth Sci. 7:68." | ||
] | ||
} | ||
}, | ||
"titleStmt": { | ||
"title": { | ||
"#text": "Calibrated Ice Thickness Estimate for All Glaciers in Austria", | ||
"@level": "a", | ||
"@type": "main" | ||
} | ||
} | ||
}, | ||
"profileDesc": { | ||
"abstract": { | ||
"p": "Knowledge on ice thickness distribution and total ice volume is a prerequisite for computing future glacier change for both glaciological and hydrological applications. Various ice thickness estimation methods have been developed but regional differences in fundamental model parameters are substantial. Parameters calibrated with measured data at specific points in time and space can vary when glacier geometry and dynamics change. This study contributes to a better understanding of accuracies and limitations of modeled ice thicknesses by taking advantage of a comprehensive data set of in-situ ice thickness measurements from 58 glaciers in the Austrian Alps and observed glacier geometries of three Austrian glacier inventories (GI) between 1969 and 2006. The field data are used to calibrate an established ice thickness model to calculate an improved ice thickness data set for the Austrian Alps. A cross-validation between modeled and measured point ice thickness indicates a model uncertainty of 25-31% of the measured point ice thickness. The comparison of the modeled and measured average glacier ice thickness revealed an underestimation of 5% with a mean standard deviation of 15% for the glaciers with calibration data. The apparent mass balance gradient, the primary model parameter accounting for the effects of surface mass balance distribution as well as ice flux, substantially decreases over time and has to be adjusted for each temporal increment to correctly reproduce observed ice thickness. This reflects the general stagnation of glaciers in Austria. Using the calibrated parameter set, 93% of the observed ice thickness change on a glacier-specific scale could be captured for the periods between the GI. We applied optimized apparent mass balance gradients to all glaciers of the latest Austrian glacier inventory and found a volume of 15.9 km 3 for the year 2006. The ten largest glaciers account for 25% of area and 35% of total ice volume. 
An estimate based on mass balance measurements from nine glaciers indicates an additional volume loss of 3.5 ± 0.4 km 3 (i.e., 22 ± 2.5%) until 2016. Relative changes in area and volume were largest at glaciers smaller than 1 km 2 , and relative volume changes appear to be higher than relative area changes for all considered time periods." | ||
}, | ||
"textClass": { | ||
"keywords": { | ||
"term": [ | ||
"glacier", | ||
"ice thickness measurements", | ||
"glacier inventory", | ||
"glacier modeling", | ||
"climate change", | ||
"ice cover", | ||
"glacier surface elevation change", | ||
"glacier mass balance" | ||
] | ||
} | ||
} | ||
} | ||
}, | ||
"text": { | ||
"@xml:lang": "en" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Swiss Open Access Repository | ||
# Copyright (C) 2019 RERO | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU Affero General Public License as published by | ||
# the Free Software Foundation, version 3 of the License. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU Affero General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
"""Test PDF extractor utils.""" | ||
|
||
import json | ||
import os | ||
|
||
from sonar.modules.pdf_extractor.utils import format_extracted_data | ||
|
||
|
||
def test_format_extracted_data(app):
    """Test format extracted data.

    Loads the ``data/extracted_data.json`` fixture located next to this
    module, then checks the result of ``format_extracted_data`` for the
    title, the authors, the languages and the journal imprint.

    :param app: Fixture requested by the test; it is not referenced in the
        test body (presumably it sets up the application context).
    """
    json_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data',
        'extracted_data.json')

    # Only the parsing needs the file handle, so release it before the
    # assertions run.
    with open(json_file, 'rb') as file:
        extracted_data = json.load(file)

    # Test standard extraction
    formatted_data = format_extracted_data(extracted_data)
    assert 'title' in formatted_data
    assert formatted_data['title'] == 'Calibrated Ice Thickness Estimate' \
        ' for All Glaciers in Austria'

    # Test authors: replace the list of authors by its first entry, so that
    # the formatter receives a single object instead of a list.
    analytic = extracted_data['teiHeader']['fileDesc']['sourceDesc'][
        'biblStruct']['analytic']
    analytic['author'] = analytic['author'][0]

    formatted_data = format_extracted_data(extracted_data)
    assert len(formatted_data['authors']) == 1

    # Test languages
    extracted_data['text']['@xml:lang'] = 'de'
    formatted_data = format_extracted_data(extracted_data)
    assert formatted_data['languages'][0] == 'ger'

    # Test imprint: same idea as for the authors, ``biblScope`` becomes a
    # single object (the first entry) instead of a list.
    imprint = extracted_data['teiHeader']['fileDesc']['sourceDesc'][
        'biblStruct']['monogr']['imprint']
    imprint['biblScope'] = imprint['biblScope'][0]

    formatted_data = format_extracted_data(extracted_data)
    assert formatted_data['journal']['name'] == 'Frontiers Media SA'
    assert formatted_data['journal']['volume'] == '7'