deposit: extract metadata from PDF
* Extracts metadata from the PDF using GROBID and stores it in the file's metadata.
* Changes property types for journal data.
* Installs pycountry.
* Closes #87.

Co-Authored-by: Sébastien Délèze <[email protected]>
Sébastien Délèze committed Dec 20, 2019
1 parent 1b3f1fd commit 5fd915d
Showing 8 changed files with 399 additions and 13 deletions.
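
Taken together, the changes below wire the GROBID-based PDF extractor into the deposit workflow. A minimal sketch of the resulting flow (names taken from the diff; it assumes a GROBID service is reachable behind PDFExtractor):

from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
from sonar.modules.pdf_extractor.utils import format_extracted_data


def extract_pdf_metadata(pdf_bytes):
    """Run GROBID on raw PDF bytes and normalize its TEI output."""
    pdf_extractor = PDFExtractor()
    # process_raw() is expected to return the TEI result parsed into a
    # dict (xmltodict is already a dependency); format_extracted_data()
    # flattens it into title / languages / authors / abstract / journal.
    return format_extracted_data(pdf_extractor.process_raw(pdf_bytes))
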
1 change: 1 addition & 0 deletions Pipfile
@@ -18,6 +18,7 @@ python-slugify = "*"
python3-saml = "*"
xmltodict = "*"
marshmallow = "<=3.0.0b6"
pycountry = "*"

[dev-packages]
Flask-Debugtoolbar = ">=0.10.1"
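
pycountry is added to translate the two-letter language codes reported by GROBID into the three-letter bibliographic codes the repository stores. A small illustration of the mapping used in sonar/modules/pdf_extractor/utils.py below:

import pycountry

# GROBID reports ISO 639-1 codes such as 'de'; the repository stores
# ISO 639-2/B (bibliographic) codes such as 'ger'.
language = pycountry.languages.get(alpha_2='de')
# Languages whose bibliographic (639-2/B) and terminology (639-2/T) codes
# differ carry a 'bibliographic' attribute; otherwise alpha_3 is used.
code = language.bibliographic if hasattr(language, 'bibliographic') \
    else language.alpha_3
assert code == 'ger'
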
9 changes: 8 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default.

36 changes: 31 additions & 5 deletions sonar/modules/deposits/api.py
@@ -51,11 +51,7 @@ class DepositRecord(SonarRecord):
schema = 'deposit'

@classmethod
-def create(cls,
-           data,
-           id_=None,
-           dbcommit=False,
-           with_bucket=True,
+def create(cls, data, id_=None, dbcommit=False, with_bucket=True,
**kwargs):
"""Create deposit record."""
record = super(DepositRecord, cls).create(data,
@@ -64,3 +60,33 @@ def create(cls,
with_bucket=with_bucket,
**kwargs)
return record

def populate_with_pdf_metadata(self, pdf_metadata, default_title=None):
"""Update data for record."""
self['metadata'] = {}

if 'title' in pdf_metadata:
self['metadata']['title'] = pdf_metadata['title']
else:
self['metadata']['title'] = default_title

if 'languages' in pdf_metadata:
self['metadata']['languages'] = pdf_metadata['languages']

if 'authors' in pdf_metadata:
if 'contributors' not in self:
self['contributors'] = []

for author in pdf_metadata['authors']:
self['contributors'].append({'name': author['name']})

if 'abstract' in pdf_metadata:
if 'abstracts' not in self['metadata']:
self['metadata']['abstracts'] = []

self['metadata']['abstracts'].append(pdf_metadata['abstract'])

if 'journal' in pdf_metadata:
self['metadata']['journal'] = pdf_metadata['journal']

return self
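
A hypothetical usage of the new method, mirroring the call that is still commented out in rest.py below (deposit stands for an existing DepositRecord; the metadata values are illustrative):

pdf_metadata = {
    'title': 'Calibrated Ice Thickness Estimate for All Glaciers in Austria',
    'languages': ['eng'],
    'authors': [{'name': 'Kay Helfricht'}],
}
deposit.populate_with_pdf_metadata(pdf_metadata, default_title='Deposit #1')
# The title comes from the PDF when present, the default otherwise;
# authors are appended to 'contributors' outside of 'metadata'.
assert deposit['metadata']['title'] == pdf_metadata['title']
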
(JSON schema for deposits; file header not rendered)
@@ -46,7 +46,7 @@
},
"metadata": {
"type": "object",
"required": ["document_type", "languages", "title"],
"required": ["languages", "title"],
"propertiesOrder": [ "document_type", "languages", "title", "publication_date", "journal", "abstracts", "etc" ],
"properties": {
"document_type": {
@@ -125,12 +125,12 @@
"volume": {
"title": "Volume",
"description": "Volume of the journal.",
"type": "integer"
"type": "string"
},
"number": {
"title": "Number",
"description": "Number of the journal.",
"type": "integer"
"type": "string"
},
"pages": {
"title": "Pages",
@@ -154,6 +154,7 @@
},
"etc": {
"title": "ETC.",
"default": "",
"type": "string"
}
}
@@ -276,4 +277,4 @@
}
}
}
-}
\ No newline at end of file
+}
19 changes: 16 additions & 3 deletions sonar/modules/deposits/rest.py
@@ -27,6 +27,8 @@
from invenio_rest import ContentNegotiatedMethodView

from sonar.modules.deposits.api import DepositRecord
from sonar.modules.pdf_extractor.pdf_extractor import PDFExtractor
from sonar.modules.pdf_extractor.utils import format_extracted_data


class FilesResource(ContentNegotiatedMethodView):
@@ -63,16 +65,28 @@ def post(pid=None):
# deposit.files[text_key]['file_type'] = 'full-text'
# deposit.commit()

+file_content = BytesIO(request.get_data())
+
# Store document
-deposit.files[key] = BytesIO(request.get_data())
+deposit.files[key] = file_content
deposit.files[key]['label'] = re.search(r'(.*)\..*$', key).group(1)
deposit.files[key]['embargo'] = False
deposit.files[key]['embargoDate'] = None
deposit.files[key]['expect'] = False
deposit.files[key]['category'] = request.args['type']
deposit.files[key]['file_type'] = 'file'
deposit.commit()

# Extract data from pdf and populate deposit
if request.args['type'] == 'main':
pdf_extractor = PDFExtractor()
pdf_metadata = format_extracted_data(
pdf_extractor.process_raw(request.get_data()))

# deposit.populate_with_pdf_metadata(
# pdf_metadata, "Deposit #{pid}".format(pid=pid))
deposit.files[key]['pdf_metadata'] = pdf_metadata

deposit.commit()
db.session.commit()

return make_response(jsonify(deposit.files[key].dumps()))
@@ -106,7 +120,6 @@ def put(pid=None, key=None):
files_view = FilesResource.as_view('files')
file_view = FileResource.as_view('file')


blueprint = Blueprint('deposits', __name__, url_prefix='/deposits/<pid>/')
blueprint.add_url_rule('/custom-files/<key>', view_func=file_view)
blueprint.add_url_rule('/custom-files', view_func=files_view)
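
For reference, a client-side sketch of uploading a main file through this blueprint. The host, the requests dependency, and passing the file key as a query parameter are assumptions (the hunk above does not show where key comes from); the route and the type argument come from the code above, and type=main is what triggers the GROBID extraction:

import requests

with open('article.pdf', 'rb') as pdf_file:
    response = requests.post(
        'http://localhost:5000/deposits/1/custom-files',
        # 'main' is the category that triggers metadata extraction
        params={'key': 'article.pdf', 'type': 'main'},
        data=pdf_file.read())

# The response dumps the stored file, including the extracted metadata.
print(response.json().get('pdf_metadata', {}).get('title'))
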
73 changes: 73 additions & 0 deletions sonar/modules/pdf_extractor/utils.py
@@ -21,6 +21,8 @@
import subprocess
import tempfile

import pycountry


def extract_text_from_content(content):
"""Extract full-text from content which will be stored in a temporary file.
@@ -44,3 +46,74 @@ def extract_text_from_file(file):
text = re.sub('[\r\n\f]+', ' ', text)

return text


def format_extracted_data(data):
"""Format the extracted metadata from PDF."""
formatted_data = {}
if '#text' in data['teiHeader']['fileDesc']['titleStmt']['title']:
formatted_data['title'] = data['teiHeader']['fileDesc']['titleStmt'][
'title']['#text']

if data['text']['@xml:lang']:
language = pycountry.languages.get(alpha_2=data['text']['@xml:lang'])
if language:
if hasattr(language, 'bibliographic'):
formatted_data['languages'] = [language.bibliographic]
else:
formatted_data['languages'] = [language.alpha_3]

if 'analytic' in data['teiHeader']['fileDesc']['sourceDesc'][
'biblStruct'] and data['teiHeader']['fileDesc']['sourceDesc'][
'biblStruct']['analytic'] and 'author' in data['teiHeader'][
'fileDesc']['sourceDesc']['biblStruct']['analytic']:
authors = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'analytic']['author']
if not isinstance(authors, list):
authors = [authors]

formatted_data['authors'] = []
for author in authors:
if 'persName' in author:
new_author = {}

if 'surname' in author['persName']:
new_author['name'] = author['persName']['surname']

if not isinstance(author['persName']['forename'], list):
author['persName']['forename'] = [
author['persName']['forename']
]

for forename in author['persName']['forename']:
new_author[
'name'] = forename['#text'] + ' ' + new_author['name']

formatted_data['authors'].append(new_author)

if data['teiHeader']['fileDesc']['sourceDesc']['biblStruct']['monogr'][
'imprint']:
imprint = data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'monogr']['imprint']
if 'publisher' in imprint:
formatted_data['journal'] = {'name': imprint['publisher']}

if not isinstance(imprint['biblScope'], list):
imprint['biblScope'] = [imprint['biblScope']]

for item in imprint['biblScope']:
if item['@unit'] in ['page', 'volume', 'number']:
key = item['@unit']
if key == 'page':
key = 'pages'

formatted_data['journal'][
key] = item['#text'] if '#text' in item else item[
'@from'] + '-' + item['@to']

if 'abstract' in data['teiHeader']['profileDesc'] and data['teiHeader'][
'profileDesc']['abstract']:
formatted_data['abstract'] = data['teiHeader']['profileDesc'][
'abstract']['p']

return formatted_data
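
To make the mapping concrete, here is a minimal TEI-like dict (its shape follows tests/ui/pdf_extractor/data/extracted_data.json below, with illustrative values) and the result format_extracted_data produces for it:

tei = {
    'teiHeader': {
        'fileDesc': {
            'titleStmt': {'title': {'#text': 'A title'}},
            'sourceDesc': {'biblStruct': {
                'analytic': {'author': {'persName': {
                    'forename': {'#text': 'Ada'}, 'surname': 'Lovelace'}}},
                'monogr': {'imprint': {
                    'publisher': 'Frontiers Media SA',
                    'biblScope': {'@unit': 'volume', '#text': '7'}}}}},
        },
        'profileDesc': {'abstract': {'p': 'An abstract.'}},
    },
    'text': {'@xml:lang': 'en'},
}

assert format_extracted_data(tei) == {
    'title': 'A title',
    'languages': ['eng'],  # 'en' has no separate 639-2/B code, so alpha_3
    'authors': [{'name': 'Ada Lovelace'}],
    'journal': {'name': 'Frontiers Media SA', 'volume': '7'},
    'abstract': 'An abstract.',
}
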
205 changes: 205 additions & 0 deletions tests/ui/pdf_extractor/data/extracted_data.json
@@ -0,0 +1,205 @@
{
"@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
"@xsi:schemaLocation": "http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd",
"teiHeader": {
"@xml:lang": "en",
"encodingDesc": {
"appInfo": {
"application": {
"@ident": "GROBID",
"@version": "0.5.5",
"@when": "2019-12-11T07:17+0000",
"ref": {
"#text": "GROBID - A machine learning software for extracting information from scholarly documents",
"@target": "https://github.com/kermitt2/grobid"
}
}
}
},
"fileDesc": {
"publicationStmt": {
"availability": {
"@status": "unknown",
"p": "Copyright Frontiers Media SA"
},
"date": {
"#text": "April 2019",
"@type": "published",
"@when": "2019-04"
},
"publisher": "Frontiers Media SA"
},
"sourceDesc": {
"biblStruct": {
"analytic": {
"author": [
{
"persName": {
"forename": {
"#text": "Kay",
"@type": "first"
},
"surname": "Helfricht"
}
},
{
"persName": {
"forename": {
"#text": "Matthias",
"@type": "first"
},
"surname": "Huss"
}
},
{
"persName": {
"forename": {
"#text": "Andrea",
"@type": "first"
},
"surname": "Fischer"
}
},
{
"persName": {
"forename": {
"#text": "Jan-Christoph",
"@type": "first"
},
"surname": "Otto"
}
},
{
"affiliation": {
"@key": "aff0",
"address": {
"country": {
"#text": "Germany",
"@key": "DE"
},
"settlement": "Nuremberg"
},
"orgName": {
"#text": "University of Erlangen",
"@type": "institution"
}
}
},
{
"affiliation": {
"@key": "aff1",
"address": {
"country": {
"#text": "United States",
"@key": "US"
}
},
"orgName": {
"#text": "University of Alaska System",
"@type": "institution"
}
}
},
{
"affiliation": {
"@key": "aff2",
"address": {
"country": {
"#text": "Germany",
"@key": "DE"
}
},
"orgName": {
"#text": "University of Erlangen-Nuremberg",
"@type": "institution"
}
}
}
],
"title": {
"#text": "Calibrated Ice Thickness Estimate for All Glaciers in Austria",
"@level": "a",
"@type": "main"
}
},
"idno": {
"#text": "10.3389/feart.2019.00068",
"@type": "DOI"
},
"monogr": {
"idno": {
"#text": "2296-6463",
"@type": "eISSN"
},
"imprint": {
"biblScope": [
{
"#text": "7",
"@unit": "volume"
},
{
"#text": "68",
"@unit": "page"
}
],
"date": {
"#text": "April 2019",
"@type": "published",
"@when": "2019-04"
},
"publisher": "Frontiers Media SA"
},
"title": [
{
"#text": "Frontiers in Earth Science",
"@level": "j",
"@type": "main"
},
{
"#text": "Front. Earth Sci.",
"@level": "j",
"@type": "abbrev"
}
]
},
"note": [
{
"#text": "Specialty section: This article was submitted to Cryospheric Sciences, a section of the journal Frontiers in Earth Science Received: 22 May 2018 Accepted: 19 March 2019",
"@type": "submission"
},
"ORIGINAL RESEARCH Edited by: Reviewed by: *Correspondence: Citation: Helfricht K, Huss M, Fischer A and Otto J-C (2019) Calibrated Ice Thickness Estimate for All Glaciers in Austria. Front. Earth Sci. 7:68."
]
}
},
"titleStmt": {
"title": {
"#text": "Calibrated Ice Thickness Estimate for All Glaciers in Austria",
"@level": "a",
"@type": "main"
}
}
},
"profileDesc": {
"abstract": {
"p": "Knowledge on ice thickness distribution and total ice volume is a prerequisite for computing future glacier change for both glaciological and hydrological applications. Various ice thickness estimation methods have been developed but regional differences in fundamental model parameters are substantial. Parameters calibrated with measured data at specific points in time and space can vary when glacier geometry and dynamics change. This study contributes to a better understanding of accuracies and limitations of modeled ice thicknesses by taking advantage of a comprehensive data set of in-situ ice thickness measurements from 58 glaciers in the Austrian Alps and observed glacier geometries of three Austrian glacier inventories (GI) between 1969 and 2006. The field data are used to calibrate an established ice thickness model to calculate an improved ice thickness data set for the Austrian Alps. A cross-validation between modeled and measured point ice thickness indicates a model uncertainty of 25-31% of the measured point ice thickness. The comparison of the modeled and measured average glacier ice thickness revealed an underestimation of 5% with a mean standard deviation of 15% for the glaciers with calibration data. The apparent mass balance gradient, the primary model parameter accounting for the effects of surface mass balance distribution as well as ice flux, substantially decreases over time and has to be adjusted for each temporal increment to correctly reproduce observed ice thickness. This reflects the general stagnation of glaciers in Austria. Using the calibrated parameter set, 93% of the observed ice thickness change on a glacier-specific scale could be captured for the periods between the GI. We applied optimized apparent mass balance gradients to all glaciers of the latest Austrian glacier inventory and found a volume of 15.9 km 3 for the year 2006. The ten largest glaciers account for 25% of area and 35% of total ice volume. An estimate based on mass balance measurements from nine glaciers indicates an additional volume loss of 3.5 ± 0.4 km 3 (i.e., 22 ± 2.5%) until 2016. Relative changes in area and volume were largest at glaciers smaller than 1 km 2 , and relative volume changes appear to be higher than relative area changes for all considered time periods."
},
"textClass": {
"keywords": {
"term": [
"glacier",
"ice thickness measurements",
"glacier inventory",
"glacier modeling",
"climate change",
"ice cover",
"glacier surface elevation change",
"glacier mass balance"
]
}
}
}
},
"text": {
"@xml:lang": "en"
}
}
60 changes: 60 additions & 0 deletions tests/ui/pdf_extractor/test_pdf_extractor_utils.py
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2019 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Test PDF extractor utils."""

import json
import os

from sonar.modules.pdf_extractor.utils import format_extracted_data


def test_format_extracted_data(app):
"""Test format extracted data."""
# format_extracted_data({})
json_file = os.path.dirname(
os.path.abspath(__file__)) + '/data/extracted_data.json'

with open(json_file, 'rb') as file:
# Test standard extraction
extracted_data = json.load(file)
formatted_data = format_extracted_data(extracted_data)
assert 'title' in formatted_data
assert formatted_data['title'] == 'Calibrated Ice Thickness Estimate' \
' for All Glaciers in Austria'

# Test authors
extracted_data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'analytic']['author'] = extracted_data['teiHeader']['fileDesc'][
'sourceDesc']['biblStruct']['analytic']['author'][0]

formatted_data = format_extracted_data(extracted_data)
assert len(formatted_data['authors']) == 1

# Test languages
extracted_data['text']['@xml:lang'] = 'de'
formatted_data = format_extracted_data(extracted_data)
assert formatted_data['languages'][0] == 'ger'

# Test imprint
extracted_data['teiHeader']['fileDesc']['sourceDesc']['biblStruct'][
'monogr']['imprint']['biblScope'] = extracted_data['teiHeader'][
'fileDesc']['sourceDesc']['biblStruct']['monogr']['imprint'][
'biblScope'][0]
formatted_data = format_extracted_data(extracted_data)
assert formatted_data['journal']['name'] == 'Frontiers Media SA'
assert formatted_data['journal']['volume'] == '7'
