diff --git a/.github/workflows/ci-build-manual-crf.yml b/.github/workflows/ci-build-manual-crf.yml index 815d4e8cc2..6606f0bfd1 100644 --- a/.github/workflows/ci-build-manual-crf.yml +++ b/.github/workflows/ci-build-manual-crf.yml @@ -42,6 +42,6 @@ jobs: registry: docker.io pushImage: true tags: | - latest-develop${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }}, latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} + latest-develop, latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index 8f0b2eed66..808931f3d7 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -13,11 +13,11 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up JDK 17 + - name: Set up JDK 11 uses: actions/setup-java@v4 with: - java-version: '17.0.10+7' - distribution: 'temurin' + java-version: '11' + distribution: 'adopt' cache: 'gradle' - name: Build with Gradle run: ./gradlew clean assemble --info --stacktrace --no-daemon diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bd8938507..49522646a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+## [0.8.1] - 2024-06-10 + +### Added + - Identified URLs are now added in the TEI output #1099 + - Added DL models for patent processing #1082 + - Copyright and licence identification models #1078 + - Add research infrastructure recognition for funding processing #1085 + +### Changed + - Improved the recognition of URLs using (when available) PDF annotations, such as clickable links + - Updated TEI schema #1084 + - Review patent process #1082 + - Add Kotlin language to support development and testing #1096 + +### Fixed + - Sentence segmentation no longer splits sentences with a URL in the middle #1097 + - Sentence segmentation is now applied to funding and acknowledgement #1106 + - Docker image was optimized to reduce the needed space #1088 + - Fixed OOBE when processing large quantities of notes #1075 + - Corrected `` coordinate attribute name #1070 + - Fix missing coordinates in paragraph continuation #1076 + - Fixed JSON log output + ## [0.8.0] - 2023-11-19 ### Added diff --git a/Readme.md b/Readme.md index 0547371813..66b4dd6791 100644 --- a/Readme.md +++ b/Readme.md @@ -105,11 +105,10 @@ Detailed end-to-end [benchmarking](https://grobid.readthedocs.io/en/latest/Bench A series of additional modules have been developed for performing __structure aware__ text mining directly on scholar PDF, reusing GROBID's PDF processing and sequence labelling weaponry: - [software-mention](https://github.com/ourresearch/software-mentions): recognition of software mentions and associated attributes in scientific literature -- [datastet](https://github.com/kermitt2/datastet): identification of named and implicit research datasets and associated attributes in scientific articles +- [datastet](https://github.com/kermitt2/datastet): identification of sections and sentences introducing datasets in a scientific article, identification of dataset names and attributes (implicit and named datasets) and classification of the type of datasets - 
[grobid-quantities](https://github.com/kermitt2/grobid-quantities): recognition and normalization of physical quantities/measurements - [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors): recognition of superconductor material and properties in scientific literature - [entity-fishing](https://github.com/kermitt2/entity-fishing), a tool for extracting Wikidata entities from text and document, which can also use Grobid to pre-process scientific articles in PDF, leading to more precise and relevant entity extraction and the capacity to annotate the PDF with interactive layout -- [datastet](https://github.com/kermitt2/datastet): identification of sections and sentences introducing datasets in a scientific article, identification of dataset names (implict and named datasets) and classification of the type of these datasets - [grobid-ner](https://github.com/kermitt2/grobid-ner): named entity recognition - [grobid-astro](https://github.com/kermitt2/grobid-astro): recognition of astronomical entities in scientific papers - [grobid-bio](https://github.com/kermitt2/grobid-bio): a toy bio-entity tagger using BioNLP/NLPBA 2004 dataset diff --git a/build.gradle b/build.gradle index 89546fb8ac..e90717a7a6 100644 --- a/build.gradle +++ b/build.gradle @@ -60,19 +60,29 @@ subprojects { } } -// sourceCompatibility = 1.11 -// targetCompatibility = 1.11 - - kotlin { - jvmToolchain(17) - } - - java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(17)) + sourceCompatibility = 1.11 + targetCompatibility = 1.11 + + tasks.withType(KotlinCompile).configureEach { + sourceCompatibility = JavaVersion.VERSION_11 + targetCompatibility = JavaVersion.VERSION_11 + kotlinOptions { + jvmTarget = JavaVersion.VERSION_11 } } +// kotlin { +// jvmToolchain(11) +// } + +// java { +// toolchain { +// languageVersion.set(JavaLanguageVersion.of(11)) +// vendor.set(JvmVendorSpec.ADOPTIUM) +// +// } +// } + repositories { mavenCentral() maven { @@ -316,6 +326,7 @@ 
project("grobid-home") { } import org.apache.tools.ant.taskdefs.condition.Os +import org.jetbrains.kotlin.gradle.tasks.KotlinCompile project(":grobid-service") { apply plugin: 'application' diff --git a/doc/Benchmarking-biorxiv.md b/doc/Benchmarking-biorxiv.md index 0084192ffa..2644ed8773 100644 --- a/doc/Benchmarking-biorxiv.md +++ b/doc/Benchmarking-biorxiv.md @@ -2,7 +2,7 @@ ## General -This is the end-to-end benchmarking result for GROBID version **0.7.3** against the `bioRxiv` test set (`biorxiv-10k-test-2000`), see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. +This is the end-to-end benchmarking result for GROBID version **0.8.1** against the `bioRxiv` test set (`biorxiv-10k-test-2000`), see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. The following end-to-end results are using: @@ -22,13 +22,14 @@ Other versions of these benchmarks with variants and **Deep Learning models** (e Evaluation on 2000 PDF preprints out of 2000 (no failure). -Runtime for processing 2000 PDF: **3133s** (1.56 second per PDF) on Ubuntu 16.04, 4 CPU i7-4790K (8 threads), 16GB RAM (workstation bought in 2015 for 1600 euros) and with a GeForce GTX 1050 Ti GPU. +Runtime for processing 2000 PDF: **1713** seconds (0.85 seconds per PDF file) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU. + +Note: with CRF only models runtime is 622s (0.31 seconds per PDF) with 4 CPU, 8 threads. -Note: with CRF only models runtime is 622s (0.31 second per PDF). ## Header metadata -Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -37,13 +38,15 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 2.2 | 2.16 | 2.18 | 1990 | -| authors | 73.66 | 73.04 | 73.35 | 1999 | -| first_author | 94.3 | 93.59 | 93.94 | 1997 | +| authors | 83.2 | 82.49 | 82.84 | 1999 | +| first_author | 97.02 | 96.29 | 96.66 | 1997 | | keywords | 58.71 | 59.83 | 59.27 | 839 | -| title | 82.36 | 81.5 | 81.93 | 2000 | +| title | 77.67 | 76.85 | 77.26 | 2000 | | | | | | | -| **all fields (micro avg.)** | **62.91** | **62.37** | **62.64** | 8825 | -| all fields (macro avg.) | 62.25 | 62.02 | 62.13 | 8825 | +| **all fields (micro avg.)** | **64.62** | **64.07** | **64.35** | 8825 | +| all fields (macro avg.) | 63.76 | 63.53 | 63.64 | 8825 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -52,13 +55,15 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 59.71 | 58.54 | 59.12 | 1990 | -| authors | 75.93 | 75.29 | 75.61 | 1999 | -| first_author | 94.9 | 94.19 | 94.55 | 1997 | +| authors | 83.7 | 82.99 | 83.35 | 1999 | +| first_author | 97.23 | 96.49 | 96.86 | 1997 | | keywords | 63.86 | 65.08 | 64.46 | 839 | -| title | 90.8 | 89.85 | 90.32 | 2000 | +| title | 79.89 | 79.05 | 79.47 | 2000 | | | | | | | -| **all fields (micro avg.)** | **78.8** | **78.12** | **78.46** | 8825 | -| all fields (macro avg.) | 77.04 | 76.59 | 76.81 | 8825 | +| **all fields (micro avg.)** | **78.61** | **77.94** | **78.27** | 8825 | +| all fields (macro avg.) | 76.88 | 76.43 | 76.65 | 8825 | + + #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -67,13 +72,15 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 80.22 | 78.64 | 79.42 | 1990 | -| authors | 89.61 | 88.84 | 89.22 | 1999 | -| first_author | 95.21 | 94.49 | 94.85 | 1997 | +| authors | 92.18 | 91.4 | 91.79 | 1999 | +| first_author | 97.48 | 96.75 | 97.11 | 1997 | | keywords | 79.42 | 80.93 | 80.17 | 839 | -| title | 94.59 | 93.6 | 94.09 | 2000 | +| title | 92.02 | 91.05 | 91.53 | 2000 | | | | | | | -| **all fields (micro avg.)** | **88.91** | **88.15** | **88.53** | 8825 | -| all fields (macro avg.) | 87.81 | 87.3 | 87.55 | 8825 | +| **all fields (micro avg.)** | **89.43** | **88.66** | **89.04** | 8825 | +| all fields (macro avg.) | 88.26 | 87.75 | 88 | 8825 | + + #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -82,34 +89,34 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 76.88 | 75.38 | 76.12 | 1990 | -| authors | 82.04 | 81.34 | 81.69 | 1999 | -| first_author | 94.3 | 93.59 | 93.94 | 1997 | +| authors | 87.79 | 87.04 | 87.42 | 1999 | +| first_author | 97.02 | 96.29 | 96.66 | 1997 | | keywords | 71.35 | 72.71 | 72.02 | 839 | -| title | 93.58 | 92.6 | 93.09 | 2000 | +| title | 87.87 | 86.95 | 87.41 | 2000 | | | | | | | -| **all fields (micro avg.)** | **85.23** | **84.5** | **84.86** | 8825 | -| all fields (macro avg.) | 83.63 | 83.12 | 83.37 | 8825 | +| **all fields (micro avg.)** | **85.86** | **85.12** | **85.49** | 8825 | +| all fields (macro avg.) 
| 84.18 | 83.67 | 83.92 | 8825 | #### Instance-level results ``` -Total expected instances: 2000 -Total correct instances: 29 (strict) -Total correct instances: 718 (soft) -Total correct instances: 1207 (Levenshtein) -Total correct instances: 1024 (ObservedRatcliffObershelp) - -Instance-level recall: 1.45 (strict) -Instance-level recall: 35.9 (soft) -Instance-level recall: 60.35 (Levenshtein) -Instance-level recall: 51.2 (RatcliffObershelp) +Total expected instances: 2000 +Total correct instances: 35 (strict) +Total correct instances: 708 (soft) +Total correct instances: 1222 (Levenshtein) +Total correct instances: 1046 (ObservedRatcliffObershelp) + +Instance-level recall: 1.75 (strict) +Instance-level recall: 35.4 (soft) +Instance-level recall: 61.1 (Levenshtein) +Instance-level recall: 52.3 (RatcliffObershelp) ``` ## Citation metadata -Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -117,41 +124,45 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 88.14 | 83.22 | 85.61 | 97183 | -| date | 91.69 | 86.3 | 88.91 | 97630 | -| doi | 70.83 | 83.79 | 76.77 | 16894 | -| first_author | 95.05 | 89.67 | 92.28 | 97183 | -| inTitle | 82.83 | 79.41 | 81.08 | 96430 | -| issue | 94.33 | 92.04 | 93.17 | 30312 | -| page | 94.97 | 78.34 | 85.85 | 88597 | +| authors | 88.16 | 83.24 | 85.63 | 97183 | +| date | 91.69 | 86.31 | 88.92 | 97630 | +| doi | 70.84 | 83.79 | 76.78 | 16894 | +| first_author | 95.06 | 89.68 | 92.29 | 97183 | +| inTitle | 82.83 | 79.4 | 81.08 | 96430 | +| issue | 94.34 | 92.04 | 93.18 | 30312 | +| page | 94.97 | 78.34 | 85.86 | 88597 | | pmcid | 66.38 | 86.12 | 74.97 | 807 | -| pmid | 70.06 | 84.95 | 76.79 | 2093 | -| title | 84.96 | 83.66 | 84.3 | 92463 | -| volume | 96.23 | 95.23 | 95.72 | 87709 | +| pmid | 70.08 | 84.95 | 76.8 | 2093 | +| title | 84.88 | 83.58 | 84.23 | 92463 | +| volume | 96.23 | 95.23 | 95.73 | 87709 | | | | | | | -| **all fields (micro avg.)** | **89.85** | **85.35** | **87.54** | 707301 | +| **all fields (micro avg.)** | **89.85** | **85.34** | **87.54** | 707301 | | all fields (macro avg.) 
| 85.04 | 85.7 | 85.04 | 707301 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) **Field-level results** | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 89.3 | 84.32 | 86.74 | 97183 | -| date | 91.69 | 86.3 | 88.91 | 97630 | -| doi | 75.32 | 89.09 | 81.63 | 16894 | -| first_author | 95.48 | 90.07 | 92.7 | 97183 | +| authors | 89.31 | 84.33 | 86.75 | 97183 | +| date | 91.69 | 86.31 | 88.92 | 97630 | +| doi | 75.34 | 89.11 | 81.65 | 16894 | +| first_author | 95.48 | 90.08 | 92.7 | 97183 | | inTitle | 92.32 | 88.51 | 90.38 | 96430 | -| issue | 94.33 | 92.04 | 93.17 | 30312 | -| page | 94.97 | 78.34 | 85.85 | 88597 | +| issue | 94.34 | 92.04 | 93.18 | 30312 | +| page | 94.97 | 78.34 | 85.86 | 88597 | | pmcid | 75.64 | 98.14 | 85.44 | 807 | -| pmid | 74.47 | 90.3 | 81.62 | 2093 | -| title | 93.21 | 91.79 | 92.5 | 92463 | -| volume | 96.23 | 95.23 | 95.72 | 87709 | +| pmid | 74.5 | 90.3 | 81.64 | 2093 | +| title | 93.23 | 91.8 | 92.51 | 92463 | +| volume | 96.23 | 95.23 | 95.73 | 87709 | | | | | | | -| **all fields (micro avg.)** | **92.66** | **88.01** | **90.27** | 707301 | -| all fields (macro avg.) | 88.45 | 89.47 | 88.61 | 707301 | +| **all fields (micro avg.)** | **92.66** | **88.02** | **90.28** | 707301 | +| all fields (macro avg.) | 88.46 | 89.47 | 88.61 | 707301 | + + #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -159,20 +170,22 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 94.57 | 89.29 | 91.85 | 97183 | -| date | 91.69 | 86.3 | 88.91 | 97630 | -| doi | 77.59 | 91.78 | 84.09 | 16894 | -| first_author | 95.63 | 90.21 | 92.84 | 97183 | -| inTitle | 93.3 | 89.44 | 91.33 | 96430 | -| issue | 94.33 | 92.04 | 93.17 | 30312 | -| page | 94.97 | 78.34 | 85.85 | 88597 | +| authors | 94.58 | 89.3 | 91.87 | 97183 | +| date | 91.69 | 86.31 | 88.92 | 97630 | +| doi | 77.6 | 91.79 | 84.1 | 16894 | +| first_author | 95.63 | 90.22 | 92.85 | 97183 | +| inTitle | 93.3 | 89.45 | 91.33 | 96430 | +| issue | 94.34 | 92.04 | 93.18 | 30312 | +| page | 94.97 | 78.34 | 85.86 | 88597 | | pmcid | 75.64 | 98.14 | 85.44 | 807 | -| pmid | 74.47 | 90.3 | 81.62 | 2093 | -| title | 96.04 | 94.57 | 95.3 | 92463 | -| volume | 96.23 | 95.23 | 95.72 | 87709 | +| pmid | 74.5 | 90.3 | 81.64 | 2093 | +| title | 96.05 | 94.58 | 95.31 | 92463 | +| volume | 96.23 | 95.23 | 95.73 | 87709 | | | | | | | -| **all fields (micro avg.)** | **93.98** | **89.27** | **91.57** | 707301 | -| all fields (macro avg.) | 89.5 | 90.51 | 89.65 | 707301 | +| **all fields (micro avg.)** | **93.99** | **89.28** | **91.57** | 707301 | +| all fields (macro avg.) | 89.51 | 90.52 | 89.66 | 707301 | + + #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -180,80 +193,83 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 91.52 | 86.41 | 88.89 | 97183 | -| date | 91.69 | 86.3 | 88.91 | 97630 | -| doi | 76.02 | 89.93 | 82.39 | 16894 | -| first_author | 95.1 | 89.71 | 92.33 | 97183 | +| authors | 91.54 | 86.43 | 88.91 | 97183 | +| date | 91.69 | 86.31 | 88.92 | 97630 | +| doi | 76.04 | 89.94 | 82.41 | 16894 | +| first_author | 95.1 | 89.72 | 92.33 | 97183 | | inTitle | 91.06 | 87.29 | 89.13 | 96430 | -| issue | 94.33 | 92.04 | 93.17 | 30312 | -| page | 94.97 | 78.34 | 85.85 | 88597 | +| issue | 94.34 | 92.04 | 93.18 | 30312 | +| page | 94.97 | 78.34 | 85.86 | 88597 | | pmcid | 66.38 | 86.12 | 74.97 | 807 | -| pmid | 70.06 | 84.95 | 76.79 | 2093 | -| title | 95.35 | 93.89 | 94.61 | 92463 | -| volume | 96.23 | 95.23 | 95.72 | 87709 | +| pmid | 70.08 | 84.95 | 76.8 | 2093 | +| title | 95.35 | 93.89 | 94.62 | 92463 | +| volume | 96.23 | 95.23 | 95.73 | 87709 | | | | | | | -| **all fields (micro avg.)** | **93.01** | **88.35** | **90.62** | 707301 | -| all fields (macro avg.) | 87.52 | 88.2 | 87.53 | 707301 | +| **all fields (micro avg.)** | **93.02** | **88.36** | **90.63** | 707301 | +| all fields (macro avg.) 
| 87.53 | 88.21 | 87.53 | 707301 | + #### Instance-level results ``` -Total expected instances: 98799 -Total extracted instances: 98068 -Total correct instances: 43806 (strict) -Total correct instances: 54774 (soft) -Total correct instances: 58973 (Levenshtein) -Total correct instances: 55696 (RatcliffObershelp) +Total expected instances: 98799 +Total extracted instances: 98068 +Total correct instances: 43771 (strict) +Total correct instances: 54778 (soft) +Total correct instances: 58972 (Levenshtein) +Total correct instances: 55693 (RatcliffObershelp) -Instance-level precision: 44.67 (strict) -Instance-level precision: 55.85 (soft) -Instance-level precision: 60.13 (Levenshtein) -Instance-level precision: 56.79 (RatcliffObershelp) +Instance-level precision: 44.63 (strict) +Instance-level precision: 55.86 (soft) +Instance-level precision: 60.13 (Levenshtein) +Instance-level precision: 56.79 (RatcliffObershelp) -Instance-level recall: 44.34 (strict) -Instance-level recall: 55.44 (soft) -Instance-level recall: 59.69 (Levenshtein) -Instance-level recall: 56.37 (RatcliffObershelp) +Instance-level recall: 44.3 (strict) +Instance-level recall: 55.44 (soft) +Instance-level recall: 59.69 (Levenshtein) +Instance-level recall: 56.37 (RatcliffObershelp) -Instance-level f-score: 44.5 (strict) -Instance-level f-score: 55.65 (soft) -Instance-level f-score: 59.91 (Levenshtein) -Instance-level f-score: 56.58 (RatcliffObershelp) +Instance-level f-score: 44.47 (strict) +Instance-level f-score: 55.65 (soft) +Instance-level f-score: 59.91 (Levenshtein) +Instance-level f-score: 56.58 (RatcliffObershelp) -Matching 1 : 79286 +Matching 1 : 79296 -Matching 2 : 4449 +Matching 2 : 4442 -Matching 3 : 4366 +Matching 3 : 4371 -Matching 4 : 2086 +Matching 4 : 2084 -Total matches : 90187 +Total matches : 90193 ``` #### Citation context resolution ``` -Total expected references: 98797 - 49.4 references per article -Total predicted references: 98068 - 49.03 references per article +Total expected 
references: 98797 - 49.4 references per article +Total predicted references: 98068 - 49.03 references per article -Total expected citation contexts: 142862 - 71.43 citation contexts per article -Total predicted citation contexts: 135679 - 67.84 citation contexts per article +Total expected citation contexts: 142862 - 71.43 citation contexts per article +Total predicted citation contexts: 135692 - 67.85 citation contexts per article -Total correct predicted citation contexts: 116704 - 58.35 citation contexts per article -Total wrong predicted citation contexts: 18975 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 116736 - 58.37 citation contexts per article +Total wrong predicted citation contexts: 18956 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) -Precision citation contexts: 86.01 -Recall citation contexts: 81.69 -fscore citation contexts: 83.8 +Precision citation contexts: 86.03 +Recall citation contexts: 81.71 +fscore citation contexts: 83.82 ``` + ## Fulltext structures Fulltext structure contents are complicated to capture from JATS NLM files. They are often normalized and different from the actual PDF content and are can be inconsistent from one document to another. The scores of the following metrics are thus not very meaningful in absolute term, in particular for the strict matching (textual content of the srtructure can be very long). As relative values for comparing different models, they seem however useful. -Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -261,16 +277,18 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| availability_stmt | 0 | 0 | 0 | 0 | -| figure_title | 4.24 | 2.01 | 2.72 | 22978 | -| reference_citation | 71.04 | 71.33 | 71.18 | 147470 | -| reference_figure | 70.59 | 67.74 | 69.13 | 47984 | -| reference_table | 48.12 | 83.06 | 60.94 | 5957 | -| section_title | 72.6 | 69.59 | 71.07 | 32399 | -| table_title | 4.34 | 2.88 | 3.46 | 3925 | +| availability_stmt | 29.95 | 25.78 | 27.71 | 446 | +| figure_title | 4.23 | 2.01 | 2.72 | 22978 | +| funding_stmt | 4.16 | 24.43 | 7.11 | 745 | +| reference_citation | 71.05 | 71.33 | 71.19 | 147470 | +| reference_figure | 70.59 | 67.74 | 69.14 | 47984 | +| reference_table | 48.11 | 83.03 | 60.92 | 5957 | +| section_title | 72.59 | 69.6 | 71.06 | 32398 | +| table_title | 4.31 | 2.85 | 3.43 | 3925 | | | | | | | -| **all fields (micro avg.)** | **66.59** | **63.58** | **65.05** | 260713 | -| all fields (macro avg.) | 45.16 | 49.43 | 46.42 | 260713 | +| **all fields (micro avg.)** | **65.46** | **63.41** | **64.42** | 261903 | +| all fields (macro avg.) | 38.12 | 43.35 | 39.16 | 261903 | + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -279,14 +297,28 @@ Evaluation on 2000 random PDF files out of 2000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| availability_stmt | 0 | 0 | 0 | 0 | -| figure_title | 69.47 | 32.89 | 44.65 | 22978 | -| reference_citation | 83.03 | 83.37 | 83.2 | 147470 | -| reference_figure | 71.21 | 68.34 | 69.75 | 47984 | -| reference_table | 48.57 | 83.83 | 61.51 | 5957 | -| section_title | 76.48 | 73.31 | 74.87 | 32399 | -| table_title | 51.4 | 34.09 | 40.99 | 3925 | +| availability_stmt | 50.52 | 43.5 | 46.75 | 446 | +| figure_title | 69.47 | 32.91 | 44.67 | 22978 | +| funding_stmt | 4.37 | 25.64 | 7.46 | 745 | +| reference_citation | 83.04 | 83.37 | 83.21 | 147470 | +| reference_figure | 71.22 | 68.34 | 69.75 | 47984 | +| reference_table | 48.56 | 83.8 | 61.49 | 5957 | +| section_title | 76.47 | 73.32 | 74.86 | 32398 | +| table_title | 51.44 | 34.06 | 40.99 | 3925 | +| | | | | | +| **all fields (micro avg.)** | **76.38** | **73.99** | **75.17** | 261903 | +| all fields (macro avg.) | 56.89 | 55.62 | 53.65 | 261903 | + + +**Document-level ratio results** + +| label | precision | recall | f1 | support | +|--- |--- |--- |--- |--- | +| availability_stmt | 84.77 | 86.1 | 85.43 | 446 | | | | | | | -| **all fields (micro avg.)** | **77.68** | **74.17** | **75.89** | 260713 | -| all fields (macro avg.) | 66.7 | 62.64 | 62.49 | 260713 | +| **all fields (micro avg.)** | **84.77** | **86.1** | **85.43** | 446 | +| all fields (macro avg.) | 84.77 | 86.1 | 85.43 | 446 | + +Evaluation metrics produced in 773.926 seconds + diff --git a/doc/Benchmarking-elife.md b/doc/Benchmarking-elife.md index 964e03ff86..0eb3ebf515 100644 --- a/doc/Benchmarking-elife.md +++ b/doc/Benchmarking-elife.md @@ -2,7 +2,7 @@ ## General -This is the end-to-end benchmarking result for GROBID version **0.7.3** against the `eLife` test set, see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. 
+This is the end-to-end benchmarking result for GROBID version **0.8.1** against the `eLife` test set, see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. The following end-to-end results are using: @@ -22,14 +22,14 @@ Other versions of these benchmarks with variants and **Deep Learning models** (e Evaluation on 984 PDF preprints out of 984 (no failure). -Runtime for processing 984 PDF: **2002s** (2.03 seconds per PDF) on Ubuntu 16.04, 4 CPU i7-4790K (8 threads), 16GB RAM (workstation bought in 2015 for 1600 euros) and with a GeForce GTX 1050 Ti GPU. +Runtime for processing 984 PDF: **1131** seconds (1.15 seconds per PDF file) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU. -Note: with CRF only models runtime is 492s (0.50 seconds per PDF). +Note: with CRF only models runtime is 492s (0.50 seconds per PDF) with 4 CPU, 8 threads. ## Header metadata -Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). +Evaluation on 983 random PDF files out of 982 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -37,13 +37,13 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| abstract | 9.39 | 8.94 | 9.16 | 984 | -| authors | 72.68 | 72.53 | 72.61 | 983 | -| first_author | 91.03 | 90.94 | 90.98 | 982 | -| title | 86.84 | 86.48 | 86.66 | 984 | +| abstract | 9.44 | 9.16 | 9.3 | 983 | +| authors | 74.28 | 73.52 | 73.9 | 982 | +| first_author | 92.39 | 91.54 | 91.96 | 981 | +| title | 86.81 | 85.05 | 85.92 | 983 | | | | | | | -| **all fields (micro avg.)** | **65.61** | **64.71** | **65.16** | 3933 | -| all fields (macro avg.) | 64.98 | 64.72 | 64.85 | 3933 | +| **all fields (micro avg.)** | **65.96** | **64.8** | **65.37** | 3929 | +| all fields (macro avg.) | 65.73 | 64.82 | 65.27 | 3929 | @@ -53,13 +53,13 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| abstract | 24.12 | 22.97 | 23.53 | 984 | -| authors | 72.88 | 72.74 | 72.81 | 983 | -| first_author | 91.03 | 90.94 | 90.98 | 982 | -| title | 95 | 94.61 | 94.81 | 984 | +| abstract | 22.46 | 21.77 | 22.11 | 983 | +| authors | 74.59 | 73.83 | 74.21 | 982 | +| first_author | 92.39 | 91.54 | 91.96 | 981 | +| title | 94.81 | 92.88 | 93.83 | 983 | | | | | | | -| **all fields (micro avg.)** | **71.28** | **70.3** | **70.79** | 3933 | -| all fields (macro avg.) | 70.76 | 70.31 | 70.53 | 3933 | +| **all fields (micro avg.)** | **71.24** | **69.99** | **70.61** | 3929 | +| all fields (macro avg.) | 71.06 | 70 | 70.53 | 3929 | @@ -69,13 +69,13 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| abstract | 48.67 | 46.34 | 47.48 | 984 | -| authors | 86.34 | 86.16 | 86.25 | 983 | -| first_author | 91.34 | 91.24 | 91.29 | 982 | -| title | 96.33 | 95.93 | 96.13 | 984 | +| abstract | 47.53 | 46.08 | 46.8 | 983 | +| authors | 88.17 | 87.27 | 87.72 | 982 | +| first_author | 92.7 | 91.85 | 92.27 | 981 | +| title | 96.26 | 94.3 | 95.27 | 983 | | | | | | | -| **all fields (micro avg.)** | **81.03** | **79.91** | **80.47** | 3933 | -| all fields (macro avg.) | 80.67 | 79.92 | 80.29 | 3933 | +| **all fields (micro avg.)** | **81.3** | **79.87** | **80.58** | 3929 | +| all fields (macro avg.) | 81.16 | 79.88 | 80.51 | 3929 | @@ -85,33 +85,34 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| abstract | 46.53 | 44.31 | 45.39 | 984 | -| authors | 78.29 | 78.13 | 78.21 | 983 | -| first_author | 91.03 | 90.94 | 90.98 | 982 | -| title | 96.33 | 95.93 | 96.13 | 984 | +| abstract | 44.49 | 43.13 | 43.8 | 983 | +| authors | 79.94 | 79.12 | 79.53 | 982 | +| first_author | 92.39 | 91.54 | 91.96 | 981 | +| title | 96.26 | 94.3 | 95.27 | 983 | | | | | | | -| **all fields (micro avg.)** | **78.4** | **77.32** | **77.85** | 3933 | -| all fields (macro avg.) | 78.04 | 77.33 | 77.68 | 3933 | +| **all fields (micro avg.)** | **78.39** | **77.02** | **77.7** | 3929 | +| all fields (macro avg.) | 78.27 | 77.02 | 77.64 | 3929 | #### Instance-level results ``` -Total expected instances: 984 -Total correct instances: 75 (strict) -Total correct instances: 212 (soft) -Total correct instances: 383 (Levenshtein) -Total correct instances: 341 (ObservedRatcliffObershelp) - -Instance-level recall: 7.62 (strict) -Instance-level recall: 21.54 (soft) -Instance-level recall: 38.92 (Levenshtein) -Instance-level recall: 34.65 (RatcliffObershelp) +Total expected instances: 983 +Total correct instances: 73 (strict) +Total correct instances: 198 (soft) +Total correct instances: 377 (Levenshtein) +Total correct instances: 335 (ObservedRatcliffObershelp) + +Instance-level recall: 7.43 (strict) +Instance-level recall: 20.14 (soft) +Instance-level recall: 38.35 (Levenshtein) +Instance-level recall: 34.08 (RatcliffObershelp) ``` + ## Citation metadata -Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). +Evaluation on 983 random PDF files out of 982 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -119,17 +120,17 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 79.39 | 78.17 | 78.78 | 63265 | -| date | 95.86 | 93.99 | 94.92 | 63662 | -| first_author | 94.76 | 93.27 | 94.01 | 63265 | -| inTitle | 95.76 | 94.66 | 95.21 | 63213 | -| issue | 1.99 | 75 | 3.87 | 16 | -| page | 96.26 | 95.2 | 95.72 | 53375 | -| title | 90.42 | 90.84 | 90.63 | 62044 | -| volume | 97.86 | 98.17 | 98.01 | 61049 | +| authors | 79.4 | 78.19 | 78.79 | 63170 | +| date | 95.86 | 93.99 | 94.91 | 63567 | +| first_author | 94.76 | 93.28 | 94.02 | 63170 | +| inTitle | 95.77 | 94.68 | 95.22 | 63118 | +| issue | 1.99 | 75 | 3.88 | 16 | +| page | 96.26 | 95.2 | 95.72 | 53303 | +| title | 90.25 | 90.68 | 90.47 | 61950 | +| volume | 97.85 | 98.17 | 98.01 | 60955 | | | | | | | -| **all fields (micro avg.)** | **92.69** | **91.94** | **92.31** | 429889 | -| all fields (macro avg.) | 81.54 | 89.91 | 81.39 | 429889 | +| **all fields (micro avg.)** | **92.66** | **91.93** | **92.29** | 429249 | +| all fields (macro avg.) | 81.52 | 89.9 | 81.38 | 429249 | @@ -139,17 +140,17 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 79.53 | 78.31 | 78.91 | 63265 | -| date | 95.86 | 93.99 | 94.92 | 63662 | -| first_author | 94.84 | 93.35 | 94.09 | 63265 | -| inTitle | 96.24 | 95.13 | 95.68 | 63213 | -| issue | 1.99 | 75 | 3.87 | 16 | -| page | 96.26 | 95.2 | 95.72 | 53375 | -| title | 95.92 | 96.37 | 96.14 | 62044 | -| volume | 97.86 | 98.17 | 98.01 | 61049 | +| authors | 79.54 | 78.33 | 78.93 | 63170 | +| date | 95.86 | 93.99 | 94.91 | 63567 | +| first_author | 94.84 | 93.36 | 94.1 | 63170 | +| inTitle | 96.25 | 95.15 | 95.7 | 63118 | +| issue | 1.99 | 75 | 3.88 | 16 | +| page | 96.26 | 95.2 | 95.72 | 53303 | +| title | 95.92 | 96.38 | 96.15 | 61950 | +| volume | 97.85 | 98.17 | 98.01 | 60955 | | | | | | | -| **all fields (micro avg.)** | **93.59** | **92.84** | **93.22** | 429889 | -| all fields (macro avg.) | 82.31 | 90.69 | 82.17 | 429889 | +| **all fields (micro avg.)** | **93.59** | **92.85** | **93.22** | 429249 | +| all fields (macro avg.) | 82.31 | 90.7 | 82.17 | 429249 | @@ -159,17 +160,17 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 93.27 | 91.83 | 92.55 | 63265 | -| date | 95.86 | 93.99 | 94.92 | 63662 | -| first_author | 95.29 | 93.79 | 94.54 | 63265 | -| inTitle | 96.57 | 95.46 | 96.01 | 63213 | -| issue | 1.99 | 75 | 3.87 | 16 | -| page | 96.26 | 95.2 | 95.72 | 53375 | -| title | 97.65 | 98.11 | 97.88 | 62044 | -| volume | 97.86 | 98.17 | 98.01 | 61049 | +| authors | 93.29 | 91.87 | 92.58 | 63170 | +| date | 95.86 | 93.99 | 94.91 | 63567 | +| first_author | 95.29 | 93.8 | 94.54 | 63170 | +| inTitle | 96.58 | 95.47 | 96.02 | 63118 | +| issue | 1.99 | 75 | 3.88 | 16 | +| page | 96.26 | 95.2 | 95.72 | 53303 | +| title | 97.66 | 98.12 | 97.89 | 61950 | +| volume | 97.85 | 98.17 | 98.01 | 60955 | | | | | | | -| **all fields (micro avg.)** | **95.97** | **95.2** | **95.58** | 429889 | -| all fields (macro avg.) | 84.34 | 92.69 | 84.19 | 429889 | +| **all fields (micro avg.)** | **95.97** | **95.21** | **95.59** | 429249 | +| all fields (macro avg.) | 84.35 | 92.7 | 84.19 | 429249 | @@ -179,71 +180,71 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 86.69 | 85.36 | 86.02 | 63265 | -| date | 95.86 | 93.99 | 94.92 | 63662 | -| first_author | 94.78 | 93.29 | 94.03 | 63265 | -| inTitle | 96.25 | 95.14 | 95.69 | 63213 | -| issue | 1.99 | 75 | 3.87 | 16 | -| page | 96.26 | 95.2 | 95.72 | 53375 | -| title | 97.5 | 97.96 | 97.73 | 62044 | -| volume | 97.86 | 98.17 | 98.01 | 61049 | +| authors | 86.71 | 85.39 | 86.05 | 63170 | +| date | 95.86 | 93.99 | 94.91 | 63567 | +| first_author | 94.78 | 93.3 | 94.03 | 63170 | +| inTitle | 96.25 | 95.16 | 95.7 | 63118 | +| issue | 1.99 | 75 | 3.88 | 16 | +| page | 96.26 | 95.2 | 95.72 | 53303 | +| title | 97.5 | 97.97 | 97.74 | 61950 | +| volume | 97.85 | 98.17 | 98.01 | 60955 | | | | | | | -| **all fields (micro avg.)** | **94.86** | **94.1** | **94.48** | 429889 | -| all fields (macro avg.) | 83.4 | 91.76 | 83.25 | 429889 | +| **all fields (micro avg.)** | **94.87** | **94.11** | **94.49** | 429249 | +| all fields (macro avg.) 
| 83.4 | 91.77 | 83.26 | 429249 | #### Instance-level results ``` -Total expected instances: 63664 -Total extracted instances: 66480 -Total correct instances: 42383 (strict) -Total correct instances: 45147 (soft) -Total correct instances: 52790 (Levenshtein) -Total correct instances: 49397 (RatcliffObershelp) +Total expected instances: 63569 +Total extracted instances: 66388 +Total correct instances: 42246 (strict) +Total correct instances: 45085 (soft) +Total correct instances: 52715 (Levenshtein) +Total correct instances: 49331 (RatcliffObershelp) -Instance-level precision: 63.75 (strict) -Instance-level precision: 67.91 (soft) -Instance-level precision: 79.41 (Levenshtein) -Instance-level precision: 74.3 (RatcliffObershelp) +Instance-level precision: 63.63 (strict) +Instance-level precision: 67.91 (soft) +Instance-level precision: 79.4 (Levenshtein) +Instance-level precision: 74.31 (RatcliffObershelp) -Instance-level recall: 66.57 (strict) -Instance-level recall: 70.91 (soft) -Instance-level recall: 82.92 (Levenshtein) -Instance-level recall: 77.59 (RatcliffObershelp) +Instance-level recall: 66.46 (strict) +Instance-level recall: 70.92 (soft) +Instance-level recall: 82.93 (Levenshtein) +Instance-level recall: 77.6 (RatcliffObershelp) -Instance-level f-score: 65.13 (strict) -Instance-level f-score: 69.38 (soft) -Instance-level f-score: 81.13 (Levenshtein) -Instance-level f-score: 75.91 (RatcliffObershelp) +Instance-level f-score: 65.02 (strict) +Instance-level f-score: 69.38 (soft) +Instance-level f-score: 81.13 (Levenshtein) +Instance-level f-score: 75.92 (RatcliffObershelp) -Matching 1 : 58594 +Matching 1 : 58505 -Matching 2 : 1015 +Matching 2 : 1012 -Matching 3 : 1241 +Matching 3 : 1242 -Matching 4 : 367 +Matching 4 : 371 -Total matches : 61217 +Total matches : 61130 ``` #### Citation context resolution ``` -Total expected references: 63664 - 64.7 references per article -Total predicted references: 66480 - 67.56 references per article +Total expected 
references: 63569 - 64.67 references per article +Total predicted references: 66388 - 67.54 references per article -Total expected citation contexts: 109022 - 110.79 citation contexts per article -Total predicted citation contexts: 99415 - 101.03 citation contexts per article +Total expected citation contexts: 108880 - 110.76 citation contexts per article +Total predicted citation contexts: 99284 - 101 citation contexts per article -Total correct predicted citation contexts: 95626 - 97.18 citation contexts per article -Total wrong predicted citation contexts: 3789 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 95494 - 97.15 citation contexts per article +Total wrong predicted citation contexts: 3790 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) -Precision citation contexts: 96.19 -Recall citation contexts: 87.71 -fscore citation contexts: 91.76 +Precision citation contexts: 96.18 +Recall citation contexts: 87.71 +fscore citation contexts: 91.75 ``` @@ -252,7 +253,7 @@ fscore citation contexts: 91.76 Fulltext structure contents are complicated to capture from JATS NLM files. They are often normalized and different from the actual PDF content and are can be inconsistent from one document to another. The scores of the following metrics are thus not very meaningful in absolute term, in particular for the strict matching (textual content of the srtructure can be very long). As relative values for comparing different models, they seem however useful. -Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). +Evaluation on 983 random PDF files out of 982 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -260,17 +261,17 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| availability_stmt | 28.79 | 25.64 | 27.12 | 585 | -| figure_title | 0.02 | 0.01 | 0.01 | 31718 | -| funding_stmt | 12.03 | 24 | 16.03 | 921 | -| reference_citation | 55.45 | 55.66 | 55.56 | 108949 | -| reference_figure | 56.74 | 49.86 | 53.08 | 68926 | -| reference_table | 68.27 | 73.46 | 70.77 | 2381 | -| section_title | 85.2 | 74.2 | 79.32 | 21831 | -| table_title | 0.45 | 0.16 | 0.23 | 1925 | +| availability_stmt | 29.94 | 26.71 | 28.24 | 584 | +| figure_title | 0.02 | 0.01 | 0.01 | 31671 | +| funding_stmt | 4.77 | 23.8 | 7.95 | 920 | +| reference_citation | 55.46 | 55.67 | 55.56 | 108807 | +| reference_figure | 56.78 | 49.91 | 53.12 | 68786 | +| reference_table | 68.24 | 73.46 | 70.75 | 2381 | +| section_title | 85.17 | 74.17 | 79.29 | 21808 | +| table_title | 0.45 | 0.16 | 0.23 | 1924 | | | | | | | -| **all fields (micro avg.)** | **55.46** | **47.77** | **51.33** | 237236 | -| all fields (macro avg.) | 38.37 | 37.87 | 37.76 | 237236 | +| **all fields (micro avg.)** | **54.74** | **47.79** | **51.03** | 236881 | +| all fields (macro avg.) | 37.6 | 37.99 | 36.89 | 236881 | @@ -280,26 +281,29 @@ Evaluation on 984 random PDF files out of 984 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| availability_stmt | 38.77 | 34.53 | 36.53 | 585 | -| figure_title | 48.83 | 15.12 | 23.09 | 31718 | -| funding_stmt | 12.03 | 24 | 16.03 | 921 | -| reference_citation | 91.02 | 91.37 | 91.2 | 108949 | -| reference_figure | 57.02 | 50.1 | 53.34 | 68926 | -| reference_table | 68.35 | 73.54 | 70.85 | 2381 | -| section_title | 86.08 | 74.96 | 80.13 | 21831 | -| table_title | 80.63 | 27.9 | 41.45 | 1925 | +| availability_stmt | 38.96 | 34.76 | 36.74 | 584 | +| figure_title | 48.86 | 15.12 | 23.09 | 31671 | +| funding_stmt | 4.77 | 23.8 | 7.95 | 920 | +| reference_citation | 91.04 | 91.38 | 91.21 | 108807 | +| reference_figure | 57.06 | 50.16 | 53.39 | 68786 | +| reference_table | 68.32 | 73.54 | 70.83 | 2381 | +| section_title | 86.05 | 74.93 | 80.1 | 21808 | +| table_title | 80.63 | 27.91 | 41.47 | 1924 | | | | | | | -| **all fields (micro avg.)** | **77.3** | **66.58** | **71.54** | 237236 | -| all fields (macro avg.) | 60.34 | 48.94 | 51.58 | 237236 | +| **all fields (micro avg.)** | **76.29** | **66.6** | **71.12** | 236881 | +| all fields (macro avg.) | 59.46 | 48.95 | 50.6 | 236881 | **Document-level ratio results** | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| availability_stmt | 96.3 | 89.06 | 92.54 | 585 | +| availability_stmt | 96.3 | 89.21 | 92.62 | 584 | | | | | | | -| **all fields (micro avg.)** | **96.3** | **89.06** | **92.54** | 585 | -| all fields (macro avg.) | 96.3 | 89.06 | 92.54 | 585 | +| **all fields (micro avg.)** | **96.3** | **89.21** | **92.62** | 584 | +| all fields (macro avg.) 
| 96.3 | 89.21 | 92.62 | 584 | + +Evaluation metrics produced in 640.707 seconds + diff --git a/doc/Benchmarking-plos.md b/doc/Benchmarking-plos.md index c3920a13b3..c6a71b2df3 100644 --- a/doc/Benchmarking-plos.md +++ b/doc/Benchmarking-plos.md @@ -2,7 +2,7 @@ ## General -This is the end-to-end benchmarking result for GROBID version **0.7.3** against the `PLOS` test set, see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. +This is the end-to-end benchmarking result for GROBID version **0.8.1** against the `PLOS` test set, see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. The following end-to-end results are using: @@ -22,14 +22,14 @@ Other versions of these benchmarks with variants and **Deep Learning models** (e Evaluation on 1000 PDF preprints out of 1000 (no failure). -Runtime for processing 1000 PDF: **1831s** (1.83 second per PDF) on Ubuntu 16.04, 4 CPU i7-4790K (8 threads), 16GB RAM (workstation bought in 2015 for 1600 euros) and with a GeForce GTX 1050 Ti GPU. +Runtime for processing 1000 PDF: **999** seconds, (0.99 seconds per PDF) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU. -Note: with CRF only models runtime is 304s (0.30 seconds per PDF). +Note: with CRF only models runtime is 304s (0.30 seconds per PDF) with 4GPU, 8 threads. ## Header metadata -Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). +Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -38,13 +38,15 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 13.58 | 13.65 | 13.61 | 960 | -| authors | 98.97 | 99.07 | 99.02 | 969 | -| first_author | 99.28 | 99.38 | 99.33 | 969 | +| authors | 98.87 | 98.97 | 98.92 | 969 | +| first_author | 99.18 | 99.28 | 99.23 | 969 | | keywords | 0 | 0 | 0 | 0 | -| title | 95.65 | 94.5 | 95.07 | 1000 | +| title | 95.75 | 94.6 | 95.17 | 1000 | | | | | | | -| **all fields (micro avg.)** | **77.04** | **76.94** | **76.99** | 3898 | -| all fields (macro avg.) | 76.87 | 76.65 | 76.76 | 3898 | +| **all fields (micro avg.)** | **77.01** | **76.91** | **76.96** | 3898 | +| all fields (macro avg.) | 76.84 | 76.62 | 76.73 | 3898 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -53,13 +55,15 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 50.57 | 50.83 | 50.7 | 960 | -| authors | 98.97 | 99.07 | 99.02 | 969 | -| first_author | 99.28 | 99.38 | 99.33 | 969 | +| authors | 98.87 | 98.97 | 98.92 | 969 | +| first_author | 99.18 | 99.28 | 99.23 | 969 | | keywords | 0 | 0 | 0 | 0 | -| title | 99.29 | 98.1 | 98.69 | 1000 | +| title | 99.39 | 98.2 | 98.79 | 1000 | | | | | | | -| **all fields (micro avg.)** | **87.13** | **87.02** | **87.07** | 3898 | -| all fields (macro avg.) | 87.03 | 86.85 | 86.94 | 3898 | +| **all fields (micro avg.)** | **87.11** | **86.99** | **87.05** | 3898 | +| all fields (macro avg.) | 87 | 86.82 | 86.91 | 3898 | + + #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -69,12 +73,14 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). 
|--- |--- |--- |--- |--- | | abstract | 76.68 | 77.08 | 76.88 | 960 | | authors | 99.28 | 99.38 | 99.33 | 969 | -| first_author | 99.38 | 99.48 | 99.43 | 969 | +| first_author | 99.28 | 99.38 | 99.33 | 969 | | keywords | 0 | 0 | 0 | 0 | | title | 99.7 | 98.5 | 99.09 | 1000 | | | | | | | -| **all fields (micro avg.)** | **93.81** | **93.69** | **93.75** | 3898 | -| all fields (macro avg.) | 93.76 | 93.61 | 93.68 | 3898 | +| **all fields (micro avg.)** | **93.78** | **93.66** | **93.72** | 3898 | +| all fields (macro avg.) | 93.73 | 93.59 | 93.66 | 3898 | + + #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -84,32 +90,33 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). |--- |--- |--- |--- |--- | | abstract | 66.94 | 67.29 | 67.12 | 960 | | authors | 99.18 | 99.28 | 99.23 | 969 | -| first_author | 99.28 | 99.38 | 99.33 | 969 | +| first_author | 99.18 | 99.28 | 99.23 | 969 | | keywords | 0 | 0 | 0 | 0 | | title | 99.49 | 98.3 | 98.89 | 1000 | | | | | | | -| **all fields (micro avg.)** | **91.29** | **91.17** | **91.23** | 3898 | -| all fields (macro avg.) | 91.22 | 91.06 | 91.14 | 3898 | +| **all fields (micro avg.)** | **91.27** | **91.15** | **91.21** | 3898 | +| all fields (macro avg.) 
| 91.2 | 91.04 | 91.12 | 3898 | #### Instance-level results ``` -Total expected instances: 1000 -Total correct instances: 139 (strict) -Total correct instances: 488 (soft) -Total correct instances: 727 (Levenshtein) -Total correct instances: 643 (ObservedRatcliffObershelp) - -Instance-level recall: 13.9 (strict) -Instance-level recall: 48.8 (soft) -Instance-level recall: 72.7 (Levenshtein) -Instance-level recall: 64.3 (RatcliffObershelp) +Total expected instances: 1000 +Total correct instances: 139 (strict) +Total correct instances: 487 (soft) +Total correct instances: 726 (Levenshtein) +Total correct instances: 642 (ObservedRatcliffObershelp) + +Instance-level recall: 13.9 (strict) +Instance-level recall: 48.7 (soft) +Instance-level recall: 72.6 (Levenshtein) +Instance-level recall: 64.2 (RatcliffObershelp) ``` + ## Citation metadata -Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). +Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -117,17 +124,19 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 81.18 | 78.43 | 79.78 | 44770 | -| date | 84.62 | 81.24 | 82.9 | 45457 | -| first_author | 91.48 | 88.35 | 89.88 | 44770 | -| inTitle | 81.67 | 83.57 | 82.61 | 42795 | -| issue | 93.63 | 92.68 | 93.15 | 18983 | -| page | 93.69 | 77.57 | 84.87 | 40844 | -| title | 60.02 | 60.53 | 60.28 | 43101 | +| authors | 81.17 | 78.43 | 79.78 | 44770 | +| date | 84.61 | 81.24 | 82.89 | 45457 | +| first_author | 91.47 | 88.34 | 89.88 | 44770 | +| inTitle | 81.67 | 83.58 | 82.61 | 42795 | +| issue | 93.62 | 92.68 | 93.15 | 18983 | +| page | 93.7 | 77.57 | 84.87 | 40844 | +| title | 59.97 | 60.47 | 60.22 | 43101 | | volume | 95.89 | 96.11 | 96 | 40458 | | | | | | | -| **all fields (micro avg.)** | **84.24** | **81.45** | **82.82** | 321178 | -| all fields (macro avg.) 
| 85.27 | 82.31 | 83.68 | 321178 | +| **all fields (micro avg.)** | **84.23** | **81.45** | **82.81** | 321178 | +| all fields (macro avg.) | 85.26 | 82.3 | 83.67 | 321178 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -136,34 +145,38 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | authors | 81.49 | 78.73 | 80.09 | 44770 | -| date | 84.62 | 81.24 | 82.9 | 45457 | -| first_author | 91.69 | 88.56 | 90.1 | 44770 | +| date | 84.61 | 81.24 | 82.89 | 45457 | +| first_author | 91.69 | 88.55 | 90.09 | 44770 | | inTitle | 85.51 | 87.5 | 86.49 | 42795 | -| issue | 93.63 | 92.68 | 93.15 | 18983 | -| page | 93.69 | 77.57 | 84.87 | 40844 | -| title | 91.97 | 92.75 | 92.36 | 43101 | +| issue | 93.62 | 92.68 | 93.15 | 18983 | +| page | 93.7 | 77.57 | 84.87 | 40844 | +| title | 91.95 | 92.74 | 92.34 | 43101 | | volume | 95.89 | 96.11 | 96 | 40458 | | | | | | | -| **all fields (micro avg.)** | **89.33** | **86.37** | **87.82** | 321178 | +| **all fields (micro avg.)** | **89.32** | **86.37** | **87.82** | 321178 | | all fields (macro avg.) 
| 89.81 | 86.89 | 88.24 | 321178 | + + #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) **Field-level results** | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 90.65 | 87.58 | 89.08 | 44770 | -| date | 84.62 | 81.24 | 82.9 | 45457 | -| first_author | 92.23 | 89.08 | 90.63 | 44770 | -| inTitle | 86.46 | 88.47 | 87.45 | 42795 | -| issue | 93.63 | 92.68 | 93.15 | 18983 | -| page | 93.69 | 77.57 | 84.87 | 40844 | -| title | 94.57 | 95.37 | 94.97 | 43101 | +| authors | 90.64 | 87.57 | 89.08 | 44770 | +| date | 84.61 | 81.24 | 82.89 | 45457 | +| first_author | 92.23 | 89.08 | 90.62 | 44770 | +| inTitle | 86.45 | 88.47 | 87.45 | 42795 | +| issue | 93.62 | 92.68 | 93.15 | 18983 | +| page | 93.7 | 77.57 | 84.87 | 40844 | +| title | 94.56 | 95.37 | 94.96 | 43101 | | volume | 95.89 | 96.11 | 96 | 40458 | | | | | | | | **all fields (micro avg.)** | **91.17** | **88.16** | **89.64** | 321178 | -| all fields (macro avg.) | 91.47 | 88.51 | 89.88 | 321178 | +| all fields (macro avg.) | 91.46 | 88.51 | 89.88 | 321178 | + + #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -171,12 +184,12 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 84.94 | 82.06 | 83.47 | 44770 | -| date | 84.62 | 81.24 | 82.9 | 45457 | -| first_author | 91.48 | 88.35 | 89.88 | 44770 | -| inTitle | 85.16 | 87.14 | 86.14 | 42795 | -| issue | 93.63 | 92.68 | 93.15 | 18983 | -| page | 93.69 | 77.57 | 84.87 | 40844 | +| authors | 84.94 | 82.06 | 83.48 | 44770 | +| date | 84.61 | 81.24 | 82.89 | 45457 | +| first_author | 91.47 | 88.34 | 89.88 | 44770 | +| inTitle | 85.16 | 87.15 | 86.14 | 42795 | +| issue | 93.62 | 92.68 | 93.15 | 18983 | +| page | 93.7 | 77.57 | 84.87 | 40844 | | title | 93.95 | 94.74 | 94.34 | 43101 | | volume | 95.89 | 96.11 | 96 | 40458 | | | | | | | @@ -187,55 +200,55 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). #### Instance-level results ``` -Total expected instances: 48449 -Total extracted instances: 48250 -Total correct instances: 13512 (strict) -Total correct instances: 22263 (soft) -Total correct instances: 24909 (Levenshtein) -Total correct instances: 23261 (RatcliffObershelp) +Total expected instances: 48449 +Total extracted instances: 48250 +Total correct instances: 13496 (strict) +Total correct instances: 22269 (soft) +Total correct instances: 24916 (Levenshtein) +Total correct instances: 23272 (RatcliffObershelp) -Instance-level precision: 28 (strict) -Instance-level precision: 46.14 (soft) -Instance-level precision: 51.62 (Levenshtein) -Instance-level precision: 48.21 (RatcliffObershelp) +Instance-level precision: 27.97 (strict) +Instance-level precision: 46.15 (soft) +Instance-level precision: 51.64 (Levenshtein) +Instance-level precision: 48.23 (RatcliffObershelp) -Instance-level recall: 27.89 (strict) -Instance-level recall: 45.95 (soft) -Instance-level recall: 51.41 (Levenshtein) -Instance-level recall: 48.01 (RatcliffObershelp) +Instance-level recall: 27.86 (strict) +Instance-level recall: 45.96 (soft) +Instance-level recall: 51.43 (Levenshtein) +Instance-level recall: 48.03 
(RatcliffObershelp) -Instance-level f-score: 27.95 (strict) -Instance-level f-score: 46.05 (soft) -Instance-level f-score: 51.52 (Levenshtein) -Instance-level f-score: 48.11 (RatcliffObershelp) +Instance-level f-score: 27.91 (strict) +Instance-level f-score: 46.06 (soft) +Instance-level f-score: 51.53 (Levenshtein) +Instance-level f-score: 48.13 (RatcliffObershelp) -Matching 1 : 35372 +Matching 1 : 35369 -Matching 2 : 1257 +Matching 2 : 1260 -Matching 3 : 3268 +Matching 3 : 3266 -Matching 4 : 1799 +Matching 4 : 1800 -Total matches : 41696 +Total matches : 41695 ``` #### Citation context resolution ``` -Total expected references: 48449 - 48.45 references per article -Total predicted references: 48250 - 48.25 references per article +Total expected references: 48449 - 48.45 references per article +Total predicted references: 48250 - 48.25 references per article -Total expected citation contexts: 69755 - 69.75 citation contexts per article -Total predicted citation contexts: 73696 - 73.7 citation contexts per article +Total expected citation contexts: 69755 - 69.75 citation contexts per article +Total predicted citation contexts: 73696 - 73.7 citation contexts per article -Total correct predicted citation contexts: 56769 - 56.77 citation contexts per article -Total wrong predicted citation contexts: 16927 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 56772 - 56.77 citation contexts per article +Total wrong predicted citation contexts: 16924 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. 
in NLM) -Precision citation contexts: 77.03 -Recall citation contexts: 81.38 -fscore citation contexts: 79.15 +Precision citation contexts: 77.04 +Recall citation contexts: 81.39 +fscore citation contexts: 79.15 ``` @@ -243,7 +256,8 @@ fscore citation contexts: 79.15 Fulltext structure contents are complicated to capture from JATS NLM files. They are often normalized and different from the actual PDF content and are can be inconsistent from one document to another. The scores of the following metrics are thus not very meaningful in absolute term, in particular for the strict matching (textual content of the srtructure can be very long). As relative values for comparing different models, they seem however useful. -Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). + +Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -251,17 +265,19 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| availability_stmt | 50.87 | 49.04 | 49.93 | 779 | +| availability_stmt | 54.06 | 52.12 | 53.07 | 779 | | figure_title | 2.11 | 0.92 | 1.28 | 8943 | -| funding_stmt | 49.49 | 35.17 | 41.12 | 1507 | +| funding_stmt | 5.27 | 28.14 | 8.88 | 1507 | | reference_citation | 86.69 | 94.65 | 90.49 | 69741 | -| reference_figure | 72.05 | 54.06 | 61.77 | 11010 | +| reference_figure | 72.06 | 54.06 | 61.77 | 11010 | | reference_table | 84.28 | 92.07 | 88 | 5159 | -| section_title | 77.18 | 65.8 | 71.04 | 17540 | -| table_title | 1.13 | 0.59 | 0.78 | 6092 | +| section_title | 77.18 | 65.8 | 71.03 | 17540 | +| table_title | 1.13 | 0.59 | 0.77 | 6092 | | | | | | | -| **all fields (micro avg.)** | **78.38** | **73.93** | **76.09** | 120771 | -| all fields (macro avg.) | 52.97 | 49.04 | 50.55 | 120771 | +| **all fields (micro avg.)** | **73.79** | **73.86** | **73.82** | 120771 | +| all fields (macro avg.) 
| 47.85 | 48.54 | 46.91 | 120771 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -270,16 +286,17 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | availability_stmt | 79.36 | 76.51 | 77.91 | 779 | -| figure_title | 81.2 | 35.36 | 49.26 | 8943 | -| funding_stmt | 56.4 | 40.08 | 46.86 | 1507 | -| reference_citation | 86.7 | 94.66 | 90.5 | 69741 | -| reference_figure | 72.51 | 54.41 | 62.17 | 11010 | +| figure_title | 81.17 | 35.33 | 49.24 | 8943 | +| funding_stmt | 6.89 | 36.76 | 11.6 | 1507 | +| reference_citation | 86.7 | 94.66 | 90.51 | 69741 | +| reference_figure | 72.52 | 54.41 | 62.17 | 11010 | | reference_table | 84.46 | 92.27 | 88.19 | 5159 | -| section_title | 78.17 | 66.65 | 71.96 | 17540 | -| table_title | 15.98 | 8.39 | 11 | 6092 | +| section_title | 78.17 | 66.65 | 71.95 | 17540 | +| table_title | 15.97 | 8.39 | 11 | 6092 | | | | | | | -| **all fields (micro avg.)** | **81.93** | **77.28** | **79.54** | 120771 | -| all fields (macro avg.) | 69.35 | 58.54 | 62.23 | 120771 | +| **all fields (micro avg.)** | **77.16** | **77.24** | **77.2** | 120771 | +| all fields (macro avg.) | 63.16 | 58.12 | 57.82 | 120771 | + **Document-level ratio results** @@ -290,5 +307,5 @@ Evaluation on 1000 random PDF files out of 1000 PDF (ratio 1.0). | **all fields (micro avg.)** | **99.47** | **96.41** | **97.91** | 779 | | all fields (macro avg.) 
| 99.47 | 96.41 | 97.91 | 779 | -Evaluation metrics produced in 555.701 seconds +Evaluation metrics produced in 396.908 seconds diff --git a/doc/Benchmarking-pmc.md b/doc/Benchmarking-pmc.md index adfc7ed7d4..83b0ecb2a9 100644 --- a/doc/Benchmarking-pmc.md +++ b/doc/Benchmarking-pmc.md @@ -2,7 +2,7 @@ ## General -This is the end-to-end benchmarking result for GROBID version **0.7.3** against the `PMC_sample_1943` dataset, see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. +This is the end-to-end benchmarking result for GROBID version **0.8.1** against the `PMC_sample_1943` dataset, see the [End-to-end evaluation](End-to-end-evaluation.md) page for explanations and for reproducing this evaluation. The following end-to-end results are using: @@ -22,13 +22,15 @@ Other versions of these benchmarks with variants and **Deep Learning models** (e Evaluation on 1943 random PDF PMC files out of 1943 PDF from 1943 different journals (0 PDF parsing failure). -Runtime for processing 1943 PDF: **2871s** (1.4s per PDF) on Ubuntu 16.04, 4 CPU i7-4790K (8 threads), 16GB RAM (workstation bought in 2015 for 1600 euros) and with a GeForce GTX 1050 Ti GPU. +Runtime for processing 1943 PDF: **1467** seconds, (0.75s per PDF) on Ubuntu 22.04, 16 CPU (32 threads), 128GB RAM and with a GeForce GTX 1080 Ti GPU. + +Note: with CRF only models, runtime is 470s (0.24 seconds per PDF) with 4GPU, 8 threads. + -Note: with CRF only models, runtime is 470s (0.24 seconds per PDF). ## Header metadata -Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -37,13 +39,15 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 16.78 | 16.48 | 16.63 | 1911 | -| authors | 89.99 | 89.9 | 89.95 | 1941 | -| first_author | 96.65 | 96.55 | 96.6 | 1941 | +| authors | 92.01 | 91.91 | 91.96 | 1941 | +| first_author | 96.7 | 96.6 | 96.65 | 1941 | | keywords | 64.99 | 63.62 | 64.3 | 1380 | -| title | 85.39 | 85.13 | 85.26 | 1943 | +| title | 84.67 | 84.41 | 84.54 | 1943 | | | | | | | -| **all fields (micro avg.)** | **71.5** | **70.93** | **71.22** | 9116 | -| all fields (macro avg.) | 70.76 | 70.34 | 70.55 | 9116 | +| **all fields (micro avg.)** | **71.79** | **71.22** | **71.5** | 9116 | +| all fields (macro avg.) | 71.03 | 70.6 | 70.81 | 9116 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -52,13 +56,15 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 63.83 | 62.69 | 63.25 | 1911 | -| authors | 90.87 | 90.78 | 90.82 | 1941 | -| first_author | 96.85 | 96.75 | 96.8 | 1941 | -| keywords | 73.65 | 72.1 | 72.87 | 1380 | -| title | 93.86 | 93.57 | 93.71 | 1943 | +| authors | 93.91 | 93.82 | 93.87 | 1941 | +| first_author | 97.06 | 96.96 | 97.01 | 1941 | +| keywords | 73.72 | 72.17 | 72.94 | 1380 | +| title | 92.15 | 91.87 | 92.01 | 1943 | | | | | | | -| **all fields (micro avg.)** | **84.61** | **83.93** | **84.27** | 9116 | -| all fields (macro avg.) | 83.81 | 83.18 | 83.49 | 9116 | +| **all fields (micro avg.)** | **84.95** | **84.27** | **84.61** | 9116 | +| all fields (macro avg.) | 84.14 | 83.5 | 83.82 | 9116 | + + #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -67,13 +73,15 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 91.05 | 89.43 | 90.23 | 1911 | -| authors | 95 | 94.9 | 94.95 | 1941 | -| first_author | 97.06 | 96.96 | 97.01 | 1941 | +| authors | 96.08 | 95.98 | 96.03 | 1941 | +| first_author | 97.32 | 97.22 | 97.27 | 1941 | | keywords | 84.16 | 82.39 | 83.27 | 1380 | -| title | 98.86 | 98.56 | 98.71 | 1943 | +| title | 98.35 | 98.04 | 98.2 | 1943 | | | | | | | -| **all fields (micro avg.)** | **93.83** | **93.08** | **93.45** | 9116 | -| all fields (macro avg.) | 93.23 | 92.45 | 92.83 | 9116 | +| **all fields (micro avg.)** | **94.01** | **93.25** | **93.63** | 9116 | +| all fields (macro avg.) | 93.39 | 92.61 | 93 | 9116 | + + #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -82,32 +90,34 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | | abstract | 87.11 | 85.56 | 86.33 | 1911 | -| authors | 92.88 | 92.79 | 92.84 | 1941 | -| first_author | 96.65 | 96.55 | 96.6 | 1941 | -| keywords | 79.42 | 77.75 | 78.58 | 1380 | -| title | 97.37 | 97.07 | 97.22 | 1943 | +| authors | 94.95 | 94.85 | 94.9 | 1941 | +| first_author | 96.7 | 96.6 | 96.65 | 1941 | +| keywords | 79.5 | 77.83 | 78.65 | 1380 | +| title | 96.33 | 96.04 | 96.19 | 1943 | | | | | | | -| **all fields (micro avg.)** | **91.44** | **90.71** | **91.07** | 9116 | -| all fields (macro avg.) | 90.69 | 89.94 | 90.31 | 9116 | +| **all fields (micro avg.)** | **91.68** | **90.95** | **91.32** | 9116 | +| all fields (macro avg.) 
| 90.92 | 90.17 | 90.54 | 9116 | + #### Instance-level results ``` -Total expected instances: 1943 -Total correct instances: 220 (strict) -Total correct instances: 881 (soft) -Total correct instances: 1414 (Levenshtein) -Total correct instances: 1272 (ObservedRatcliffObershelp) - -Instance-level recall: 11.32 (strict) -Instance-level recall: 45.34 (soft) -Instance-level recall: 72.77 (Levenshtein) -Instance-level recall: 65.47 (RatcliffObershelp) +Total expected instances: 1943 +Total correct instances: 219 (strict) +Total correct instances: 904 (soft) +Total correct instances: 1434 (Levenshtein) +Total correct instances: 1294 (ObservedRatcliffObershelp) + +Instance-level recall: 11.27 (strict) +Instance-level recall: 46.53 (soft) +Instance-level recall: 73.8 (Levenshtein) +Instance-level recall: 66.6 (RatcliffObershelp) ``` + ## Citation metadata -Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -115,17 +125,19 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 83.04 | 76.32 | 79.54 | 85778 | -| date | 94.6 | 84.24 | 89.12 | 87067 | +| authors | 83.03 | 76.31 | 79.53 | 85778 | +| date | 94.6 | 84.25 | 89.13 | 87067 | | first_author | 89.78 | 82.49 | 85.98 | 85778 | -| inTitle | 73.22 | 71.87 | 72.54 | 81007 | -| issue | 91.1 | 87.74 | 89.39 | 16635 | -| page | 94.57 | 83.69 | 88.8 | 80501 | -| title | 79.91 | 75.52 | 77.65 | 80736 | -| volume | 96.02 | 89.81 | 92.81 | 80067 | +| inTitle | 73.23 | 71.88 | 72.55 | 81007 | +| issue | 91.09 | 87.74 | 89.38 | 16635 | +| page | 94.57 | 83.7 | 88.81 | 80501 | +| title | 79.67 | 75.3 | 77.42 | 80736 | +| volume | 96.01 | 89.82 | 92.81 | 80067 | | | | | | | -| **all fields (micro avg.)** | **87.25** | **80.77** | **83.88** | 597569 | -| all fields (macro avg.) 
| 87.78 | 81.46 | 84.48 | 597569 | +| **all fields (micro avg.)** | **87.22** | **80.74** | **83.86** | 597569 | +| all fields (macro avg.) | 87.75 | 81.44 | 84.45 | 597569 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -133,17 +145,19 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 83.51 | 76.75 | 79.99 | 85778 | -| date | 94.6 | 84.24 | 89.12 | 87067 | -| first_author | 89.95 | 82.65 | 86.15 | 85778 | -| inTitle | 84.92 | 83.34 | 84.13 | 81007 | -| issue | 91.1 | 87.74 | 89.39 | 16635 | -| page | 94.57 | 83.69 | 88.8 | 80501 | -| title | 91.44 | 86.42 | 88.86 | 80736 | -| volume | 96.02 | 89.81 | 92.81 | 80067 | +| authors | 83.5 | 76.75 | 79.98 | 85778 | +| date | 94.6 | 84.25 | 89.13 | 87067 | +| first_author | 89.95 | 82.65 | 86.14 | 85778 | +| inTitle | 84.92 | 83.36 | 84.13 | 81007 | +| issue | 91.09 | 87.74 | 89.38 | 16635 | +| page | 94.57 | 83.7 | 88.81 | 80501 | +| title | 91.43 | 86.42 | 88.86 | 80736 | +| volume | 96.01 | 89.82 | 92.81 | 80067 | | | | | | | -| **all fields (micro avg.)** | **90.62** | **83.88** | **87.12** | 597569 | -| all fields (macro avg.) | 90.76 | 84.33 | 87.4 | 597569 | +| **all fields (micro avg.)** | **90.61** | **83.89** | **87.12** | 597569 | +| all fields (macro avg.) | 90.76 | 84.34 | 87.41 | 597569 | + + #### Levenshtein Matching (Minimum Levenshtein distance at 0.8) @@ -151,17 +165,19 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 89.22 | 81.99 | 85.45 | 85778 | -| date | 94.6 | 84.24 | 89.12 | 87067 | -| first_author | 90.16 | 82.84 | 86.34 | 85778 | -| inTitle | 86.17 | 84.57 | 85.37 | 81007 | -| issue | 91.1 | 87.74 | 89.39 | 16635 | -| page | 94.57 | 83.69 | 88.8 | 80501 | -| title | 93.8 | 88.65 | 91.15 | 80736 | -| volume | 96.02 | 89.81 | 92.81 | 80067 | +| authors | 89.21 | 81.99 | 85.45 | 85778 | +| date | 94.6 | 84.25 | 89.13 | 87067 | +| first_author | 90.15 | 82.84 | 86.34 | 85778 | +| inTitle | 86.18 | 84.59 | 85.38 | 81007 | +| issue | 91.09 | 87.74 | 89.38 | 16635 | +| page | 94.57 | 83.7 | 88.81 | 80501 | +| title | 93.8 | 88.66 | 91.15 | 80736 | +| volume | 96.01 | 89.82 | 92.81 | 80067 | | | | | | | -| **all fields (micro avg.)** | **91.96** | **85.13** | **88.41** | 597569 | -| all fields (macro avg.) | 91.95 | 85.44 | 88.55 | 597569 | +| **all fields (micro avg.)** | **91.96** | **85.14** | **88.42** | 597569 | +| all fields (macro avg.) | 91.95 | 85.45 | 88.56 | 597569 | + + #### Ratcliff/Obershelp Matching (Minimum Ratcliff/Obershelp similarity at 0.95) @@ -169,77 +185,80 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| authors | 85.99 | 79.03 | 82.36 | 85778 | -| date | 94.6 | 84.24 | 89.12 | 87067 | +| authors | 85.98 | 79.02 | 82.35 | 85778 | +| date | 94.6 | 84.25 | 89.13 | 87067 | | first_author | 89.8 | 82.51 | 86 | 85778 | -| inTitle | 83.49 | 81.94 | 82.71 | 81007 | -| issue | 91.1 | 87.74 | 89.39 | 16635 | -| page | 94.57 | 83.69 | 88.8 | 80501 | -| title | 93.39 | 88.26 | 90.75 | 80736 | -| volume | 96.02 | 89.81 | 92.81 | 80067 | +| inTitle | 83.49 | 81.95 | 82.72 | 81007 | +| issue | 91.09 | 87.74 | 89.38 | 16635 | +| page | 94.57 | 83.7 | 88.81 | 80501 | +| title | 93.39 | 88.27 | 90.76 | 80736 | +| volume | 96.01 | 89.82 | 92.81 | 80067 | | | | | | | -| **all fields (micro avg.)** | **91.01** | **84.24** | **87.5** | 597569 | -| all fields (macro avg.) | 91.12 | 84.65 | 87.74 | 597569 | +| **all fields (micro avg.)** | **91.01** | **84.25** | **87.5** | 597569 | +| all fields (macro avg.) | 91.12 | 84.66 | 87.74 | 597569 | + #### Instance-level results ``` -Total expected instances: 90125 -Total extracted instances: 85893 -Total correct instances: 38882 (strict) -Total correct instances: 50895 (soft) -Total correct instances: 55770 (Levenshtein) -Total correct instances: 52316 (RatcliffObershelp) +Total expected instances: 90125 +Total extracted instances: 85902 +Total correct instances: 38762 (strict) +Total correct instances: 50900 (soft) +Total correct instances: 55783 (Levenshtein) +Total correct instances: 52319 (RatcliffObershelp) -Instance-level precision: 45.27 (strict) -Instance-level precision: 59.25 (soft) -Instance-level precision: 64.93 (Levenshtein) -Instance-level precision: 60.91 (RatcliffObershelp) +Instance-level precision: 45.12 (strict) +Instance-level precision: 59.25 (soft) +Instance-level precision: 64.94 (Levenshtein) +Instance-level precision: 60.91 (RatcliffObershelp) -Instance-level recall: 43.14 (strict) -Instance-level recall: 56.47 (soft) -Instance-level recall: 
61.88 (Levenshtein) -Instance-level recall: 58.05 (RatcliffObershelp) +Instance-level recall: 43.01 (strict) +Instance-level recall: 56.48 (soft) +Instance-level recall: 61.9 (Levenshtein) +Instance-level recall: 58.05 (RatcliffObershelp) -Instance-level f-score: 44.18 (strict) -Instance-level f-score: 57.83 (soft) -Instance-level f-score: 63.37 (Levenshtein) -Instance-level f-score: 59.44 (RatcliffObershelp) +Instance-level f-score: 44.04 (strict) +Instance-level f-score: 57.83 (soft) +Instance-level f-score: 63.38 (Levenshtein) +Instance-level f-score: 59.44 (RatcliffObershelp) -Matching 1 : 68320 +Matching 1 : 68328 -Matching 2 : 4150 +Matching 2 : 4154 -Matching 3 : 1866 +Matching 3 : 1863 -Matching 4 : 665 +Matching 4 : 662 -Total matches : 75001 +Total matches : 75007 ``` + #### Citation context resolution ``` -Total expected references: 90125 - 46.38 references per article -Total predicted references: 85893 - 44.21 references per article +Total expected references: 90125 - 46.38 references per article +Total predicted references: 85902 - 44.21 references per article -Total expected citation contexts: 139835 - 71.97 citation contexts per article -Total predicted citation contexts: 115367 - 59.38 citation contexts per article +Total expected citation contexts: 139835 - 71.97 citation contexts per article +Total predicted citation contexts: 115373 - 59.38 citation contexts per article -Total correct predicted citation contexts: 97270 - 50.06 citation contexts per article -Total wrong predicted citation contexts: 18097 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) +Total correct predicted citation contexts: 97277 - 50.07 citation contexts per article +Total wrong predicted citation contexts: 18096 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. 
in NLM) -Precision citation contexts: 84.31 -Recall citation contexts: 69.56 -fscore citation contexts: 76.23 +Precision citation contexts: 84.32 +Recall citation contexts: 69.57 +fscore citation contexts: 76.23 ``` + ## Fulltext structures Fulltext structure contents are complicated to capture from JATS NLM files. They are often normalized and different from the actual PDF content and are can be inconsistent from one document to another. The scores of the following metrics are thus not very meaningful in absolute term, in particular for the strict matching (textual content of the srtructure can be very long). As relative values for comparing different models, they seem however useful. -Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). #### Strict Matching (exact matches) @@ -247,15 +266,17 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). | label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| figure_title | 31.4 | 24.57 | 27.57 | 7281 | +| figure_title | 31.44 | 24.61 | 27.61 | 7281 | | reference_citation | 57.43 | 58.68 | 58.05 | 134196 | -| reference_figure | 61.2 | 65.88 | 63.45 | 19330 | +| reference_figure | 61.21 | 65.9 | 63.47 | 19330 | | reference_table | 83.01 | 88.39 | 85.62 | 7327 | -| section_title | 76.38 | 67.77 | 71.82 | 27619 | -| table_title | 57.29 | 50.29 | 53.56 | 3971 | +| section_title | 76.39 | 67.77 | 71.82 | 27619 | +| table_title | 57.3 | 50.29 | 53.57 | 3971 | | | | | | | -| **all fields (micro avg.)** | **60.4** | **60.31** | **60.36** | 199724 | -| all fields (macro avg.) | 61.12 | 59.26 | 60.01 | 199724 | +| **all fields (micro avg.)** | **60.41** | **60.32** | **60.36** | 199724 | +| all fields (macro avg.) | 61.13 | 59.27 | 60.02 | 199724 | + + #### Soft Matching (ignoring punctuation, case and space characters mismatches) @@ -263,12 +284,15 @@ Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
| label | precision | recall | f1 | support | |--- |--- |--- |--- |--- | -| figure_title | 78.62 | 61.52 | 69.02 | 7281 | +| figure_title | 78.68 | 61.58 | 69.09 | 7281 | | reference_citation | 61.68 | 63.03 | 62.35 | 134196 | -| reference_figure | 61.68 | 66.4 | 63.95 | 19330 | +| reference_figure | 61.69 | 66.41 | 63.97 | 19330 | | reference_table | 83.19 | 88.58 | 85.8 | 7327 | -| section_title | 81.23 | 72.08 | 76.38 | 27619 | -| table_title | 81.87 | 71.87 | 76.55 | 3971 | +| section_title | 81.25 | 72.08 | 76.39 | 27619 | +| table_title | 81.89 | 71.87 | 76.56 | 3971 | | | | | | | -| **all fields (micro avg.)** | **65.76** | **65.66** | **65.71** | 199724 | -| all fields (macro avg.) | 74.71 | 70.58 | 72.34 | 199724 | +| **all fields (micro avg.)** | **65.77** | **65.67** | **65.72** | 199724 | +| all fields (macro avg.) | 74.73 | 70.59 | 72.36 | 199724 | + + + diff --git a/doc/Configuration.md b/doc/Configuration.md index 8ea092594b..342b7778b2 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -85,7 +85,7 @@ CORS for the GROBID web API service can be configurated by the following yaml pa GROBID uses external implementation for recognizing the language used in a publication and for performing sentence disambiguation. -There is currently only one possible language recognition implementation possible (Cybozu Language Detector) and two possible sentence segmenters (OpenNLP, default and the Pragmatic Segmenter). +There is currently only one possible language recognition implementation possible (Cybozu Language Detector) and two possible sentence segmenters (OpenNLP (default) and the Pragmatic Segmenter). 
```yml # the actual implementation for language recognition to be used @@ -95,6 +95,7 @@ There is currently only one possible language recognition implementation possibl #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory" sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory" ``` +**NOTE**: While OpenNLP is 60 times faster than the Pragmatic Segmenter, it performs "slightly" worse. The pragmatic segmenter runs with the JRuby Interpreter. ### Service configuration @@ -121,7 +122,7 @@ When executing the service, models can be loaded in a lazy manner (if you plan t modelPreload: true ``` -Finally the following part specifies the port to be used by the GROBID web service: +Finally, the following part specifies the port to be used by the GROBID web service: ```yml server: diff --git a/doc/Deep-Learning-models.md b/doc/Deep-Learning-models.md index 335e3d3d4f..c3db143110 100644 --- a/doc/Deep-Learning-models.md +++ b/doc/Deep-Learning-models.md @@ -20,7 +20,7 @@ Current neural models can be up to 50 times slower than CRF, depending on the ar By default, only CRF models are used by Grobid. You need to select the Deep Learning models you would like to use in the GROBID configuration yaml file (`grobid/grobid-home/config/grobid.yaml`). See [here](https://grobid.readthedocs.io/en/latest/Configuration/#configuring-the-models) for more details on how to select these models. The most convenient way to use the Deep Learning models is to use the full GROBID Docker image and pass a configuration file at launch of the container describing the selected models to be used instead of the default CRF ones. Note that the full GROBID Docker image is already configured to use Deep Learning models for bibliographical reference and affiliation-address parsing. 
-For current GROBID version 0.8.0, we recommend considering the usage of the following Deep Learning models: +For current GROBID version 0.8.1, we recommend considering the usage of the following Deep Learning models: - `citation` model: for bibliographical parsing, the `BidLSTM_CRF_FEATURES` architecture provides currently the best accuracy, significantly better than CRF (+3 to +5 points in F1-Score). With a GPU, there is normally no runtime impact by selecting this model. SciBERT fine-tuned model performs currently at lower accuracy. @@ -57,7 +57,7 @@ DeLFT version `0.3.2` has been tested successfully with Python 3.7 and 3.8. For ```shell cd deflt/ -python3 grobidTagger.py delft/applications/citation tag --architecture BidLSTM_CRF +python -m delft.applications.grobidTagger citation tag --architecture BidLSTM_CRF ``` If it works (you see some annotations in JSON format), you are sure to have a working DeLFT environment for **all** GROBID models. The next steps address the native bridge between DeLFT and the JVM running GROBID. @@ -98,7 +98,7 @@ If you are using a Python environment for the DeLFT installation, you can set th ```yaml delft: - python_virtualEnv: /where/my/damned/python/virtualenv/is/ + python_virtualEnv: /where/my/damned/python/virtualenv/is/ ``` Normally by setting the Python environment path in the config file (e.g. `pythonVirtualEnv: "../delft/env"`), you will not need to launch GROBID in the same activated environment. diff --git a/doc/Frequently-asked-questions.md b/doc/Frequently-asked-questions.md index 809d98dff0..6fdc5850d9 100644 --- a/doc/Frequently-asked-questions.md +++ b/doc/Frequently-asked-questions.md @@ -56,7 +56,7 @@ In addition, consider more RAM memory when running Deep Learning model on CPU, e You will get the embedded images converted into `.png` by using the normal batch command. 
For instance: ```console -java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn ~/test/in/ -dOut ~/test/out -exe processFullText +java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn ~/test/in/ -dOut ~/test/out -exe processFullText ``` There is a web service doing the same, returning everything in a big zip file, `processFulltextAssetDocument`, still usable but deprecated. diff --git a/doc/Grobid-batch.md b/doc/Grobid-batch.md index bd936935ea..d856126eab 100644 --- a/doc/Grobid-batch.md +++ b/doc/Grobid-batch.md @@ -20,7 +20,7 @@ The following command display some help for the batch commands: Be sure to replace `<current version>` with the current version of GROBID that you have installed and built. For example: ```bash -> java -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -h +> java -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -h ``` The available batch commands are listed bellow. For those commands, at least `-Xmx1G` is used to set the JVM memory to avoid *OutOfMemoryException* given the current size of the Grobid models and the crazyness of some PDF. For complete fulltext processing, which involve all the GROBID models, `-Xmx4G` is recommended (although allocating less memory is usually fine). 
@@ -42,7 +42,7 @@ The needed parameters for that command are: Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -r -exe processHeader +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -r -exe processHeader ``` WARNING: the expected extension of the PDF files to be processed is .pdf @@ -68,7 +68,7 @@ WARNING: the expected extension of the PDF files to be processed is .pdf Example: ```bash -> java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processFullText +> java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processFullText ``` WARNING: the expected extension of the PDF files to be processed is .pdf @@ -82,7 +82,7 @@ WARNING: the expected extension of the PDF files to be processed is .pdf Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -exe processDate -s "some date to extract and format" +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -exe processDate -s "some date to extract and format" ``` ### processAuthorsHeader @@ -94,7 +94,7 @@ Example: Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep 
-jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -exe processAuthorsHeader -s "some authors" +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -exe processAuthorsHeader -s "some authors" ``` ### processAuthorsCitation @@ -106,7 +106,7 @@ Example: Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -exe processAuthorsCitation -s "some authors" +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -exe processAuthorsCitation -s "some authors" ``` ### processAffiliation @@ -118,7 +118,7 @@ Example: Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -exe processAffiliation -s "some affiliation" +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -exe processAffiliation -s "some affiliation" ``` ### processRawReference @@ -130,7 +130,7 @@ Example: Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -exe processRawReference -s "a reference string" +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -exe processRawReference -s "a reference string" ``` ### processReferences @@ -146,7 +146,7 @@ Example: Example: ```bash -> java -Xmx2G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH 
grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processReferences +> java -Xmx2G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processReferences ``` WARNING: the expected extension of the PDF files to be processed is `.pdf` @@ -162,7 +162,7 @@ WARNING: the expected extension of the PDF files to be processed is `.pdf` Example: ```bash -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processCitationPatentST36 +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processCitationPatentST36 ``` WARNING: extension of the ST.36 files to be processed must be `.xml` @@ -178,7 +178,7 @@ WARNING: extension of the ST.36 files to be processed must be `.xml` Example: ``` -> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processCitationPatentTXT +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processCitationPatentTXT ``` WARNING: extension of the text files to be processed must be `.txt`, and expected encoding is `UTF-8` @@ -194,7 +194,7 @@ WARNING: extension of the text files to be processed must be `.txt`, and expecte Example: ``` -> java -Xmx1G 
-Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processCitationPatentPDF +> java -Xmx1G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe processCitationPatentPDF ``` WARNING: extension of the text files to be processed must be `.pdf` @@ -210,7 +210,7 @@ WARNING: extension of the text files to be processed must be `.pdf` Example: ```bash -> java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe createTraining +> java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe createTraining ``` WARNING: the expected extension of the PDF files to be processed is `.pdf` @@ -226,7 +226,7 @@ WARNING: the expected extension of the PDF files to be processed is `.pdf` Example: ```bash -> java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe createTrainingBlank +> java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -exe createTrainingBlank ``` WARNING: the expected extension of the PDF files to be processed is `.pdf` @@ -244,7 +244,7 @@ The needed parameters for that command are: Example: ```bash -> java -Xmx2G 
-Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -r -exe processPDFAnnotation +> java -Xmx2G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -r -exe processPDFAnnotation ``` WARNING: extension of the text files to be processed must be `.pdf` diff --git a/doc/Grobid-docker.md b/doc/Grobid-docker.md index fd839dc6fb..7b447a0cec 100644 --- a/doc/Grobid-docker.md +++ b/doc/Grobid-docker.md @@ -26,13 +26,13 @@ The process for retrieving and running the image is as follow: Current latest version: ```bash -> docker pull grobid/grobid:0.8.0 +> docker pull grobid/grobid:0.8.1 ``` - Run the container: ```bash -> docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.0 +> docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.1 ``` The image will automatically uses the GPU and CUDA version available on your host machine, but only on Linux. GPU usage via a container on Windows and MacOS machine is currently not supported by Docker. If no GPU are available, CPU will be used. 
@@ -88,7 +88,7 @@ The process for retrieving and running the image is as follow: Latest version: ```bash -> docker pull lfoppiano/grobid:0.8.0 +> docker pull lfoppiano/grobid:0.8.1 ``` - Run the container: @@ -100,7 +100,7 @@ Latest version: Latest version: ```bash -> docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0 +> docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.1 ``` Note the default version is running on port `8070`, however it can be mapped on the more traditional port `8080` of your host with the following command: @@ -121,7 +121,7 @@ Grobid web services are then available as described in the [service documentatio The simplest way to pass a modified configuration to the docker image is to mount the yaml GROBID config file `grobid.yaml` when running the image. Modify the config file `grobid/grobid-home/config/grobid.yaml` according to your requirements on the host machine and mount it when running the image as follow: ```bash -docker run --rm --gpus all --init --ulimit core=0 -p 8080:8070 -p 8081:8071 -v /home/lopez/grobid/grobid-home/config/grobid.yaml:/opt/grobid/grobid-home/config/grobid.yaml:ro grobid/grobid:0.8.0 +docker run --rm --gpus all --init --ulimit core=0 -p 8080:8070 -p 8081:8071 -v /home/lopez/grobid/grobid-home/config/grobid.yaml:/opt/grobid/grobid-home/config/grobid.yaml:ro grobid/grobid:0.8.1 ``` You need to use an absolute path to specify your modified `grobid.yaml` file. @@ -222,25 +222,25 @@ Without this requirement, the image might default to CPU, even if GPU are availa For being able to use both CRF and Deep Learningmodels, use the dockerfile `./Dockerfile.delft`. The only important information then is the version which will be checked out from the tags. ```bash -> docker build -t grobid/grobid:0.8.0 --build-arg GROBID_VERSION=0.8.0 --file Dockerfile.delft . +> docker build -t grobid/grobid:0.8.1 --build-arg GROBID_VERSION=0.8.1 --file Dockerfile.delft . 
``` Similarly, if you want to create a docker image from the current master, development version: ```bash -docker build -t grobid/grobid:0.8.1-SNAPSHOT --build-arg GROBID_VERSION=0.8.1-SNAPSHOT --file Dockerfile.delft . +docker build -t grobid/grobid:0.8.2-SNAPSHOT --build-arg GROBID_VERSION=0.8.2-SNAPSHOT --file Dockerfile.delft . ``` -In order to run the container of the newly created image, for example for the development version `0.8.1-SNAPSHOT`, using all GPU available: +In order to run the container of the newly created image, for example for the development version `0.8.2-SNAPSHOT`, using all GPU available: ```bash -> docker run --rm --gpus all --init --ulimit core=0 -p 8080:8070 -p 8081:8071 grobid/grobid:0.8.1-SNAPSHOT +> docker run --rm --gpus all --init --ulimit core=0 -p 8080:8070 -p 8081:8071 grobid/grobid:0.8.2-SNAPSHOT ``` In practice, you need to indicate which models should use a Deep Learning model implementation and which ones can remain with a faster CRF model implementation, which is done currently in the `grobid.yaml` file. Modify the config file `grobid/grobid-home/config/grobid.yaml` accordingly on the host machine and mount it when running the image as follow: ```bash -docker run --rm --gpus all --init --ulimit core=0 -p 8080:8070 -p 8081:8071 -v /home/lopez/grobid/grobid-home/config/grobid.yaml:/opt/grobid/grobid-home/config/grobid.yaml:ro grobid/grobid:0.8.1-SNAPSHOT +docker run --rm --gpus all --init --ulimit core=0 -p 8080:8070 -p 8081:8071 -v /home/lopez/grobid/grobid-home/config/grobid.yaml:/opt/grobid/grobid-home/config/grobid.yaml:ro grobid/grobid:0.8.2-SNAPSHOT ``` You need to use an absolute path to specify your modified `grobid.yaml` file. @@ -262,19 +262,19 @@ The container name is given by the command: For building a CRF-only image, the dockerfile to be used is `./Dockerfile.crf`. The only important information then is the version which will be checked out from the tags. 
```bash -> docker build -t grobid/grobid:0.8.0 --build-arg GROBID_VERSION=0.8.0 --file Dockerfile.crf . +> docker build -t grobid/grobid:0.8.1 --build-arg GROBID_VERSION=0.8.1 --file Dockerfile.crf . ``` Similarly, if you want to create a docker image from the current master, development version: ```bash -> docker build -t grobid/grobid:0.8.1-SNAPSHOT --build-arg GROBID_VERSION=0.8.1-SNAPSHOT --file Dockerfile.crf . +> docker build -t grobid/grobid:0.8.2-SNAPSHOT --build-arg GROBID_VERSION=0.8.2-SNAPSHOT --file Dockerfile.crf . ``` -In order to run the container of the newly created image, for example for version `0.8.1`: +In order to run the container of the newly created image, for example for version `0.8.2-SNAPSHOT`: ```bash -> docker run --rm --init --ulimit core=0 -p 8080:8070 -p 8081:8071 grobid/grobid:0.8.1 +> docker run --rm --init --ulimit core=0 -p 8080:8070 -p 8081:8071 grobid/grobid:0.8.2-SNAPSHOT ``` For testing or debugging purposes, you can connect to the container with a bash shell (logs are under `/opt/grobid/logs/`): diff --git a/doc/Grobid-java-library.md b/doc/Grobid-java-library.md index 7c2e99f535..1e3ea74c52 100644 --- a/doc/Grobid-java-library.md +++ b/doc/Grobid-java-library.md @@ -9,7 +9,7 @@ The second option is of course to build yourself Grobid and to use the generated ## Using maven -The Java artefacts of the latest GROBID release (0.8.0) are uploaded on a DIY repository. +The Java artefacts of the latest GROBID release (0.8.1) are uploaded on a DIY repository. 
You need to add the following snippet in your `pom.xml` in order to configure it: @@ -29,19 +29,19 @@ Here an example of `grobid-core` dependency: <dependency> <groupId>org.grobid</groupId> <artifactId>grobid-core</artifactId> - <version>0.8.0</version> + <version>0.8.1</version> </dependency> ``` -If you want to work on a SNAPSHOT development version, you need to download and build the current master yourself, and include in your pom file the path to the local snapshot Grobid jar file, for instance as follow (if necessary replace `0.8.1-SNAPSHOT` by the valid `<current version>`): +If you want to work on a SNAPSHOT development version, you need to download and build the current master yourself, and include in your pom file the path to the local snapshot Grobid jar file, for instance as follows (if necessary replace `0.8.2-SNAPSHOT` by the valid `<current version>`): ```xml <dependency> <groupId>org.grobid</groupId> <artifactId>grobid-core</artifactId> - <version>0.8.1-SNAPSHOT</version> + <version>0.8.2-SNAPSHOT</version> <scope>system</scope> - <systemPath>${project.basedir}/lib/grobid-core-0.8.1-SNAPSHOT.jar</systemPath> + <systemPath>${project.basedir}/lib/grobid-core-0.8.2-SNAPSHOT.jar</systemPath> </dependency> ``` @@ -59,8 +59,8 @@ Add the following snippet in your gradle.build file: and add the Grobid dependency as well: ``` - implement 'org.grobid:grobid-core:0.8.0' - implement 'org.grobid:grobid-trainer:0.8.0' + implementation 'org.grobid:grobid-core:0.8.1' + implementation 'org.grobid:grobid-trainer:0.8.1' ``` ## API call diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md index 10913e7534..4b3f4129cc 100644 --- a/doc/Grobid-service.md +++ b/doc/Grobid-service.md @@ -27,9 +27,9 @@ From a development installation, you can also build and install the service as a cd .. 
mkdir grobid-installation cd grobid-installation -unzip ../grobid/grobid-service/build/distributions/grobid-service-0.8.0.zip -mv grobid-service-0.8.0 grobid-service -unzip ../grobid/grobid-home/build/distributions/grobid-home-0.8.0.zip +unzip ../grobid/grobid-service/build/distributions/grobid-service-0.8.1.zip +mv grobid-service-0.8.1 grobid-service +unzip ../grobid/grobid-home/build/distributions/grobid-home-0.8.1.zip ./grobid-service/bin/grobid-service ``` @@ -125,13 +125,15 @@ The consolidation parameters (`consolidateHeader`, `consolidateCitations`, `cons * `1`, means consolidation against CrossRef/biblio-glutton and update of metadata: when we have a DOI match, the publisher metadata are combined with the metadata extracted from the PDF, possibly correcting them * `2`, means consolidation against CrossRef/biblio-glutton and, if matching, addition of the DOI only +The consolidation for header can use a fourth value (`3`), restricting the consolidation to the usage of DOI only, if a DOI has been extracted in the header section. + ### PDF to TEI conversion services #### /api/processHeaderDocument Extract the header of the input PDF document, normalize it and convert it into a TEI XML or [BibTeX] format. -`consolidateHeader` is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), or `2` (consolidate the header metadata and inject DOI only). +`consolidateHeader` is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the header metadata and inject DOI only) or `3` (consolidate using only extracted DOI, if extracted, and do not try to consolidate using any other metadata). 
| method | request type | response type | parameters | requirement | description | |------------|-----------------------|---------------------|--------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| diff --git a/doc/Install-Grobid.md b/doc/Install-Grobid.md index 01d2da98ef..1f85aff34e 100644 --- a/doc/Install-Grobid.md +++ b/doc/Install-Grobid.md @@ -8,17 +8,17 @@ Note: Java/JDK 8 is not supported anymore from Grobid version `0.8.0` and the mi ### Latest stable release -The [latest stable release](https://github.com/kermitt2/grobid#latest-version) of GROBID is version ```0.8.0``` which can be downloaded as follow: +The [latest stable release](https://github.com/kermitt2/grobid#latest-version) of GROBID is version ```0.8.1``` which can be downloaded as follow: ```bash -> wget https://github.com/kermitt2/grobid/archive/0.8.0.zip -> unzip 0.8.0.zip +> wget https://github.com/kermitt2/grobid/archive/0.8.1.zip +> unzip 0.8.1.zip ``` or using the [docker](Grobid-docker.md) container. 
### Current development version -The current development version is ```0.8.1-SNAPSHOT```, which can be downloaded from GitHub and built as follow: +The current development version is ```0.8.2-SNAPSHOT```, which can be downloaded from GitHub and built as follows: Clone source code from github: ```bash diff --git a/doc/Notes-grobid-developers.md b/doc/Notes-grobid-developers.md index 151d17ffd5..c5b642c1cb 100644 --- a/doc/Notes-grobid-developers.md +++ b/doc/Notes-grobid-developers.md @@ -9,16 +9,16 @@ The idea anyway is that people will use Grobid with the Docker image, the servic In order to make a new release: -+ tag the project branch to be releases, for instance a version `0.8.0`: ++ tag the project branch to be released, for instance a version `0.8.1`: ``` -> git tag 0.8.0 -> git push origin 0.8.0 +> git tag 0.8.1 +> git push origin 0.8.1 ``` + create a github release: the easiest is to use the GitHub web interface -+ do something to publish the Java artefacts... currrently just uploading them on AWS S3 ++ do something to publish the Java artefacts... currently just uploading them on AWS S3 + you're not done, you need to update the documentation, `Readme.md`, `CHANGELOG.md` and end-to-end benchmarking (PMC and bioRxiv sets). 
@@ -35,7 +35,7 @@ In order to make a new release: ``` dependencies { - implementation 'org.grobid:grobid-core:0.7.3' + implementation 'org.grobid:grobid-core:0.8.1' } ``` @@ -55,7 +55,7 @@ for maven projects: <dependency> <groupId>org.grobid</groupId> <artifactId>grobid-core</artifactId> - <version>0.8.0</version> + <version>0.8.1</version> </dependency> ``` diff --git a/doc/References.md b/doc/References.md index 26887cd3f2..dfee440ee1 100644 --- a/doc/References.md +++ b/doc/References.md @@ -3,7 +3,7 @@ If you want to cite this work, please simply refer to the github project: ``` -GROBID (2008-2022) <https://github.com/kermitt2/grobid> +GROBID (2008-2024) <https://github.com/kermitt2/grobid> ``` Please do not include a particular person name to emphasize the project and the tool ! @@ -17,7 +17,7 @@ Here's a BibTeX entry using the [Software Heritage](https://www.softwareheritage title = {GROBID}, howpublished = {\url{https://github.com/kermitt2/grobid}}, publisher = {GitHub}, - year = {2008--2023}, + year = {2008--2024}, archivePrefix = {swh}, eprint = {1:dir:dab86b296e3c3216e2241968f0d63b68e8209d3c} } diff --git a/doc/Run-Grobid.md b/doc/Run-Grobid.md index 673a2c20c5..2f095fedcb 100644 --- a/doc/Run-Grobid.md +++ b/doc/Run-Grobid.md @@ -9,13 +9,13 @@ For convenience, we provide two docker images: - the **full** image provides the best accuracy, because it includes all the required python and TensorFlow libraries, GPU support and all Deep Learning model resources. However it requires more resources, ideally a GPU (it will be automatically detected on Linux). If you have a limited amount of PDF, a good machine, and prioritize accuracy, use this Grobid flavor. 
To run this version of Grobid, the command is: ```console -docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.0 +docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.1 ``` - the **lightweight** image offers best runtime performance, memory usage and Docker image size. However, it does not use some of the best performing models in term of accuracy. If you have a lot of PDF to process, a low resource system, and accuracy is not so important, use this flavor: ```console -docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0 +docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.1 ``` More documentation on the Docker images can be found [here](Grobid-docker.md). diff --git a/doc/training/General-principles.md b/doc/training/General-principles.md index f5edef0b18..dd40f3e3cc 100644 --- a/doc/training/General-principles.md +++ b/doc/training/General-principles.md @@ -8,7 +8,7 @@ This maybe of interest if the current state of the models does not correctly rec The addition of training in Grobid is __not__ done from scratch, but from pre-annotated training data generated by the existing models in Grobid. This ensures that the syntax of the new training data will be (normally) correct and that the stream of text will be easy to align with the text extracted from the PDF. This permits also to take advantage of the existing models which will annotate correctly a certain amount of text, and to focus on the corrections, thus improving the productivity of the annotator. -For generating pre-annotated training files for Grobid based on the existing models, see the instructions for running the software in batch [here](../Training-the-models-of-Grobid/#generation-of-training-data) and [here](../Grobid-batch/#createtraining). 
+For generating pre-annotated training files for Grobid based on the existing models, see the instructions for running the software in batch [here](../../Training-the-models-of-Grobid/#generation-of-training-data) and [here](../../Grobid-batch/#createtraining). After running the batch `createTraining` on a set of PDF files using methods for creating training data, each article comes with: diff --git a/gradle.properties b/gradle.properties index 35b314eaff..67c6121e23 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,4 +1,4 @@ -version=0.8.1-SNAPSHOT +version=0.8.1 # Set workers to 1 that even for parallel builds it works. (I guess the shadow plugin makes some trouble) org.gradle.workers.max=1 org.gradle.caching = true @@ -10,3 +10,5 @@ org.gradle.vfs.watch = true #systemProp.https.proxyPort= #systemProp.https.proxyHost= #systemProp.https.proxyPort= + +org.gradle.java.installations.auto-download=false diff --git a/grobid-core/doc/Annotations/ActivityDiagram.png b/grobid-core/doc/Annotations/ActivityDiagram.png deleted file mode 100644 index 346b80e565..0000000000 Binary files a/grobid-core/doc/Annotations/ActivityDiagram.png and /dev/null differ diff --git a/grobid-core/doc/Annotations/ArgoUMLFiles/ActivityDiagram.zargo b/grobid-core/doc/Annotations/ArgoUMLFiles/ActivityDiagram.zargo deleted file mode 100644 index 5a9f0edb40..0000000000 Binary files a/grobid-core/doc/Annotations/ArgoUMLFiles/ActivityDiagram.zargo and /dev/null differ diff --git a/grobid-core/doc/Annotations/ArgoUMLFiles/ClassDiagram.zargo b/grobid-core/doc/Annotations/ArgoUMLFiles/ClassDiagram.zargo deleted file mode 100644 index 60c5e2b6aa..0000000000 Binary files a/grobid-core/doc/Annotations/ArgoUMLFiles/ClassDiagram.zargo and /dev/null differ diff --git a/grobid-core/doc/Annotations/ClassDiagram.png b/grobid-core/doc/Annotations/ClassDiagram.png deleted file mode 100644 index f863ce24af..0000000000 Binary files a/grobid-core/doc/Annotations/ClassDiagram.png and /dev/null differ diff 
--git a/grobid-core/doc/licenseAgreemet.txt b/grobid-core/doc/licenseAgreemet.txt deleted file mode 100755 index f23f82aab2..0000000000 --- a/grobid-core/doc/licenseAgreemet.txt +++ /dev/null @@ -1,41 +0,0 @@ -This document has been created to keep track about the exchange between grobid team and pdf2xml team about license agreement. - ------------------------------------------------- -Request sent to pdf2xml team on 22nd of August : -Hello, - - I am part of a project named grobid (hosted on source forge) under Apache license. Pdf2xml is under gpl license. For license compatibility, the tool pdf2xml is not distributed with grobid and has to be installed manually by the user/developer. - We thought that a good idea would be to deploy pdf2xml on a public maven repository, so that is will be disponible to use it. To respect the license with pdf2xml, it is possible to add a specific profile for the maven build of grobid (if you specify that specific profile it will download it else not). So if you add that profile in your build it means you accept the license agreement. - For information the archive would be a simple zip of that structure : -pdf2xml - |_win-32 - |_pdftoxml.exe - |_zlib1.dll - |_mac-64 - |_pdf2xml (build on and for mac-64 platfom) - |_lin-32 - |_pdf2xml (build on and for lin-32 platfom) - |_lin-64 - |_pdf2xml (build on and for mac-32 platfom) - -We would like to know your opinion on the subject. - -Thank you in advance for the answer to your request. - -Regards, - - -Damien ridereau ------------------------------------------------- - - ------------------------------------ -Answer recieved on 29th of August : -Hello, - -It shouldn't be a problem. Pdf2xml is under gpl since it's -based on xpdf, but I don't care much about the licence. 
- -Hervé ------------------------------------ - diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index 7bd030f923..365e5bee56 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -107,6 +107,7 @@ public String toString() { ", PMID='" + PMID + '\'' + ", PMCID='" + PMCID + '\'' + ", PII='" + PII + '\'' + + ", HALId='" + halId + '\'' + ", ark='" + ark + '\'' + ", istexId='" + istexId + '\'' + ", inDOI='" + inDOI + '\'' + @@ -256,6 +257,7 @@ public String toString() { private String PMID = null; private String PMCID = null; private String PII = null; + private String halId = null; private String ark = null; private String istexId = null; private String abstract_ = null; @@ -526,6 +528,10 @@ public String getDOI() { return doi; } + public String getHalId() { + return halId; + } + public String getArk() { return ark; } @@ -1060,9 +1066,20 @@ public static String cleanDOI(String doi) { doi = doi.replaceAll("[\\p{M}]", ""); doi = doi.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); + // remove possible starting/trailing parenthesis + if (doi.startsWith("(") || doi.startsWith("[") || doi.startsWith("⟨")) + doi = doi.substring(1); + + if (doi.endsWith(")") || doi.endsWith("]") || doi.endsWith("⟩")) + doi = doi.substring(0,doi.length()-1); + return doi; } + public void setHalId(String halId) { + this.halId = halId; + } + public void setArXivId(String id) { if (id != null) { arXivId = StringUtils.normalizeSpace(id); @@ -1591,6 +1608,7 @@ public void reset() { type = null; book_type = null; doi = null; + halId = null; istexId = null; ark = null; inDOI = null; @@ -2169,7 +2187,7 @@ else if (pubnum != null && pubnum.length() == 13) } } - // TODO: PII + // TODO: PII and HALId } @@ -2345,6 +2363,13 @@ else if (bookTitle == null) { tei.append("<idno type=\"DOI\">" + TextUtilities.HTMLEncode(doi) 
+ "</idno>\n"); } + if (!StringUtils.isEmpty(halId)) { + for (int i = 0; i < indent + 2; i++) { + tei.append("\t"); + } + tei.append("<idno type=\"HALid\">" + TextUtilities.HTMLEncode(halId) + "</idno>\n"); + } + if (!StringUtils.isEmpty(arXivId)) { for (int i = 0; i < indent + 2; i++) { tei.append("\t"); @@ -2786,9 +2811,6 @@ else if (this.getYear().length() == 4) } } - /*for (int i = 0; i < indent + 2; i++) { - tei.append("\t"); - }*/ if ((volumeBlock != null) | (issue != null) || (pageRange != null) || (publication_date != null) || (publisher != null)) { for (int i = 0; i < indent + 2; i++) { @@ -2947,7 +2969,12 @@ else if (this.getYear().length() == 4) for (int i = 0; i < indent + 2; i++) { tei.append("\t"); } - if ((publication_date != null) || (pageRange != null) || (location != null) || (publisher != null) || (volumeBlock != null)) { + if (normalized_publication_date != null || + publication_date != null || + pageRange != null || + location != null || + publisher != null || + volumeBlock != null) { tei.append("<imprint>\n"); } else { @@ -3177,12 +3204,13 @@ else if (this.getYear().length() == 4) } if (uri != null) { - if (uri.startsWith("http://hal.")) { + /*if (uri.startsWith("http://hal.") || ) { for (int i = 0; i < indent + 1; i++) { tei.append("\t"); } tei.append("<idno type=\"HALid\">" + TextUtilities.HTMLEncode(uri) + "</idno>\n"); - } else { + } else */ + { for (int i = 0; i < indent + 1; i++) { tei.append("\t"); } @@ -3191,7 +3219,7 @@ else if (this.getYear().length() == 4) } if (url != null) { - if (url.startsWith("http://hal.")) { + if (url.startsWith("http://hal.") || url.startsWith("https://hal.")) { for (int i = 0; i < indent + 1; i++) { tei.append("\t"); } @@ -4117,6 +4145,7 @@ public static void injectIdentifiers(BiblioItem destination, BiblioItem source) destination.setPII(source.getPII()); destination.setIstexId(source.getIstexId()); destination.setArk(source.getArk()); + destination.setHalId(source.getHalId()); } /** @@ -4140,6 +4169,8 @@ 
public static void correct(BiblioItem bib, BiblioItem bibo) { bib.setIstexId(bibo.getIstexId()); if (bibo.getArk() != null) bib.setArk(bibo.getArk()); + if (bibo.getHalId() != null) + bib.setHalId(bibo.getHalId()); if (bibo.getOAURL() != null) bib.setOAURL(bibo.getOAURL()); @@ -4243,6 +4274,8 @@ public static void correct(BiblioItem bib, BiblioItem bibo) { bib.setISBN10(bibo.getISBN10()); if (bibo.getISBN13() != null) bib.setISBN13(bibo.getISBN13()); + if (bibo.getHalId() != null) + bib.setHalId(bibo.getHalId()); if (bibo.getItem() != -1) { bib.setItem(bibo.getItem()); @@ -4361,7 +4394,7 @@ public boolean rejectAsReference() { if (fullAuthors == null && collaboration == null) authorSet = false; // normally properties authors and authorList are null in the current Grobid version - if (!titleSet && !authorSet && (url == null) && (doi == null)) + if (!titleSet && !authorSet && url == null && doi == null && halId ==null) return true; else return false; diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index a019ee2a30..3545acb61f 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -747,6 +747,10 @@ else if (biblio.getE_Year().length() == 4) tei.append("\t\t\t\t\t<idno type=\"DOI\">" + TextUtilities.HTMLEncode(theDOI) + "</idno>\n"); } + if (!StringUtils.isEmpty(biblio.getHalId())) { + tei.append("\t\t\t\t\t<idno type=\"halId\">" + TextUtilities.HTMLEncode(biblio.getHalId()) + "</idno>\n"); + } + if (!StringUtils.isEmpty(biblio.getArXivId())) { tei.append("\t\t\t\t\t<idno type=\"arXiv\">" + TextUtilities.HTMLEncode(biblio.getArXivId()) + "</idno>\n"); } @@ -1638,7 +1642,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, String type = referenceInformation.getMiddle(); OffsetPosition matchingPosition = referenceInformation.getRight(); - if (pos > 
matchingPosition.start) + if (pos > matchingPosition.start) break; List<LayoutToken> before = clusterTokens.subList(pos, matchingPosition.start); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index cfeef3637f..fca9efacb9 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -190,7 +190,10 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili List<AnnotatedXMLElement> annotations = localResult.left; FundingAcknowledgmentParse localEntities = localResult.right; - List<OffsetPosition> annotationsPositionTokens = annotations.stream().map(AnnotatedXMLElement::getOffsetPosition).toList(); + List<OffsetPosition> annotationsPositionTokens = annotations.stream() + .map(AnnotatedXMLElement::getOffsetPosition) + .collect(Collectors.toList()); + List<OffsetPosition> annotationsPositionText = TextUtilities.matchTokenAndString(tokenizationFunding, paragraphText, annotationsPositionTokens); List<AnnotatedXMLElement> annotationsWithPosRefToText = new ArrayList<>(); for (int i = 0; i < annotationsPositionText.size(); i++) { @@ -250,20 +253,21 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List<An // We obtain the corrected coordinates that don't fall over the annotations List<OffsetPosition> correctedOffsetPositions = SentenceUtilities.correctSentencePositions(sentencePositions, annotations .stream() - .map(AnnotatedXMLElement::getOffsetPosition).toList()); + .map(AnnotatedXMLElement::getOffsetPosition) + .collect(Collectors.toList())); List<Integer> toRemove = new ArrayList<>(); for (OffsetPosition correctedOffsetPosition : correctedOffsetPositions) { List<OffsetPosition> originalSentences = sentencePositions.stream() .filter(a -> a.start >= 
correctedOffsetPosition.start && a.end <= correctedOffsetPosition.end) - .toList(); + .collect(Collectors.toList()); // if for each "corrected sentences offset" there are more than one original sentence that // falls into it, it means we need to merge if (originalSentences.size() > 1) { List<Integer> toMerge = originalSentences.stream() .map(sentencePositions::indexOf) - .toList(); + .collect(Collectors.toList()); Element destination = (Element) sentences.get(toMerge.get(0)); boolean needToMergeCoordinates = config.isGenerateTeiCoordinates("s"); @@ -291,7 +295,7 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List<An boundingBoxes.addAll(Arrays.stream(coordinates.split(";")) .filter(StringUtils::isNotBlank) .map(BoundingBox::fromString) - .toList()); + .collect(Collectors.toList())); // Group by page, then merge List<BoundingBox> postMergeBoxes = new ArrayList<>(); @@ -301,7 +305,7 @@ private static Nodes mergeSentencesFallingOnAnnotations(Nodes sentences, List<An postMergeBoxes.addAll(mergedBoundingBoxes); } - String coordsAsString = String.join(";", postMergeBoxes.stream().map(BoundingBox::toString).toList()); + String coordsAsString = String.join(";", postMergeBoxes.stream().map(BoundingBox::toString).collect(Collectors.toList())); Attribute newCoords = new Attribute("coords", coordsAsString); destination.addAttribute(newCoords); } @@ -369,7 +373,7 @@ private static void updateParagraphNodeWithAnnotations(Node paragraph, List<Anno int finalPos = pos; List<AnnotatedXMLElement> annotationsInThisChunk = annotations.stream() .filter(a -> a.getOffsetPosition().start >= finalPos && a.getOffsetPosition().end <= finalPos + text.length()) - .toList(); + .collect(Collectors.toList()); if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { List<Node> nodes = getNodesAnnotationsInTextNode(currentNode, annotationsInThisChunk, pos); @@ -407,7 +411,7 @@ private static void updateSentencesNodesWithAnnotations(Nodes sentences, List<An int finalPos 
= pos; List<AnnotatedXMLElement> annotationsInThisChunk = annotations.stream() .filter(a -> a.getOffsetPosition().start >= finalPos && a.getOffsetPosition().end <= finalPos + text.length()) - .toList(); + .collect(Collectors.toList()); if (CollectionUtils.isNotEmpty(annotationsInThisChunk)) { List<Node> nodes = getNodesAnnotationsInTextNode(currentNode, annotationsInThisChunk, pos); diff --git a/grobid-core/src/main/java/org/grobid/core/jni/PythonEnvironmentConfig.java b/grobid-core/src/main/java/org/grobid/core/jni/PythonEnvironmentConfig.java index c1a3781366..7254d81d98 100644 --- a/grobid-core/src/main/java/org/grobid/core/jni/PythonEnvironmentConfig.java +++ b/grobid-core/src/main/java/org/grobid/core/jni/PythonEnvironmentConfig.java @@ -100,14 +100,14 @@ public static PythonEnvironmentConfig getInstanceForVirtualEnv(String virtualEnv .stream() .map(path -> FilenameUtils.getName(path.getFileName().toString()) .replace("libpython", "").replace("python", "")) - .filter(version -> version.contains("3.7") || version.contains("3.8") || version.contains("3.9")) + .filter(version -> version.contains("3.7") || version.contains("3.8") || version.contains("3.9") || version.contains("3.10") || version.contains("3.11") || version.contains("3.12")) .distinct() .sorted() .collect(Collectors.toList()); if (CollectionUtils.isEmpty(pythonVersions)) { throw new GrobidException( - "Cannot find a suitable version (3.7, 3.8 or 3.9) of python in your virtual environment: " + + "Cannot find a suitable version (3.7 to 3.12) of python in your virtual environment: " + virtualEnv ); } diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/Consolidation.java b/grobid-core/src/main/java/org/grobid/core/utilities/Consolidation.java index c07ba1f46f..e0538b586b 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/Consolidation.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/Consolidation.java @@ -115,7 +115,7 @@ public void close() { /** * Try to 
consolidate one bibliographical object with crossref metadata lookup web services based on - * core metadata + * core metadata. In practice, this method is used for consolidating header metadata. */ public BiblioItem consolidate(BiblioItem bib, String rawCitation, int consolidateMode) throws Exception { final List<BiblioItem> results = new ArrayList<>(); @@ -125,6 +125,7 @@ public BiblioItem consolidate(BiblioItem bib, String rawCitation, int consolidat theDOI = cleanDOI(theDOI); } final String doi = theDOI; + String halId = bib.getHalId(); String aut = bib.getFirstAuthorSurname(); String title = bib.getTitle(); String journalTitle = bib.getJournal(); @@ -171,6 +172,13 @@ public BiblioItem consolidate(BiblioItem bib, String rawCitation, int consolidat arguments.put("query.bibliographic", rawCitation); } } + if (StringUtils.isNotBlank(halId)) { + // call based on the identified HAL ID + if (arguments == null) + arguments = new HashMap<String,String>(); + if (GrobidProperties.getInstance().getConsolidationService() != GrobidConsolidationService.CROSSREF) + arguments.put("halid", halId); + } if (StringUtils.isNotBlank(aut)) { // call based on partial metadata if (GrobidProperties.getInstance().getConsolidationService() != GrobidConsolidationService.CROSSREF) { @@ -309,7 +317,9 @@ public void onError(int status, String message, Exception exception) { /** - * Try tp consolidate a list of bibliographical objects in one operation with consolidation services + * Try to consolidate a list of bibliographical objects in one operation with consolidation services. + * In practice this method is used for consolidating the metadata of all the extracted bibliographical + * references. 
*/ public Map<Integer,BiblioItem> consolidate(List<BibDataSet> biblios) { if (CollectionUtils.isEmpty(biblios)) @@ -333,6 +343,8 @@ public Map<Integer,BiblioItem> consolidate(List<BibDataSet> biblios) { if (StringUtils.isNotBlank(doi)) { doi = BiblioItem.cleanDOI(doi); } + // first we get the exploitable metadata + String halId = theBiblio.getHalId(); String aut = theBiblio.getFirstAuthorSurname(); String title = theBiblio.getTitle(); String journalTitle = theBiblio.getJournal(); @@ -381,6 +393,13 @@ public Map<Integer,BiblioItem> consolidate(List<BibDataSet> biblios) { arguments = new HashMap<String,String>(); arguments.put("doi", doi); } + if (StringUtils.isNotBlank(halId)) { + // call based on the identified HAL ID + if (arguments == null) + arguments = new HashMap<String,String>(); + if (GrobidProperties.getInstance().getConsolidationService() != GrobidConsolidationService.CROSSREF) + arguments.put("halid", halId); + } if (StringUtils.isNotBlank(rawCitation)) { // call with full raw string if (arguments == null) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/crossref/WorkDeserializer.java b/grobid-core/src/main/java/org/grobid/core/utilities/crossref/WorkDeserializer.java index ac496181d0..57c80dc1c3 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/crossref/WorkDeserializer.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/crossref/WorkDeserializer.java @@ -26,7 +26,17 @@ protected BiblioItem deserializeOneItem(JsonNode item) { biblio = new BiblioItem(); //System.out.println(item.toString()); - biblio.setDOI(item.get("DOI").asText()); + JsonNode doiNode = item.get("DOI"); + if (doiNode != null && (!doiNode.isMissingNode()) ) { + String doi = doiNode.asText(); + biblio.setDOI(doi); + } + + JsonNode halNode = item.get("halId"); + if (halNode != null && (!halNode.isMissingNode()) ) { + String halId = halNode.asText(); + biblio.setHalId(halId); + } // the following are usually provided by biblio-glutton which index 
augmented/aggregated // metadata @@ -170,6 +180,9 @@ protected BiblioItem deserializeOneItem(JsonNode item) { if (publishPrintNode == null || publishPrintNode.isMissingNode()) { publishPrintNode = item.get("published-print"); } + if (publishPrintNode == null || publishPrintNode.isMissingNode()) { + publishPrintNode = item.get("published"); + } if (publishPrintNode != null && (!publishPrintNode.isMissingNode())) { JsonNode datePartNode = publishPrintNode.get("date-parts"); if (datePartNode != null && (!datePartNode.isMissingNode()) && diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/glutton/GluttonRequest.java b/grobid-core/src/main/java/org/grobid/core/utilities/glutton/GluttonRequest.java index 0acf7ef1f7..3415e58f1f 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/glutton/GluttonRequest.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/glutton/GluttonRequest.java @@ -117,6 +117,12 @@ public void execute() { doi = params.get("doi"); uriBuilder.setParameter("doi", doi); } + if (params.get("HALID") != null || params.get("halId") != null) { + String doi = params.get("HALID"); + if (doi == null) + doi = params.get("halId"); + uriBuilder.setParameter("halId", doi); + } if (params.get("PMID") != null || params.get("pmid") != null) { String pmid = params.get("PMID"); if (pmid == null) diff --git a/grobid-trainer/doc/PLOS_1000.results.grobid-0.8-1-Glutton-DeLFT-BidLSTM-CRF-FEATURES_citations_header-09.08.2024 b/grobid-trainer/doc/PLOS_1000.results.grobid-0.8-1-Glutton-DeLFT-BidLSTM-CRF-FEATURES_citations_header-09.08.2024 new file mode 100644 index 0000000000..ccea8d63aa --- /dev/null +++ b/grobid-trainer/doc/PLOS_1000.results.grobid-0.8-1-Glutton-DeLFT-BidLSTM-CRF-FEATURES_citations_header-09.08.2024 @@ -0,0 +1,404 @@ +-------------> GROBID failed on 0 PDF + +1000 PDF files processed in 999.823 seconds, 0.999823 seconds per PDF file + +Evaluation metrics produced in 385.495 seconds + +======= Header metadata ======= + 
+Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 78.96 13.58 13.65 13.61 960 +authors 99.67 98.87 98.97 98.92 969 +first_author 99.75 99.18 99.28 99.23 969 +keywords 98.44 0 0 0 0 +title 98.64 95.75 94.6 95.17 1000 + +all (micro avg.) 94.25 77.01 76.91 76.96 3898 +all (macro avg.) 94.25 76.84 76.62 76.73 3898 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 87.94 50.57 50.83 50.7 960 +authors 99.67 98.87 98.97 98.92 969 +first_author 99.75 99.18 99.28 99.23 969 +keywords 98.44 0 0 0 0 +title 99.55 99.39 98.2 98.79 1000 + +all (micro avg.) 96.73 87.11 86.99 87.05 3898 +all (macro avg.) 96.73 87 86.82 86.91 3898 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 94.29 76.68 77.08 76.88 960 +authors 99.77 99.28 99.38 99.33 969 +first_author 99.77 99.28 99.38 99.33 969 +keywords 98.44 0 0 0 0 +title 99.62 99.7 98.5 99.09 1000 + +all (micro avg.) 98.36 93.78 93.66 93.72 3898 +all (macro avg.) 98.36 93.73 93.59 93.66 3898 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 91.92 66.94 67.29 67.12 960 +authors 99.75 99.18 99.28 99.23 969 +first_author 99.75 99.18 99.28 99.23 969 +keywords 98.44 0 0 0 0 +title 99.57 99.49 98.3 98.89 1000 + +all (micro avg.) 97.75 91.27 91.15 91.21 3898 +all (macro avg.) 
97.75 91.2 91.04 91.12 3898 + +===== Instance-level results ===== + +Total expected instances: 1000 +Total correct instances: 139 (strict) +Total correct instances: 487 (soft) +Total correct instances: 726 (Levenshtein) +Total correct instances: 642 (ObservedRatcliffObershelp) + +Instance-level recall: 13.9 (strict) +Instance-level recall: 48.7 (soft) +Instance-level recall: 72.6 (Levenshtein) +Instance-level recall: 64.2 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.41 81.17 78.43 79.78 44770 +date 97.88 84.61 81.24 82.89 45457 +first_author 98.81 91.47 88.34 89.88 44770 +inTitle 97.48 81.67 83.58 82.61 42795 +issue 99.51 93.62 92.68 93.15 18983 +page 97.17 93.7 77.57 84.87 40844 +title 94.52 59.97 60.47 60.22 43101 +volume 99.41 95.89 96.11 96 40458 + +all (micro avg.) 97.77 84.23 81.45 82.81 321178 +all (macro avg.) 97.77 85.26 82.3 83.67 321178 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.46 81.49 78.73 80.09 44770 +date 97.88 84.61 81.24 82.89 45457 +first_author 98.84 91.69 88.55 90.09 44770 +inTitle 98.01 85.51 87.5 86.49 42795 +issue 99.51 93.62 92.68 93.15 18983 +page 97.17 93.7 77.57 84.87 40844 +title 98.89 91.95 92.74 92.34 43101 +volume 99.41 95.89 96.11 96 40458 + +all (micro avg.) 98.4 89.32 86.37 87.82 321178 +all (macro avg.) 
98.4 89.81 86.89 88.24 321178 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.7 90.64 87.57 89.08 44770 +date 97.88 84.61 81.24 82.89 45457 +first_author 98.91 92.23 89.08 90.62 44770 +inTitle 98.14 86.45 88.47 87.45 42795 +issue 99.51 93.62 92.68 93.15 18983 +page 97.17 93.7 77.57 84.87 40844 +title 99.25 94.56 95.37 94.96 43101 +volume 99.41 95.89 96.11 96 40458 + +all (micro avg.) 98.62 91.17 88.16 89.64 321178 +all (macro avg.) 98.62 91.46 88.51 89.88 321178 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.93 84.94 82.06 83.48 44770 +date 97.88 84.61 81.24 82.89 45457 +first_author 98.81 91.47 88.34 89.88 44770 +inTitle 97.96 85.16 87.15 86.14 42795 +issue 99.51 93.62 92.68 93.15 18983 +page 97.17 93.7 77.57 84.87 40844 +title 99.16 93.95 94.74 94.34 43101 +volume 99.41 95.89 96.11 96 40458 + +all (micro avg.) 98.48 90 87.03 88.49 321178 +all (macro avg.) 
98.48 90.42 87.49 88.84 321178 + +===== Instance-level results ===== + +Total expected instances: 48449 +Total extracted instances: 48250 +Total correct instances: 13496 (strict) +Total correct instances: 22269 (soft) +Total correct instances: 24916 (Levenshtein) +Total correct instances: 23272 (RatcliffObershelp) + +Instance-level precision: 27.97 (strict) +Instance-level precision: 46.15 (soft) +Instance-level precision: 51.64 (Levenshtein) +Instance-level precision: 48.23 (RatcliffObershelp) + +Instance-level recall: 27.86 (strict) +Instance-level recall: 45.96 (soft) +Instance-level recall: 51.43 (Levenshtein) +Instance-level recall: 48.03 (RatcliffObershelp) + +Instance-level f-score: 27.91 (strict) +Instance-level f-score: 46.06 (soft) +Instance-level f-score: 51.53 (Levenshtein) +Instance-level f-score: 48.13 (RatcliffObershelp) + +Matching 1 : 35369 + +Matching 2 : 1260 + +Matching 3 : 3266 + +Matching 4 : 1800 + +Total matches : 41695 + +======= Citation context resolution ======= + +Total expected references: 48449 - 48.45 references per article +Total predicted references: 48250 - 48.25 references per article + +Total expected citation contexts: 69755 - 69.75 citation contexts per article +Total predicted citation contexts: 73696 - 73.7 citation contexts per article + +Total correct predicted citation contexts: 56772 - 56.77 citation contexts per article +Total wrong predicted citation contexts: 16924 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 77.04 +Recall citation contexts: 81.39 +fscore citation contexts: 79.15 + +======= Fulltext structures ======= + +Evaluation on 1000 random PDF files out of 998 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.53 54.06 52.12 53.07 779 +figure_title 91.69 2.11 0.92 1.28 8943 +funding_stmt 94.29 5.27 28.14 8.88 1507 +reference_citation 90.9 86.69 94.65 90.49 69741 +reference_figure 95.17 72.06 54.06 61.77 11010 +reference_table 99.15 84.28 92.07 88 5159 +section_title 93.83 77.18 65.8 71.03 17540 +table_title 93.95 1.13 0.59 0.77 6092 + +all (micro avg.) 94.81 73.79 73.86 73.82 120771 +all (macro avg.) 94.81 47.85 48.54 46.91 120771 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.77 79.36 76.51 77.91 779 +figure_title 95.76 81.17 35.33 49.24 8943 +funding_stmt 94.29 6.89 36.76 11.6 1507 +reference_citation 90.64 86.7 94.66 90.51 69741 +reference_figure 95.07 72.52 54.41 62.17 11010 +reference_table 99.14 84.46 92.27 88.19 5159 +section_title 93.89 78.17 66.65 71.95 17540 +table_title 94.48 15.97 8.39 11 6092 + +all (micro avg.) 95.38 77.16 77.24 77.2 120771 +all (macro avg.) 95.38 63.16 58.12 57.82 120771 + +===== Document-level ratio results ===== + +label accuracy precision recall f1 support + +availability_stmt 95.43 99.47 96.41 97.91 779 + +all (micro avg.) 95.43 99.47 96.41 97.91 779 +all (macro avg.) 
95.43 99.47 96.41 97.91 779 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 8 + CONTENT_SIZE_TOO_SMALL: 95 + CONTENT_WIDTH_TOO_SMALL: 9 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 749 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 179 + HEADER_NOT_CONSECUTIVE: 849 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 7 + HEADER_AND_CONTENT_INTERSECT: 740 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + UNMATCHED_REF_MARKERS: 2067 + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 21 + STYLE_AUTHORS: 776 + STYLE_NUMBERED: 48379 + MANY_CANDIDATES: 148 + MANY_CANDIDATES_AFTER_POST_FILTERING: 10 + NO_CANDIDATES: 2535 + INPUT_REF_STRINGS_CNT: 49923 + MATCHED_REF_MARKERS: 73696 + NO_CANDIDATES_AFTER_POST_FILTERING: 2 + STYLE_OTHER: 768 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + SKIPPED_BAD_STANDALONE_FIGURES: 203 + SKIPPED_SMALL_STANDALONE_FIGURES: 28 + SKIPPED_BIG_STANDALONE_FIGURES: 175 +==================================================================================== + 
+************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + HEADER_DOCTYPE: 1612 + CITATION_TITLE: 46696 + HEADER_DATE: 993 + HEADER_KEYWORD: 79 + NAME-HEADER_MIDDLENAME: 2135 + TABLE_FIGDESC: 1300 + NAME-HEADER_SURNAME: 7879 + NAME-CITATION_OTHER: 153160 + HEADER_FUNDING: 1045 + CITATION_BOOKTITLE: 3197 + HEADER_ADDRESS: 4675 + HEADER_AFFILIATION: 4741 + CITATION_NOTE: 3052 + FULLTEXT_CITATION_MARKER: 98167 + TABLE_NOTE: 2983 + HEADER_EMAIL: 1146 + FULLTEXT_TABLE_MARKER: 11601 + FUNDING-ACKNOWLEDGEMENT_AFFILIATION: 302 + CITATION_WEB: 29518 + HEADER_GROUP: 7 + FULLTEXT_SECTION: 33218 + TABLE_LABEL: 2996 + FUNDING-ACKNOWLEDGEMENT_PROGRAMNAME: 111 + NAME-HEADER_FORENAME: 7970 + TABLE_CONTENT: 3458 + DATE_YEAR: 52907 + CITATION_COLLABORATION: 106 + CITATION_ISSUE: 19263 + HEADER_MEETING: 2 + HEADER_EDITOR: 954 + CITATION_SERIES: 88 + CITATION_JOURNAL: 42557 + NAME-CITATION_SURNAME: 177100 + TABLE_FIGURE_HEAD: 6152 + FULLTEXT_EQUATION_MARKER: 550 + CITATION_OTHER: 291192 + FULLTEXT_FIGURE_MARKER: 16972 + HEADER_TITLE: 984 + CITATION_TECH: 323 + FIGURE_CONTENT: 266 + FIGURE_LABEL: 4371 + FULLTEXT_EQUATION_LABEL: 2751 + HEADER_OTHER: 10617 + FULLTEXT_EQUATION: 3886 + CITATION_DATE: 52097 + FULLTEXT_FIGURE: 9928 + CITATION_AUTHOR: 46797 + FULLTEXT_TABLE: 6040 + CITATION_EDITOR: 602 + FULLTEXT_OTHER: 177 + HEADER_SUBMISSION: 969 + NAME-HEADER_OTHER: 8958 + FUNDING-ACKNOWLEDGEMENT_PROJECTNAME: 91 + FIGURE_FIGDESC: 5206 + HEADER_AVAILABILITY: 770 + NAME-HEADER_SUFFIX: 10 + CITATION_VOLUME: 41079 + CITATION_LOCATION: 2881 + NAME-CITATION_SUFFIX: 141 + FUNDING-ACKNOWLEDGEMENT_INFRASTRUCTURE: 23 + FUNDING-ACKNOWLEDGEMENT_INSTITUTION: 561 + NAME-HEADER_TITLE: 137 + DATE_MONTH: 6956 + HEADER_WEB: 2 + 
FUNDING-ACKNOWLEDGEMENT_PERSON: 2064 + HEADER_ABSTRACT: 1314 + CITATION_INSTITUTION: 804 + HEADER_REFERENCE: 2005 + FUNDING-ACKNOWLEDGEMENT_GRANTNAME: 138 + CITATION_PAGES: 42003 + HEADER_AUTHOR: 1023 + NAME-HEADER_MARKER: 5693 + DATE_OTHER: 10473 + FUNDING-ACKNOWLEDGEMENT_OTHER: 7949 + FUNDING-ACKNOWLEDGEMENT_FUNDERNAME: 1904 + NAME-CITATION_FORENAME: 179792 + CITATION_PUBLISHER: 4109 + FUNDING-ACKNOWLEDGEMENT_GRANTNUMBER: 1223 + HEADER_PUBNUM: 1354 + NAME-CITATION_MIDDLENAME: 5392 + CITATION_PUBNUM: 11422 + HEADER_COPYRIGHT: 1030 + FULLTEXT_PARAGRAPH: 217237 + FIGURE_FIGURE_HEAD: 8637 + DATE_DAY: 6542 +==================================================================================== + +************************************************************************************ +COUNTER: FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + STANDALONE_FIGURES: 198 + ASSIGNED_GRAPHICS_TO_FIGURES: 3417 +==================================================================================== +==================================================================================== diff --git a/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.8.1-Glutton-BidLSTM-CRF_citation_reference_segmenter-header-09.08.2024 b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.8.1-Glutton-BidLSTM-CRF_citation_reference_segmenter-header-09.08.2024 new file mode 100644 index 0000000000..6254d36c32 --- /dev/null +++ b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.8.1-Glutton-BidLSTM-CRF_citation_reference_segmenter-header-09.08.2024 @@ -0,0 +1,413 @@ +PDF processing 100% │████████████████│ 1943/1943 (0:24:14 / 0:00:00) + +-------------> GROBID failed on 0 PDF + +1943 PDF files processed in 1466.622 seconds, 0.7548234688625837 seconds per PDF file + +Evaluation metrics produced in 622.52 seconds + +======= Header metadata ======= + +Evaluation on 1943 
random PDF files out of 1941 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 82.45 16.78 16.48 16.63 1911 +authors 98.28 92.01 91.91 91.96 1941 +first_author 99.28 96.7 96.6 96.65 1941 +keywords 94.22 64.99 63.62 64.3 1380 +title 96.69 84.67 84.41 84.54 1943 + +all (micro avg.) 94.19 71.79 71.22 71.5 9116 +all (macro avg.) 94.19 71.03 70.6 70.81 9116 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 92.1 63.83 62.69 63.25 1911 +authors 98.69 93.91 93.82 93.87 1941 +first_author 99.36 97.06 96.96 97.01 1941 +keywords 95.51 73.72 72.17 72.94 1380 +title 98.27 92.15 91.87 92.01 1943 + +all (micro avg.) 96.79 84.95 84.27 84.61 9116 +all (macro avg.) 96.79 84.14 83.5 83.82 9116 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 97.68 91.05 89.43 90.23 1911 +authors 99.15 96.08 95.98 96.03 1941 +first_author 99.41 97.32 97.22 97.27 1941 +keywords 97.05 84.16 82.39 83.27 1380 +title 99.58 98.35 98.04 98.2 1943 + +all (micro avg.) 98.58 94.01 93.25 93.63 9116 +all (macro avg.) 98.58 93.39 92.61 93 9116 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 96.88 87.11 85.56 86.33 1911 +authors 98.91 94.95 94.85 94.9 1941 +first_author 99.28 96.7 96.6 96.65 1941 +keywords 96.36 79.5 77.83 78.65 1380 +title 99.16 96.33 96.04 96.19 1943 + +all (micro avg.) 98.12 91.68 90.95 91.32 9116 +all (macro avg.) 
98.12 90.92 90.17 90.54 9116 + +===== Instance-level results ===== + +Total expected instances: 1943 +Total correct instances: 219 (strict) +Total correct instances: 904 (soft) +Total correct instances: 1434 (Levenshtein) +Total correct instances: 1294 (ObservedRatcliffObershelp) + +Instance-level recall: 11.27 (strict) +Instance-level recall: 46.53 (soft) +Instance-level recall: 73.8 (Levenshtein) +Instance-level recall: 66.6 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.58 83.03 76.31 79.53 85778 +date 99.23 94.6 84.25 89.13 87067 +first_author 98.53 89.78 82.49 85.98 85778 +inTitle 96.19 73.23 71.88 72.55 81007 +issue 99.67 91.09 87.74 89.38 16635 +page 98.61 94.57 83.7 88.81 80501 +title 97.21 79.67 75.3 77.42 80736 +volume 99.44 96.01 89.82 92.81 80067 + +all (micro avg.) 98.31 87.22 80.74 83.86 597569 +all (macro avg.) 98.31 87.75 81.44 84.45 597569 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.65 83.5 76.75 79.98 85778 +date 99.23 94.6 84.25 89.13 87067 +first_author 98.55 89.95 82.65 86.14 85778 +inTitle 97.85 84.92 83.36 84.13 81007 +issue 99.67 91.09 87.74 89.38 16635 +page 98.61 94.57 83.7 88.81 80501 +title 98.81 91.43 86.42 88.86 80736 +volume 99.44 96.01 89.82 92.81 80067 + +all (micro avg.) 98.73 90.61 83.89 87.12 597569 +all (macro avg.) 
98.73 90.76 84.34 87.41 597569 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.45 89.21 81.99 85.45 85778 +date 99.23 94.6 84.25 89.13 87067 +first_author 98.58 90.15 82.84 86.34 85778 +inTitle 98.03 86.18 84.59 85.38 81007 +issue 99.67 91.09 87.74 89.38 16635 +page 98.61 94.57 83.7 88.81 80501 +title 99.14 93.8 88.66 91.15 80736 +volume 99.44 96.01 89.82 92.81 80067 + +all (micro avg.) 98.9 91.96 85.14 88.42 597569 +all (macro avg.) 98.9 91.95 85.45 88.56 597569 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98 85.98 79.02 82.35 85778 +date 99.23 94.6 84.25 89.13 87067 +first_author 98.53 89.8 82.51 86 85778 +inTitle 97.65 83.49 81.95 82.72 81007 +issue 99.67 91.09 87.74 89.38 16635 +page 98.61 94.57 83.7 88.81 80501 +title 99.08 93.39 88.27 90.76 80736 +volume 99.44 96.01 89.82 92.81 80067 + +all (micro avg.) 98.78 91.01 84.25 87.5 597569 +all (macro avg.) 
98.78 91.12 84.66 87.74 597569 + +===== Instance-level results ===== + +Total expected instances: 90125 +Total extracted instances: 85902 +Total correct instances: 38762 (strict) +Total correct instances: 50900 (soft) +Total correct instances: 55783 (Levenshtein) +Total correct instances: 52319 (RatcliffObershelp) + +Instance-level precision: 45.12 (strict) +Instance-level precision: 59.25 (soft) +Instance-level precision: 64.94 (Levenshtein) +Instance-level precision: 60.91 (RatcliffObershelp) + +Instance-level recall: 43.01 (strict) +Instance-level recall: 56.48 (soft) +Instance-level recall: 61.9 (Levenshtein) +Instance-level recall: 58.05 (RatcliffObershelp) + +Instance-level f-score: 44.04 (strict) +Instance-level f-score: 57.83 (soft) +Instance-level f-score: 63.38 (Levenshtein) +Instance-level f-score: 59.44 (RatcliffObershelp) + +Matching 1 : 68328 + +Matching 2 : 4154 + +Matching 3 : 1863 + +Matching 4 : 662 + +Total matches : 75007 + +======= Citation context resolution ======= + +Total expected references: 90125 - 46.38 references per article +Total predicted references: 85902 - 44.21 references per article + +Total expected citation contexts: 139835 - 71.97 citation contexts per article +Total predicted citation contexts: 115373 - 59.38 citation contexts per article + +Total correct predicted citation contexts: 97277 - 50.07 citation contexts per article +Total wrong predicted citation contexts: 18096 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 84.32 +Recall citation contexts: 69.57 +fscore citation contexts: 76.23 + +======= Fulltext structures ======= + +Evaluation on 1943 random PDF files out of 1941 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +figure_title 96.63 31.44 24.61 27.61 7281 +reference_citation 59.15 57.43 58.68 58.05 134196 +reference_figure 94.74 61.21 65.9 63.47 19330 +reference_table 99.22 83.01 88.39 85.62 7327 +section_title 94.73 76.39 67.77 71.82 27619 +table_title 98.76 57.3 50.29 53.57 3971 + +all (micro avg.) 90.54 60.41 60.32 60.36 199724 +all (macro avg.) 90.54 61.13 59.27 60.02 199724 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +figure_title 98.52 78.68 61.58 69.09 7281 +reference_citation 61.86 61.68 63.03 62.35 134196 +reference_figure 94.6 61.69 66.41 63.97 19330 +reference_table 99.2 83.19 88.58 85.8 7327 +section_title 95.43 81.25 72.08 76.39 27619 +table_title 99.35 81.89 71.87 76.56 3971 + +all (micro avg.) 91.49 65.77 65.67 65.72 199724 +all (macro avg.) 91.49 74.73 70.59 72.36 199724 + +===== Document-level ratio results ===== + +label accuracy precision recall f1 support + + +all (micro avg.) 0 0 0 0 0 +all (macro avg.) 
0 0 0 0 0 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 128 + CONTENT_SIZE_TOO_SMALL: 78 + CONTENT_WIDTH_TOO_SMALL: 18 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 1613 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 152 + HEADER_NOT_CONSECUTIVE: 934 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 14 + HEADER_AND_CONTENT_INTERSECT: 557 + FEW_TOKENS_IN_HEADER: 1 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + UNMATCHED_REF_MARKERS: 10272 + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 3160 + STYLE_AUTHORS: 37131 + STYLE_NUMBERED: 52105 + MANY_CANDIDATES: 4770 + MANY_CANDIDATES_AFTER_POST_FILTERING: 709 + NO_CANDIDATES: 19507 + INPUT_REF_STRINGS_CNT: 91315 + MATCHED_REF_MARKERS: 115373 + NO_CANDIDATES_AFTER_POST_FILTERING: 475 + STYLE_OTHER: 2079 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + SKIPPED_BAD_STANDALONE_FIGURES: 613 + SKIPPED_DUE_TO_MISMATCH_OF_CAPTIONS_AND_VECTOR_AND_BITMAP_GRAPHICS: 3 + SKIPPED_SMALL_STANDALONE_FIGURES: 498 + SKIPPED_BIG_STANDALONE_FIGURES: 115 
+==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + HEADER_DOCTYPE: 3266 + AFFILIATION-ADDRESS_POSTCODE: 1 + CITATION_TITLE: 82283 + HEADER_DATE: 1073 + HEADER_KEYWORD: 1456 + NAME-HEADER_MIDDLENAME: 5932 + TABLE_FIGDESC: 4066 + NAME-HEADER_SURNAME: 14114 + NAME-CITATION_OTHER: 432612 + HEADER_FUNDING: 150 + CITATION_BOOKTITLE: 5933 + HEADER_ADDRESS: 6112 + HEADER_AFFILIATION: 6269 + CITATION_NOTE: 3133 + FULLTEXT_CITATION_MARKER: 181528 + TABLE_NOTE: 2609 + HEADER_EMAIL: 2244 + FULLTEXT_TABLE_MARKER: 14682 + FUNDING-ACKNOWLEDGEMENT_AFFILIATION: 552 + CITATION_WEB: 1381 + HEADER_GROUP: 5 + FULLTEXT_SECTION: 52100 + TABLE_LABEL: 3291 + FUNDING-ACKNOWLEDGEMENT_PROGRAMNAME: 158 + NAME-HEADER_FORENAME: 14294 + DATE_YEAR: 87269 + TABLE_CONTENT: 4786 + CITATION_COLLABORATION: 141 + HEADER_MEETING: 33 + CITATION_ISSUE: 16589 + HEADER_EDITOR: 137 + AFFILIATION-ADDRESS_SETTLEMENT: 2 + CITATION_SERIES: 183 + CITATION_JOURNAL: 78696 + NAME-CITATION_SURNAME: 328073 + TABLE_FIGURE_HEAD: 4695 + FULLTEXT_EQUATION_MARKER: 1665 + CITATION_OTHER: 451548 + FULLTEXT_FIGURE_MARKER: 37808 + HEADER_TITLE: 1972 + CITATION_TECH: 307 + FIGURE_CONTENT: 2856 + FIGURE_LABEL: 5958 + FULLTEXT_EQUATION_LABEL: 1922 + HEADER_OTHER: 10388 + AFFILIATION-ADDRESS_OTHER: 10 + FULLTEXT_EQUATION: 4342 + TABLE_OTHER: 1 + CITATION_DATE: 86567 + CITATION_AUTHOR: 86348 + FULLTEXT_FIGURE: 13959 + FULLTEXT_TABLE: 9311 + CITATION_EDITOR: 2522 + FULLTEXT_OTHER: 197 + HEADER_SUBMISSION: 1233 + NAME-HEADER_OTHER: 17663 + FUNDING-ACKNOWLEDGEMENT_PROJECTNAME: 201 + FIGURE_FIGDESC: 6971 + NAME-HEADER_SUFFIX: 20 + HEADER_AVAILABILITY: 5 + CITATION_VOLUME: 76288 + 
CITATION_LOCATION: 7629 + NAME-CITATION_SUFFIX: 392 + FUNDING-ACKNOWLEDGEMENT_INFRASTRUCTURE: 34 + AFFILIATION-ADDRESS_DEPARTMENT: 2 + NAME-HEADER_TITLE: 757 + FUNDING-ACKNOWLEDGEMENT_INSTITUTION: 742 + DATE_MONTH: 3247 + HEADER_WEB: 334 + AFFILIATION-ADDRESS_ADDRLINE: 1 + FUNDING-ACKNOWLEDGEMENT_PERSON: 4867 + HEADER_ABSTRACT: 2267 + CITATION_INSTITUTION: 1041 + HEADER_REFERENCE: 3174 + AFFILIATION-ADDRESS_INSTITUTION: 4 + FUNDING-ACKNOWLEDGEMENT_GRANTNAME: 299 + CITATION_PAGES: 80775 + HEADER_AUTHOR: 4317 + NAME-HEADER_MARKER: 8177 + AFFILIATION-ADDRESS_COUNTRY: 2 + DATE_OTHER: 5079 + FUNDING-ACKNOWLEDGEMENT_OTHER: 14046 + FUNDING-ACKNOWLEDGEMENT_FUNDERNAME: 3200 + NAME-CITATION_FORENAME: 318969 + CITATION_PUBLISHER: 7043 + FUNDING-ACKNOWLEDGEMENT_GRANTNUMBER: 2124 + HEADER_PUBNUM: 1858 + NAME-CITATION_MIDDLENAME: 65932 + CITATION_PUBNUM: 10306 + HEADER_COPYRIGHT: 2436 + FULLTEXT_PARAGRAPH: 379716 + FIGURE_FIGURE_HEAD: 9774 + DATE_DAY: 3060 +==================================================================================== + +************************************************************************************ +COUNTER: FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + STANDALONE_FIGURES: 409 + ASSIGNED_GRAPHICS_TO_FIGURES: 3885 +==================================================================================== +==================================================================================== + + diff --git a/grobid-trainer/doc/bioRxiv_test_2000.results.grobid-0.8-1-Glutton-DeLFT-BidLSTM-CRF-FEATURES_CITATIONS_reference_segmenter_header-09.08.2024 b/grobid-trainer/doc/bioRxiv_test_2000.results.grobid-0.8-1-Glutton-DeLFT-BidLSTM-CRF-FEATURES_CITATIONS_reference_segmenter_header-09.08.2024 new file mode 100644 index 0000000000..172556ea33 --- /dev/null +++ 
b/grobid-trainer/doc/bioRxiv_test_2000.results.grobid-0.8-1-Glutton-DeLFT-BidLSTM-CRF-FEATURES_CITATIONS_reference_segmenter_header-09.08.2024 @@ -0,0 +1,428 @@ + +-------------> GROBID failed on 0 PDF + +2000 PDF files processed in 1713.514 seconds, 0.856757 seconds per PDF file + +Evaluation metrics produced in 785.488 seconds + +======= Header metadata ======= + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 78 2.2 2.16 2.18 1990 +authors 96.05 83.2 82.49 82.84 1999 +first_author 99.15 97.02 96.29 96.66 1997 +keywords 95.82 58.71 59.83 59.27 839 +title 94.77 77.67 76.85 77.26 2000 + +all (micro avg.) 92.76 64.62 64.07 64.35 8825 +all (macro avg.) 92.76 63.76 63.53 63.64 8825 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 90.67 59.71 58.54 59.12 1990 +authors 96.16 83.7 82.99 83.35 1999 +first_author 99.2 97.23 96.49 96.86 1997 +keywords 96.32 63.86 65.08 64.46 839 +title 95.27 79.89 79.05 79.47 2000 + +all (micro avg.) 95.52 78.61 77.94 78.27 8825 +all (macro avg.) 95.52 76.88 76.43 76.65 8825 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 95.18 80.22 78.64 79.42 1990 +authors 98.06 92.18 91.4 91.79 1999 +first_author 99.26 97.48 96.75 97.11 1997 +keywords 97.82 79.42 80.93 80.17 839 +title 97.98 92.02 91.05 91.53 2000 + +all (micro avg.) 97.66 89.43 88.66 89.04 8825 +all (macro avg.) 
97.66 88.26 87.75 88 8825 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 94.45 76.88 75.38 76.12 1990 +authors 97.08 87.79 87.04 87.42 1999 +first_author 99.15 97.02 96.29 96.66 1997 +keywords 97.04 71.35 72.71 72.02 839 +title 97.05 87.87 86.95 87.41 2000 + +all (micro avg.) 96.96 85.86 85.12 85.49 8825 +all (macro avg.) 96.96 84.18 83.67 83.92 8825 + +===== Instance-level results ===== + +Total expected instances: 2000 +Total correct instances: 35 (strict) +Total correct instances: 708 (soft) +Total correct instances: 1222 (Levenshtein) +Total correct instances: 1046 (ObservedRatcliffObershelp) + +Instance-level recall: 1.75 (strict) +Instance-level recall: 35.4 (soft) +Instance-level recall: 61.1 (Levenshtein) +Instance-level recall: 52.3 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.4 88.16 83.24 85.63 97183 +date 98.87 91.69 86.31 88.92 97630 +doi 99.13 70.84 83.79 76.78 16894 +first_author 99.31 95.06 89.68 92.29 97183 +inTitle 97.69 82.83 79.4 81.08 96430 +issue 99.61 94.34 92.04 93.18 30312 +page 97.52 94.97 78.34 85.86 88597 +pmcid 99.95 66.38 86.12 74.97 807 +pmid 99.87 70.08 84.95 76.8 2093 +title 97.98 84.88 83.58 84.23 92463 +volume 99.46 96.23 95.23 95.73 87709 + +all (micro avg.) 98.89 89.85 85.34 87.54 707301 +all (macro avg.) 
98.89 85.04 85.7 85.04 707301 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.55 89.31 84.33 86.75 97183 +date 98.87 91.69 86.31 88.92 97630 +doi 99.26 75.34 89.11 81.65 16894 +first_author 99.37 95.48 90.08 92.7 97183 +inTitle 98.97 92.32 88.51 90.38 96430 +issue 99.61 94.34 92.04 93.18 30312 +page 97.52 94.97 78.34 85.86 88597 +pmcid 99.96 75.64 98.14 85.44 807 +pmid 99.89 74.5 90.3 81.64 2093 +title 99.08 93.23 91.8 92.51 92463 +volume 99.46 96.23 95.23 95.73 87709 + +all (micro avg.) 99.14 92.66 88.02 90.28 707301 +all (macro avg.) 99.14 88.46 89.47 88.61 707301 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 99.25 94.58 89.3 91.87 97183 +date 98.87 91.69 86.31 88.92 97630 +doi 99.32 77.6 91.79 84.1 16894 +first_author 99.39 95.63 90.22 92.85 97183 +inTitle 99.1 93.3 89.45 91.33 96430 +issue 99.61 94.34 92.04 93.18 30312 +page 97.52 94.97 78.34 85.86 88597 +pmcid 99.96 75.64 98.14 85.44 807 +pmid 99.89 74.5 90.3 81.64 2093 +title 99.46 96.05 94.58 95.31 92463 +volume 99.46 96.23 95.23 95.73 87709 + +all (micro avg.) 99.26 93.99 89.28 91.57 707301 +all (macro avg.) 99.26 89.51 90.52 89.66 707301 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.85 91.54 86.43 88.91 97183 +date 98.87 91.69 86.31 88.92 97630 +doi 99.28 76.04 89.94 82.41 16894 +first_author 99.32 95.1 89.72 92.33 97183 +inTitle 98.8 91.06 87.29 89.13 96430 +issue 99.61 94.34 92.04 93.18 30312 +page 97.52 94.97 78.34 85.86 88597 +pmcid 99.95 66.38 86.12 74.97 807 +pmid 99.87 70.08 84.95 76.8 2093 +title 99.37 95.35 93.89 94.62 92463 +volume 99.46 96.23 95.23 95.73 87709 + +all (micro avg.) 
99.17 93.02 88.36 90.63 707301 +all (macro avg.) 99.17 87.53 88.21 87.53 707301 + +===== Instance-level results ===== + +Total expected instances: 98799 +Total extracted instances: 98068 +Total correct instances: 43771 (strict) +Total correct instances: 54778 (soft) +Total correct instances: 58972 (Levenshtein) +Total correct instances: 55693 (RatcliffObershelp) + +Instance-level precision: 44.63 (strict) +Instance-level precision: 55.86 (soft) +Instance-level precision: 60.13 (Levenshtein) +Instance-level precision: 56.79 (RatcliffObershelp) + +Instance-level recall: 44.3 (strict) +Instance-level recall: 55.44 (soft) +Instance-level recall: 59.69 (Levenshtein) +Instance-level recall: 56.37 (RatcliffObershelp) + +Instance-level f-score: 44.47 (strict) +Instance-level f-score: 55.65 (soft) +Instance-level f-score: 59.91 (Levenshtein) +Instance-level f-score: 56.58 (RatcliffObershelp) + +Matching 1 : 79296 + +Matching 2 : 4442 + +Matching 3 : 4371 + +Matching 4 : 2084 + +Total matches : 90193 + +======= Citation context resolution ======= + +Total expected references: 98797 - 49.4 references per article +Total predicted references: 98068 - 49.03 references per article + +Total expected citation contexts: 142862 - 71.43 citation contexts per article +Total predicted citation contexts: 135692 - 67.85 citation contexts per article + +Total correct predicted citation contexts: 116736 - 58.37 citation contexts per article +Total wrong predicted citation contexts: 18956 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 86.03 +Recall citation contexts: 81.71 +fscore citation contexts: 83.82 + +======= Fulltext structures ======= + +Evaluation on 2000 random PDF files out of 1998 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.83 29.95 25.78 27.71 446 +figure_title 90.58 4.23 2.01 2.72 22978 +funding_stmt 98.64 4.16 24.43 7.11 745 +reference_citation 75.64 71.05 71.33 71.19 147470 +reference_figure 91.7 70.59 67.74 69.14 47984 +reference_table 98.18 48.11 83.03 60.92 5957 +section_title 94.75 72.59 69.6 71.06 32398 +table_title 98.2 4.31 2.85 3.43 3925 + +all (micro avg.) 93.44 65.46 63.41 64.42 261903 +all (macro avg.) 93.44 38.12 43.35 39.16 261903 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.86 50.52 43.5 46.75 446 +figure_title 94.27 69.47 32.91 44.67 22978 +funding_stmt 98.53 4.37 25.64 7.46 745 +reference_citation 84.56 83.04 83.37 83.21 147470 +reference_figure 91.15 71.22 68.34 69.75 47984 +reference_table 98.05 48.56 83.8 61.49 5957 +section_title 95.04 76.47 73.32 74.86 32398 +table_title 98.81 51.44 34.06 40.99 3925 + +all (micro avg.) 95.03 76.38 73.99 75.17 261903 +all (macro avg.) 95.03 56.89 55.62 53.65 261903 + +===== Document-level ratio results ===== + +label accuracy precision recall f1 support + +availability_stmt 65.75 84.77 86.1 85.43 446 + +all (micro avg.) 65.75 84.77 86.1 85.43 446 +all (macro avg.) 
65.75 84.77 86.1 85.43 446 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 3131 + UNMATCHED_REF_MARKERS: 6386 + STYLE_AUTHORS: 40708 + STYLE_NUMBERED: 55611 + MANY_CANDIDATES: 5922 + MANY_CANDIDATES_AFTER_POST_FILTERING: 686 + NO_CANDIDATES: 8881 + INPUT_REF_STRINGS_CNT: 98118 + MATCHED_REF_MARKERS: 135692 + NO_CANDIDATES_AFTER_POST_FILTERING: 954 + STYLE_OTHER: 1799 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 177 + CONTENT_SIZE_TOO_SMALL: 56 + CONTENT_WIDTH_TOO_SMALL: 2 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 4588 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 184 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 97 + HEADER_NOT_CONSECUTIVE: 363 + HEADER_AND_CONTENT_INTERSECT: 203 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + SKIPPED_BAD_STANDALONE_FIGURES: 5344 + SKIPPED_DUE_TO_MISMATCH_OF_CAPTIONS_AND_VECTOR_AND_BITMAP_GRAPHICS: 17 + SKIPPED_SMALL_STANDALONE_FIGURES: 2706 + SKIPPED_BIG_STANDALONE_FIGURES: 2638 + 
TOO_MANY_FIGURES_PER_PAGE: 5 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + HEADER_DOCTYPE: 77 + AFFILIATION-ADDRESS_POSTCODE: 2 + CITATION_TITLE: 95289 + HEADER_DATE: 128 + HEADER_KEYWORD: 943 + NAME-HEADER_MIDDLENAME: 5296 + TABLE_FIGDESC: 3075 + NAME-HEADER_SURNAME: 13980 + NAME-CITATION_OTHER: 626355 + CITATION_BOOKTITLE: 4119 + HEADER_FUNDING: 86 + HEADER_ADDRESS: 7437 + HEADER_AFFILIATION: 7861 + CITATION_NOTE: 2905 + FULLTEXT_CITATION_MARKER: 192257 + TABLE_NOTE: 3805 + HEADER_EMAIL: 2628 + FULLTEXT_TABLE_MARKER: 19422 + FUNDING-ACKNOWLEDGEMENT_AFFILIATION: 572 + CITATION_WEB: 7080 + HEADER_GROUP: 7 + FULLTEXT_SECTION: 68305 + TABLE_LABEL: 2256 + FUNDING-ACKNOWLEDGEMENT_PROGRAMNAME: 264 + DATE_YEAR: 100147 + NAME-HEADER_FORENAME: 14297 + TABLE_CONTENT: 4627 + CITATION_COLLABORATION: 277 + CITATION_ISSUE: 30124 + HEADER_MEETING: 4 + HEADER_EDITOR: 4 + AFFILIATION-ADDRESS_SETTLEMENT: 2 + CITATION_SERIES: 126 + CITATION_JOURNAL: 91336 + NAME-CITATION_SURNAME: 403488 + TABLE_FIGURE_HEAD: 4627 + FULLTEXT_EQUATION_MARKER: 3844 + CITATION_OTHER: 547124 + FULLTEXT_FIGURE_MARKER: 85170 + HEADER_TITLE: 2058 + CITATION_TECH: 337 + FIGURE_CONTENT: 4811 + FIGURE_LABEL: 11937 + FULLTEXT_EQUATION_LABEL: 6859 + HEADER_OTHER: 7878 + AFFILIATION-ADDRESS_OTHER: 11 + AFFILIATION-ADDRESS_LABORATORY: 1 + FULLTEXT_EQUATION: 16851 + CITATION_DATE: 100647 + CITATION_AUTHOR: 97282 + FULLTEXT_FIGURE: 31693 + FULLTEXT_TABLE: 13122 + AFFILIATION-ADDRESS_MARKER: 3 + CITATION_EDITOR: 904 + FULLTEXT_OTHER: 925 + HEADER_SUBMISSION: 59 + NAME-HEADER_OTHER: 16488 + FUNDING-ACKNOWLEDGEMENT_PROJECTNAME: 198 + AFFILIATION-ADDRESS_POSTBOX: 1 + 
FIGURE_FIGDESC: 15413 + HEADER_AVAILABILITY: 81 + NAME-HEADER_SUFFIX: 11 + CITATION_VOLUME: 88118 + CITATION_LOCATION: 3307 + FUNDING-ACKNOWLEDGEMENT_INFRASTRUCTURE: 131 + NAME-CITATION_SUFFIX: 153 + AFFILIATION-ADDRESS_DEPARTMENT: 2 + FUNDING-ACKNOWLEDGEMENT_INSTITUTION: 1120 + NAME-HEADER_TITLE: 524 + DATE_MONTH: 5633 + HEADER_WEB: 26 + FUNDING-ACKNOWLEDGEMENT_PERSON: 5527 + HEADER_ABSTRACT: 2360 + CITATION_INSTITUTION: 600 + HEADER_REFERENCE: 244 + AFFILIATION-ADDRESS_INSTITUTION: 4 + FUNDING-ACKNOWLEDGEMENT_GRANTNAME: 515 + CITATION_PAGES: 89019 + HEADER_AUTHOR: 2964 + NAME-HEADER_MARKER: 11983 + AFFILIATION-ADDRESS_COUNTRY: 2 + DATE_OTHER: 7346 + FUNDING-ACKNOWLEDGEMENT_OTHER: 17956 + FUNDING-ACKNOWLEDGEMENT_FUNDERNAME: 4648 + NAME-CITATION_FORENAME: 403393 + CITATION_PUBLISHER: 4379 + FUNDING-ACKNOWLEDGEMENT_GRANTNUMBER: 3396 + HEADER_PUBNUM: 125 + CITATION_PUBNUM: 21585 + NAME-CITATION_MIDDLENAME: 90431 + FULLTEXT_PARAGRAPH: 505587 + HEADER_COPYRIGHT: 55 + FIGURE_FIGURE_HEAD: 22976 + DATE_DAY: 3754 +==================================================================================== + +************************************************************************************ +COUNTER: FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + STANDALONE_FIGURES: 1802 + ASSIGNED_GRAPHICS_TO_FIGURES: 4018 +==================================================================================== +==================================================================================== diff --git a/grobid-trainer/doc/eLife_984.results.grobid-0.8-1--Glutton-DeLFT-BidLSTM-CRF-FEATURES_citations_reference_segmenter-09.08.2024 b/grobid-trainer/doc/eLife_984.results.grobid-0.8-1--Glutton-DeLFT-BidLSTM-CRF-FEATURES_citations_reference_segmenter-09.08.2024 new file mode 100644 index 0000000000..9027d5b255 --- /dev/null +++ 
b/grobid-trainer/doc/eLife_984.results.grobid-0.8-1--Glutton-DeLFT-BidLSTM-CRF-FEATURES_citations_reference_segmenter-09.08.2024 @@ -0,0 +1,402 @@ +-------------> GROBID failed on 1 PDF + +984 PDF files processed in 1130.986 seconds, 1.1493760162601627 seconds per PDF file + +Evaluation metrics produced in 634.047 seconds + +======= Header metadata ======= + +Evaluation on 983 random PDF files out of 982 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 77.29 9.44 9.16 9.3 983 +authors 93.36 74.28 73.52 73.9 982 +first_author 97.84 92.39 91.54 91.96 981 +title 96.26 86.81 85.05 85.92 983 + +all (micro avg.) 91.19 65.96 64.8 65.37 3929 +all (macro avg.) 91.19 65.73 64.82 65.27 3929 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 80.44 22.46 21.77 22.11 983 +authors 93.44 74.59 73.83 74.21 982 +first_author 97.84 92.39 91.54 91.96 981 +title 98.22 94.81 92.88 93.83 983 + +all (micro avg.) 92.48 71.24 69.99 70.61 3929 +all (macro avg.) 92.48 71.06 70 70.53 3929 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 86.52 47.53 46.08 46.8 983 +authors 96.8 88.17 87.27 87.72 982 +first_author 97.91 92.7 91.85 92.27 981 +title 98.58 96.26 94.3 95.27 983 + +all (micro avg.) 94.95 81.3 79.87 80.58 3929 +all (macro avg.) 94.95 81.16 79.88 80.51 3929 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +abstract 85.78 44.49 43.13 43.8 983 +authors 94.76 79.94 79.12 79.53 982 +first_author 97.84 92.39 91.54 91.96 981 +title 98.58 96.26 94.3 95.27 983 + +all (micro avg.) 
94.24 78.39 77.02 77.7 3929 +all (macro avg.) 94.24 78.27 77.02 77.64 3929 + +===== Instance-level results ===== + +Total expected instances: 983 +Total correct instances: 73 (strict) +Total correct instances: 198 (soft) +Total correct instances: 377 (Levenshtein) +Total correct instances: 335 (ObservedRatcliffObershelp) + +Instance-level recall: 7.43 (strict) +Instance-level recall: 20.14 (soft) +Instance-level recall: 38.35 (Levenshtein) +Instance-level recall: 34.08 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 983 random PDF files out of 982 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 96.99 79.4 78.19 78.79 63170 +date 99.39 95.86 93.99 94.91 63567 +first_author 99.22 94.76 93.28 94.02 63170 +inTitle 99.38 95.77 94.68 95.22 63118 +issue 99.86 1.99 75 3.88 16 +page 99.36 96.26 95.2 95.72 53303 +title 98.57 90.25 90.68 90.47 61950 +volume 99.65 97.85 98.17 98.01 60955 + +all (micro avg.) 99.05 92.66 91.93 92.29 429249 +all (macro avg.) 99.05 81.52 89.9 81.38 429249 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 97.01 79.54 78.33 78.93 63170 +date 99.39 95.86 93.99 94.91 63567 +first_author 99.23 94.84 93.36 94.1 63170 +inTitle 99.45 96.25 95.15 95.7 63118 +issue 99.86 1.99 75 3.88 16 +page 99.36 96.26 95.2 95.72 53303 +title 99.4 95.92 96.38 96.15 61950 +volume 99.65 97.85 98.17 98.01 60955 + +all (micro avg.) 99.17 93.59 92.85 93.22 429249 +all (macro avg.) 
99.17 82.31 90.7 82.17 429249 + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 99.01 93.29 91.87 92.58 63170 +date 99.39 95.86 93.99 94.91 63567 +first_author 99.3 95.29 93.8 94.54 63170 +inTitle 99.5 96.58 95.47 96.02 63118 +issue 99.86 1.99 75 3.88 16 +page 99.36 96.26 95.2 95.72 53303 +title 99.65 97.66 98.12 97.89 61950 +volume 99.65 97.85 98.17 98.01 60955 + +all (micro avg.) 99.47 95.97 95.21 95.59 429249 +all (macro avg.) 99.47 84.35 92.7 84.19 429249 + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +authors 98.05 86.71 85.39 86.05 63170 +date 99.39 95.86 93.99 94.91 63567 +first_author 99.22 94.78 93.3 94.03 63170 +inTitle 99.45 96.25 95.16 95.7 63118 +issue 99.86 1.99 75 3.88 16 +page 99.36 96.26 95.2 95.72 53303 +title 99.63 97.5 97.97 97.74 61950 +volume 99.65 97.85 98.17 98.01 60955 + +all (micro avg.) 99.33 94.87 94.11 94.49 429249 +all (macro avg.) 
99.33 83.4 91.77 83.26 429249 + +===== Instance-level results ===== + +Total expected instances: 63569 +Total extracted instances: 66388 +Total correct instances: 42246 (strict) +Total correct instances: 45085 (soft) +Total correct instances: 52715 (Levenshtein) +Total correct instances: 49331 (RatcliffObershelp) + +Instance-level precision: 63.63 (strict) +Instance-level precision: 67.91 (soft) +Instance-level precision: 79.4 (Levenshtein) +Instance-level precision: 74.31 (RatcliffObershelp) + +Instance-level recall: 66.46 (strict) +Instance-level recall: 70.92 (soft) +Instance-level recall: 82.93 (Levenshtein) +Instance-level recall: 77.6 (RatcliffObershelp) + +Instance-level f-score: 65.02 (strict) +Instance-level f-score: 69.38 (soft) +Instance-level f-score: 81.13 (Levenshtein) +Instance-level f-score: 75.92 (RatcliffObershelp) + +Matching 1 : 58505 + +Matching 2 : 1012 + +Matching 3 : 1242 + +Matching 4 : 371 + +Total matches : 61130 + +======= Citation context resolution ======= + +Total expected references: 63569 - 64.67 references per article +Total predicted references: 66388 - 67.54 references per article + +Total expected citation contexts: 108880 - 110.76 citation contexts per article +Total predicted citation contexts: 99284 - 101 citation contexts per article + +Total correct predicted citation contexts: 95494 - 97.15 citation contexts per article +Total wrong predicted citation contexts: 3790 (wrong callout matching, callout missing in NLM, or matching with a bib. ref. not aligned with a bib.ref. in NLM) + +Precision citation contexts: 96.18 +Recall citation contexts: 87.71 +fscore citation contexts: 91.75 + +======= Fulltext structures ======= + +Evaluation on 983 random PDF files out of 982 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.76 29.94 26.71 28.24 584 +figure_title 87.45 0.02 0.01 0.01 31671 +funding_stmt 98.47 4.77 23.8 7.95 920 +reference_citation 70.68 55.46 55.67 55.56 108807 +reference_figure 81.67 56.78 49.91 53.12 68786 +reference_table 99.56 68.24 73.46 70.75 2381 +section_title 97.44 85.17 74.17 79.29 21808 +table_title 99.22 0.45 0.16 0.23 1924 + +all (micro avg.) 91.78 54.74 47.79 51.03 236881 +all (macro avg.) 91.78 37.6 37.99 36.89 236881 + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 support + +availability_stmt 99.75 38.96 34.76 36.74 584 +figure_title 89.05 48.86 15.12 23.09 31671 +funding_stmt 98.22 4.77 23.8 7.95 920 +reference_citation 93.28 91.04 91.38 91.21 108807 +reference_figure 78.87 57.06 50.16 53.39 68786 +reference_table 99.49 68.32 73.54 70.83 2381 +section_title 97.16 86.05 74.93 80.1 21808 +table_title 99.48 80.63 27.91 41.47 1924 + +all (micro avg.) 94.41 76.29 66.6 71.12 236881 +all (macro avg.) 94.41 59.46 48.95 50.6 236881 + +===== Document-level ratio results ===== + +label accuracy precision recall f1 support + +availability_stmt 83.49 96.3 89.21 92.62 584 + +all (micro avg.) 83.49 96.3 89.21 92.62 584 +all (macro avg.) 
83.49 96.3 89.21 92.62 584 + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + CANNOT_PARSE_LABEL_TO_INT: 7 + CONTENT_SIZE_TOO_SMALL: 31 + CONTENT_WIDTH_TOO_SMALL: 1 + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 1704 + HEADER_NOT_STARTS_WITH_TABLE_WORD: 52 + HEADER_NOT_CONSECUTIVE: 164 + HEADER_AND_CONTENT_DIFFERENT_PAGES: 5 + HEADER_AND_CONTENT_INTERSECT: 71 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 6448 + UNMATCHED_REF_MARKERS: 3770 + STYLE_AUTHORS: 64039 + STYLE_NUMBERED: 1241 + MANY_CANDIDATES: 7611 + MANY_CANDIDATES_AFTER_POST_FILTERING: 866 + NO_CANDIDATES: 8187 + INPUT_REF_STRINGS_CNT: 66598 + MATCHED_REF_MARKERS: 99284 + NO_CANDIDATES_AFTER_POST_FILTERING: 294 + STYLE_OTHER: 1318 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + SKIPPED_BAD_STANDALONE_FIGURES: 1632 + SKIPPED_DUE_TO_MISMATCH_OF_CAPTIONS_AND_VECTOR_AND_BITMAP_GRAPHICS: 1 + SKIPPED_SMALL_STANDALONE_FIGURES: 1326 + SKIPPED_BIG_STANDALONE_FIGURES: 306 + 
TOO_MANY_FIGURES_PER_PAGE: 2 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + HEADER_DOCTYPE: 1258 + CITATION_TITLE: 63651 + HEADER_DATE: 929 + NAME-HEADER_MIDDLENAME: 4489 + HEADER_KEYWORD: 288 + TABLE_FIGDESC: 1106 + NAME-HEADER_SURNAME: 12149 + NAME-CITATION_OTHER: 313974 + HEADER_FUNDING: 315 + CITATION_BOOKTITLE: 1821 + HEADER_ADDRESS: 5313 + HEADER_AFFILIATION: 5919 + CITATION_NOTE: 651 + FULLTEXT_CITATION_MARKER: 131273 + TABLE_NOTE: 1175 + HEADER_EMAIL: 1384 + FULLTEXT_TABLE_MARKER: 4585 + FUNDING-ACKNOWLEDGEMENT_AFFILIATION: 636 + CITATION_WEB: 45809 + HEADER_GROUP: 1 + FULLTEXT_SECTION: 43257 + TABLE_LABEL: 618 + FUNDING-ACKNOWLEDGEMENT_PROGRAMNAME: 235 + DATE_YEAR: 65916 + NAME-HEADER_FORENAME: 12801 + TABLE_CONTENT: 1779 + CITATION_COLLABORATION: 141 + CITATION_ISSUE: 807 + HEADER_MEETING: 1 + HEADER_EDITOR: 926 + CITATION_SERIES: 47 + CITATION_JOURNAL: 62533 + NAME-CITATION_SURNAME: 370790 + TABLE_FIGURE_HEAD: 1251 + FULLTEXT_EQUATION_MARKER: 1190 + CITATION_OTHER: 375507 + FULLTEXT_FIGURE_MARKER: 110783 + HEADER_TITLE: 1014 + CITATION_TECH: 115 + FIGURE_CONTENT: 2846 + FIGURE_LABEL: 10277 + FULLTEXT_EQUATION_LABEL: 1638 + HEADER_OTHER: 7768 + FULLTEXT_EQUATION: 4937 + CITATION_DATE: 65096 + CITATION_AUTHOR: 66578 + FULLTEXT_FIGURE: 25071 + FULLTEXT_TABLE: 5440 + CITATION_EDITOR: 423 + FULLTEXT_OTHER: 63 + HEADER_SUBMISSION: 1664 + NAME-HEADER_OTHER: 8631 + FUNDING-ACKNOWLEDGEMENT_PROJECTNAME: 180 + FIGURE_FIGDESC: 12559 + HEADER_AVAILABILITY: 5 + NAME-HEADER_SUFFIX: 7 + CITATION_VOLUME: 61605 + CITATION_LOCATION: 3972 + NAME-CITATION_SUFFIX: 15 + FUNDING-ACKNOWLEDGEMENT_INFRASTRUCTURE: 171 + 
FUNDING-ACKNOWLEDGEMENT_INSTITUTION: 819 + NAME-HEADER_TITLE: 166 + DATE_MONTH: 1279 + HEADER_WEB: 169 + FUNDING-ACKNOWLEDGEMENT_PERSON: 5219 + HEADER_ABSTRACT: 1148 + CITATION_INSTITUTION: 2564 + HEADER_REFERENCE: 2046 + FUNDING-ACKNOWLEDGEMENT_GRANTNAME: 389 + CITATION_PAGES: 61658 + HEADER_AUTHOR: 3380 + NAME-HEADER_MARKER: 7440 + DATE_OTHER: 1691 + FUNDING-ACKNOWLEDGEMENT_OTHER: 14918 + FUNDING-ACKNOWLEDGEMENT_FUNDERNAME: 3245 + NAME-CITATION_FORENAME: 375540 + CITATION_PUBLISHER: 2026 + FUNDING-ACKNOWLEDGEMENT_GRANTNUMBER: 3063 + HEADER_PUBNUM: 6026 + CITATION_PUBNUM: 60525 + NAME-CITATION_MIDDLENAME: 3224 + HEADER_COPYRIGHT: 1012 + FULLTEXT_PARAGRAPH: 362605 + FIGURE_FIGURE_HEAD: 19715 + DATE_DAY: 1341 +==================================================================================== + +************************************************************************************ +COUNTER: FigureCounters +************************************************************************************ +------------------------------------------------------------------------------------ + STANDALONE_FIGURES: 411 + ASSIGNED_GRAPHICS_TO_FIGURES: 3488 +==================================================================================== +====================================================================================