Skip to content

Commit

Permalink
document: PDF metadata extraction
Browse files (browse the repository at this point in the history)
* New API for metadata and full-text extraction from a PDF document.

Signed-off-by: Sébastien Délèze <[email protected]>
  • Loading branch information
Sébastien Délèze committed Sep 26, 2019
1 parent 72c564e commit 1ff5b6a
Show file tree
Hide file tree
Showing 35 changed files with 1,354 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ python:

before_install:
- sudo apt-get update
- sudo apt-get install -y libxml2 libxml2-dev libxmlsec1 libxmlsec1-dev
- sudo apt-get install -y libxml2 libxml2-dev libxmlsec1 libxmlsec1-dev xpdf
# Stop default travis services
- "sudo service mysql stop"
- "sudo service postgresql stop"
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

FROM inveniosoftware/centos7-python:3.6

RUN yum -y install libxml2-devel xmlsec1-devel xmlsec1-openssl-devel libtool-ltdl-devel
RUN yum -y install libxml2-devel xmlsec1-devel xmlsec1-openssl-devel libtool-ltdl-devel xpdf

COPY Pipfile Pipfile.lock ./
RUN pipenv install --deploy --system
6 changes: 5 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,14 @@ recursive-include docs *.txt
recursive-include docs Makefile
recursive-include sonar *.gitkeep
recursive-include sonar *.po *.pot *.mo
recursive-include sonar *.json *.html *.js *.scss
recursive-include sonar *.json *.html *.js *.scss *.css
recursive-include sonar *.png *.jpg *.svg
recursive-include docker *.cfg *.conf *.crt *.ini *.key *.pem *.sh
recursive-include tests *.py
recursive-include tests *.doc
recursive-include tests *.json
recursive-include tests *.pdf
recursive-include tests *.xml

# added by check_manifest.py
include *.html
Expand Down
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ lxml = ">=3.5.0,<4.2.6"
orcid = "*"
python-slugify = "*"
python3-saml = "*"
xmltodict = "*"

[dev-packages]
Flask-Debugtoolbar = ">=0.10.1"
Expand All @@ -33,6 +34,7 @@ pytest-mock = ">=1.6.0"
pytest-pep8 = ">=1.0.6"
pytest-random-order = ">=0.5.4"
pytest-runner = ">=3.0.0,<5"
docutils = "==0.15"

[requires]
python_version = "3.6"
Expand Down
118 changes: 63 additions & 55 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions docker-compose.full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,9 @@ services:
extends:
file: docker-services.yml
service: es
grobid:
extends:
file: docker-services.yml
service: grobid
volumes:
static_data:
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ services:
extends:
file: docker-services.yml
service: es
grobid:
extends:
file: docker-services.yml
service: grobid
5 changes: 5 additions & 0 deletions docker-services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,8 @@ services:
command: --broker=amqp://guest:guest@mq:5672/ --broker_api=http://guest:guest@mq:15672/api/
ports:
- "5555:5555"
grobid:
image: lfoppiano/grobid:0.5.5
ports:
- "8070:8070"
- "8071:8071"
3 changes: 2 additions & 1 deletion docker/nginx/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y \
libxml2 \
libxml2-dev \
libxmlsec1 \
libxmlsec1-dev
libxmlsec1-dev \
xpdf

COPY nginx.conf /etc/nginx/nginx.conf
COPY conf.d/* /etc/nginx/conf.d/
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@
'documents = sonar.modules.documents.views:blueprint',
'shibboleth_authenticator = \
sonar.modules.shibboleth_authenticator.views.client:blueprint',
'pdf_extractor = \
sonar.modules.pdf_extractor.views.client:blueprint'
],
'invenio_base.api_blueprints': [
'pdf_extractor = sonar.modules.pdf_extractor.views.api:blueprint'
],
'invenio_assets.webpack': [
'sonar_theme = sonar.theme.webpack:theme',
Expand All @@ -71,6 +76,7 @@
'sonar_documents = sonar.modules.documents.config',
'shibboleth_authenticator = \
sonar.modules.shibboleth_authenticator.config',
'pdf_extractor = sonar.modules.pdf_extractor.config',
],
'invenio_i18n.translations': [
'messages = sonar',
Expand Down
Loading

0 comments on commit 1ff5b6a

Please sign in to comment.