Skip to content
This repository has been archived by the owner on Sep 25, 2019. It is now read-only.

Commit

Permalink
Add unoconv-based script to normalize office docs
Browse files Browse the repository at this point in the history
  • Loading branch information
mistydemeo committed Jan 20, 2016
1 parent be4b342 commit 113a1c1
Showing 1 changed file with 41 additions and 0 deletions.
41 changes: 41 additions & 0 deletions normalization/preservation-office-pdfa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import subprocess
import sys

# Python 2/3 compatibility: Python 2 provides a lazy xrange(), which is
# bound over range() here.  On Python 3 the name xrange does not exist,
# the lookup raises NameError, and the builtin range() is left untouched.
try:
    xrange
except NameError:
    pass
else:
    range = xrange

# Default number of conversion attempts made by convert().
MAX_TRIES = 3


def convert(document, target, max_tries=MAX_TRIES):
    """
    Convert an office document to PDF with unoconv, retrying on failure.

    unoconv is run up to max_tries times.  The first calls may fail while
    a LibreOffice server is still spinning up, so a failed attempt is
    simply repeated until one succeeds or the tries are used up.

    document  -- path of the file to convert
    target    -- path the PDF is written to (passed as --output)
    max_tries -- maximum number of unoconv invocations

    Returns 0 as soon as one invocation exits successfully, and 1 if
    every attempt exited with a non-zero status (or no attempt was made).
    Only a non-zero exit status leads to a retry; any other error raised
    while running the command propagates to the caller.
    """
    # SelectPdfVersion=1 asks for PDF/A-1a output rather than the
    # default PDF 1.4.
    command = [
        'unoconv',
        '-eSelectPdfVersion=1',
        '--output={}'.format(target),
        document,
    ]

    for _attempt in range(0, max_tries):
        exit_status = subprocess.call(command)
        if exit_status == 0:
            return 0

    return 1

if __name__ == '__main__':
    # Script entry point: read the input document and the output path
    # (without extension) from the command line, convert the document,
    # and exit with convert()'s status (0 on success, 1 on failure).
    import argparse

    parser = argparse.ArgumentParser()
    # Both options are mandatory.  Without required=True a missing option
    # is parsed as None, which then fails with a TypeError traceback
    # (None + '.pdf', or None in the subprocess argument list) instead of
    # a readable usage error.
    parser.add_argument("--file-full-name", dest="input", required=True)
    parser.add_argument("--output-file-path", dest="output", required=True)
    # parse_known_args() tolerates any extra arguments the caller passes;
    # they are returned separately and ignored here.
    args, _ = parser.parse_known_args()

    # The output path is given without an extension; append ".pdf".
    sys.exit(convert(args.input, args.output + '.pdf'))

0 comments on commit 113a1c1

Please sign in to comment.