-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexport_texts_from_pdf.py
180 lines (136 loc) · 6.76 KB
/
export_texts_from_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
https://stackoverflow.com/questions/22898145/how-to-export-text-and-text-coordinates-from-a-pdf-filepdf
"""
import json
from collections import defaultdict
from pathlib import Path
from pprint import pprint
from typing import Iterable, Any
import pandas as pd
from PIL import Image
from PIL.ImageDraw import ImageDraw
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal, LTPage, LTFigure
from cs_uk_transliteration import cs_to_uk, uk_to_cs
from export_images_from_layers import to_percent
def load_translations(
        path=Path(
            r'E:\code\book-for-ukraine\static\in\Překlady textů a webu – Povídání modro-žluté krajiny - Dvojverší.csv')
):
    """
    Load the translation table and index it by translated text.

    Data source:
    https://docs.google.com/spreadsheets/d/1Gb0XqligJd2rXgUbyGpjv3FkJ4oXwpma8pFi9hiwHqY/edit#gid=0
    (exported as csv)

    The CSV is expected to contain the columns ``ID`` and ``align`` plus one
    ``translation_<lang>`` column per language.

    :param path: location of the exported CSV file.
    :return: tuple ``(text_to_id_lang, id_to_align)``:
        ``text_to_id_lang`` maps every non-empty translated text to
        ``{'lang': <lang>, 'name': <ID>, 'align': <align>}``;
        ``id_to_align`` maps every ``ID`` to its ``align`` value.
    """
    prefix = 'translation_'
    df = pd.read_csv(path)
    # Strip only the prefix: splitting on '_' would truncate language codes
    # that contain an underscore themselves (e.g. ``pt_BR``) and the column
    # lookup below would then fail.
    lang_by_column = {
        column: column[len(prefix):]
        for column in df.columns
        if column.startswith(prefix)
    }

    text_to_id_lang = {}
    id_to_align = {}
    for _, row in df.iterrows():
        id_to_align[row['ID']] = row['align']
        for column, lang in lang_by_column.items():
            text = row[column]
            # An empty cell is read as NaN; it is not a text that can ever be
            # found in the PDF, so it must not become a lookup key.
            if pd.isna(text):
                continue
            text_to_id_lang[text] = {'lang': lang, 'name': row['ID'], 'align': row['align']}
    return text_to_id_lang, id_to_align
def export_texts_from_pdf(o: Any, text_to_id_lang, id_to_align, texts, images, depth=0):
    """
    Walk a pdfminer layout tree and record every translated text box.

    The function recurses into every iterable layout object. For each
    ``LTTextBoxHorizontal`` that is a direct child of an ``LTPage`` it looks
    the text up in ``text_to_id_lang``, stores the text (plus its
    transliteration) and its position in ``texts`` and draws the outline of
    the box into a debug image in ``images``.

    Each PDF page is treated as a spread of two book pages: the page figure
    ``Im0`` is widened by the cut margins and split in half, and positions are
    expressed relative to the half the text box starts in.

    :param o: a layout object or an iterable of layout objects (e.g. the
        result of ``extract_pages``); non-iterable objects are ignored.
    :param text_to_id_lang: maps a text to a dict with the keys ``name``,
        ``lang`` and ``align``.
    :param id_to_align: maps a text name to its alignment.
    :param texts: output dict, filled in place, keyed by text name.
    :param images: output dict, filled in place, mapping the book page index
        to its debug image; if ``None`` a throw-away dict is used.
    :param depth: recursion depth; passed down but not otherwise used.
    :raises ValueError: if a processed page does not contain exactly one
        figure named ``Im0`` or a text is missing from ``text_to_id_lang``.
    """
    import math  # local import: the file does not import math at module level

    # 2x A5 = 2 * 148 = 296, one less than A4 (297)
    width_after_cut = 296
    height_after_cut = 210
    # Margins that were cut off the page image (presumably in millimetres,
    # like the A5 measures above).
    cuts = {'left': 2.033, 'right': 2.086, 'top': 5.869, 'bottom': 2.462}
    width_before_cut = cuts['left'] + width_after_cut + cuts['right']
    height_before_cut = cuts['top'] + height_after_cut + cuts['bottom']
    # Sanity check of the margins; compared with a tolerance because a sum of
    # floats is not guaranteed to be bit-identical to the decimal literal.
    assert math.isclose(width_before_cut, 300.119)
    assert math.isclose(height_before_cut, 218.331)
    # Every margin as a fraction of the cut size along its own axis.
    to_readd_pct = {direction: cut_value / (width_after_cut if direction in ('left', 'right') else height_after_cut)
                    for direction, cut_value in cuts.items()}

    if images is None:
        images = {}
    if not isinstance(o, Iterable):
        return

    page_image = None
    image_width = None
    image_height = None
    for i in o:
        export_texts_from_pdf(i, text_to_id_lang, id_to_align, texts, images, depth=depth + 1)

        # Positions are computed relative to the page figure, so only direct
        # children of a page can be processed below.
        if not isinstance(o, LTPage):
            continue
        page = o.pageid
        # We skip the title (page 1) and the last page (page 7)
        if page in (1, 7):
            continue

        if page_image is None:
            # The figure and the derived sizes are the same for every child
            # of the page, so they are looked up only once.
            page_images = [x for x in o if isinstance(x, LTFigure) and x.name == 'Im0']
            if len(page_images) != 1:
                raise ValueError(f"Page {o} doesn't have one figure, instead has {page_images}")
            page_image = page_images[0]
            fixed_width = page_image.width + (page_image.width * (to_readd_pct['left'] + to_readd_pct['right']))
            fixed_height = page_image.height + (page_image.height * (to_readd_pct['top'] + to_readd_pct['bottom']))
            # One PDF page holds two book pages side by side.
            image_width = fixed_width / 2
            image_height = fixed_height

        if not isinstance(i, LTTextBoxHorizontal):
            continue

        text = '\n'.join(line.get_text().strip() for line in i)
        # no need to fix offset/sizes (consider fixed_*) here as left and right margins are identical
        is_right = i.x0 >= (page_image.width / 2)
        page_index_offset = 1 if is_right else 0
        page_current = ((page - 1) * 2) + page_index_offset

        if text not in text_to_id_lang:
            raise ValueError(f'Text {text} not found in {text_to_id_lang}')
        name = text_to_id_lang[text]['name']
        lang = text_to_id_lang[text]['lang']

        if name not in texts:
            texts[name] = {'positions': [], 'translations': defaultdict(dict)}
        texts[name]['page'] = page_current
        texts[name]['align'] = id_to_align[name]

        # Store the text in its own script and the transliteration in the other.
        script = 'cyrillic' if lang == 'uk' else 'latin'
        translation = texts[name]['translations'][lang]
        translation[script] = text
        if script == 'cyrillic':
            translation['latin'] = uk_to_cs(text)
        else:
            translation['cyrillic'] = cs_to_uk(text)
        translation['align'] = text_to_id_lang[text]['align']

        # PDF coordinates grow upwards, so "top" is measured down from the
        # upper edge of the figure extended by the re-added top margin.
        top = (page_image.y1 + (to_readd_pct['top'] * page_image.height) - i.y1) / image_height
        left = (i.x0 - (page_image.x0 + (to_readd_pct['left'] * page_image.width / 2)) - (
            image_width if is_right else 0
        )) / image_width
        width = (i.x1 - i.x0) / image_width
        height = (i.y1 - i.y0) / image_height

        # cs is first, ua is second
        texts[name]['positions'].insert(0 if lang == 'cs' else 1, {
            'top': to_percent(top),
            'left': to_percent(left),
            'width': to_percent(width),
            'height': to_percent(height),
        })

        # Drawing rectangle outlines for debugging purposes
        if page_current not in images:
            images[page_current] = Image.new('RGBA', (round(image_width), round(image_height)))
            # Outline of the whole book page.
            ImageDraw(images[page_current]).rectangle((
                0,
                0,
                image_width - 1,
                image_height - 1,
            ), width=1, outline="#ffffff")
        # Outline of the current text box.
        ImageDraw(images[page_current]).rectangle((
            left * image_width,
            top * image_height,
            (left + width) * image_width,
            (top + height) * image_height,
        ), width=1, outline="#ffffff")
if __name__ == '__main__':
    # Folder that receives the exported JSON and the debug images.
    output_dir = Path(r'E:\code\book-for-ukraine\static') / 'in'

    # Lay out the pages of the book PDF and load the translation table.
    pdf_path = Path(r'E:\code\book-for-ukraine\static\in\UKAJINsKA KNIZKA_dvoustrany.pdf')
    layout_pages = extract_pages(pdf_path)
    text_lookup, align_lookup = load_translations()

    # Both dicts are filled in place by export_texts_from_pdf.
    collected_texts = {}
    debug_images = {}
    export_texts_from_pdf(layout_pages, text_lookup, align_lookup, collected_texts, debug_images)

    pprint(collected_texts)
    print(len(collected_texts))

    with open(output_dir / 'texts.json', 'w', encoding='utf8') as json_file:
        json.dump(collected_texts, json_file, ensure_ascii=False, sort_keys=True, indent=4)

    # One debug image per book page, named after the page index.
    for page_index, debug_image in debug_images.items():
        debug_image.save(output_dir / f'page_{page_index:02d}.png')