Skip to content

Commit

Permalink
Merge pull request #1047 from myhloli/dev
Browse files (browse the repository at this point in the history)
fix(ocr_mkcontent): improve hyphen handling at line ends
  • Loading branch information
myhloli authored Nov 21, 2024
2 parents a8ea5d4 + a07007e commit 23c8436
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -141,9 +141,10 @@ def merge_para_with_text(para_block):
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()

if line_text != '':
line_lang = detect_lang(line_text)
for span in line['spans']:
for j, span in enumerate(line['spans']):

span_type = span['type']
content = ''
Expand All @@ -164,8 +165,8 @@ def merge_para_with_text(para_block):
para_text += f' {content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果是前一行带有-连字符,那么末尾不应该加空格
if __is_hyphen_at_line_end(content):
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
para_text += content[:-1]
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
para_text += content
Expand Down

0 comments on commit 23c8436

Please sign in to comment.