def detect_language(text) -> str:
    """Classify a text span as English based on its share of ASCII letters.

    Args:
        text: Span content to inspect. May be an empty string or ``None``.

    Returns:
        ``'en'`` when ASCII letters (``a-z`` / ``A-Z``) account for at least
        half of all characters in ``text``; ``'empty'`` when there is no text
        at all; ``'unknown'`` otherwise.

    Note:
        Every character counts towards the total, including whitespace,
        digits and punctuation, so a span such as ``"in 2020, 15%"`` is
        reported as ``'unknown'`` even though it is English prose.
    """
    # Guard clause: avoids dividing by zero and tolerates a missing (None)
    # content value instead of raising TypeError from len().
    if not text:
        return 'empty'

    # Sum the lengths of all maximal runs of ASCII letters.
    ascii_letter_count = sum(len(run) for run in re.findall(r'[a-zA-Z]+', text))

    # The 50% threshold is inclusive: exactly half letters still means 'en'.
    return 'en' if ascii_letter_count / len(text) >= 0.5 else 'unknown'
para_to_standard_format(para, img_buket_path): elif span_type == ContentType.InlineEquation: content = f"${span['content']}$" inline_equation_num += 1 - if language == 'en': # 英文语境下 content间需要空格分隔 para_text += content + ' ' else: # 中文语境下,content间不需要空格分隔