opendatalab · drunkpig · Aug 20, 2024 · Aug 19, 2024
diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -14,7 +14,7 @@ def split_long_words(text):
     for i in range(len(segments)):
         words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
         for j in range(len(words)):
-            if len(words[j]) > 15:
+            if len(words[j]) > 10:
                 words[j] = ' '.join(wordninja.split(words[j]))
         segments[i] = ''.join(words)
     return ' '.join(segments)
@@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 
 
 def merge_para_with_text(para_block):
+    def detect_language(text):  # coarse check: returns 'en', "unknown" or "empty" for `text`
+        en_pattern = r'[a-zA-Z]+'  # runs of ASCII letters only
+        en_matches = re.findall(en_pattern, text)
+        en_length = sum(len(match) for match in en_matches)  # total number of ASCII letters in text
+        if len(text) > 0:
+            if en_length / len(text) >= 0.5:  # denominator is every character, incl. spaces/digits/punctuation
+                return 'en'
+            else:
+                return "unknown"  # any non-empty text below the 50% ASCII-letter threshold
+        else:
+            return "empty"  # empty input; this guard also avoids dividing by zero above
+
     para_text = ''
     for line in para_block['lines']:
         line_text = ""
@@ -162,7 +174,8 @@ def merge_para_with_text(para_block):
             content = ''
             if span_type == ContentType.Text:
                 content = span['content']
-                language = detect_lang(content)
+                # language = detect_lang(content)
+                language = detect_language(content)
                 if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
                     content = ocr_escape_special_markdown_char(split_long_words(content))
                 else:
@@ -171,12 +184,12 @@ def merge_para_with_text(para_block):
                 content = f" ${span['content']}$ "
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
-
             if content != '':
-                if 'zh' in line_lang:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    para_text += content  # 中文语境下，content间不需要空格分隔
+                langs = ['zh', 'ja', 'ko']
+                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
+                    para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
                 else:
-                    para_text += content + ' '  # 英文语境下 content间需要空格分隔
+                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
     return para_text
 
 
@@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path):
                 elif span_type == ContentType.InlineEquation:
                     content = f"${span['content']}$"
                     inline_equation_num += 1
-
                 if language == 'en':  # 英文语境下 content间需要空格分隔
                     para_text += content + ' '
                 else:  # 中文语境下，content间不需要空格分隔