From 5468e56fba0cc7284c4e5bb741ab3546504761b4 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 8 Nov 2024 11:38:47 +0800 Subject: [PATCH 1/2] refactor(pdf_parse): adjust line count limit for layoutreader - Decrease the maximum line count from 512 to 316 for layoutreader --- magic_pdf/pdf_parse_union_core_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 807a3ecf..8b5e50f2 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -298,7 +298,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): block['lines'].append({'bbox': line, 'spans': []}) page_line_list.extend(lines) - if len(page_line_list) > 512: # layoutreader最高支持512line + if len(page_line_list) > 316: # layoutreader最高支持512line return None # 使用layoutreader排序 From 5936684fd86003699ea9c492fd588c87ca64b20f Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 8 Nov 2024 12:06:15 +0800 Subject: [PATCH 2/2] refactor(pdf_parse): adjust line count threshold for layoutreader - Lower the line count threshold from 316 to 200 to ensure compatibility - This change aims to prevent potential issues with layoutreader's maximum line support --- magic_pdf/pdf_parse_union_core_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 8b5e50f2..0cd1ed04 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -298,7 +298,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): block['lines'].append({'bbox': line, 'spans': []}) page_line_list.extend(lines) - if len(page_line_list) > 316: # layoutreader最高支持512line + if len(page_line_list) > 200: # layoutreader最高支持512line return None # 使用layoutreader排序