From 1807126e7fc97086cbaa96f171ea1ea3fae23937 Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 25 Oct 2024 19:03:03 +0800 Subject: [PATCH] refactor(ocr): adjust OCR processing parameters - Lower the Y-axis overlap threshold for merging spans into lines from 0.6 to 0.5 - Reduce the unclip ratio for OCR detection from 2.4 to 1.8 --- magic_pdf/model/pdf_extract_kit.py | 2 +- magic_pdf/pre_proc/ocr_dict_merge.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py index 99c701f..145b3e8 100644 --- a/magic_pdf/model/pdf_extract_kit.py +++ b/magic_pdf/model/pdf_extract_kit.py @@ -83,7 +83,7 @@ def doclayout_yolo_model_init(weight): return model -def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=2.4): +def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=1.8): if lang is not None: model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio) else: diff --git a/magic_pdf/pre_proc/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py index 6120e4a..1b55397 100644 --- a/magic_pdf/pre_proc/ocr_dict_merge.py +++ b/magic_pdf/pre_proc/ocr_dict_merge.py @@ -49,7 +49,7 @@ def merge_spans_to_line(spans): continue # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 - if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.6): + if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5): current_line.append(span) else: # 否则,开始新行