diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 2a9df99..91f8faf 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -71,7 +71,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, for block in para_block['blocks']: # 2nd.拼image_caption if block['type'] == BlockType.ImageCaption: para_text += merge_para_with_text(block) + ' \n' - for block in para_block['blocks']: # 2nd.拼image_caption + for block in para_block['blocks']: # 3rd.拼image_footnote if block['type'] == BlockType.ImageFootnote: para_text += merge_para_with_text(block) + ' \n' elif para_type == BlockType.Table: diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py index e8916f4..9703e13 100644 --- a/magic_pdf/libs/draw_bbox.py +++ b/magic_pdf/libs/draw_bbox.py @@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): layout_bbox_list = [] + table_type_order = { + 'table_caption': 1, + 'table_body': 2, + 'table_footnote': 3 + } for page in pdf_info: page_block_list = [] for block in page['para_blocks']: - bbox = block['bbox'] - page_block_list.append(bbox) + if block['type'] in [ + BlockType.Text, + BlockType.Title, + BlockType.InterlineEquation, + BlockType.List, + BlockType.Index, + ]: + bbox = block['bbox'] + page_block_list.append(bbox) + elif block['type'] in [BlockType.Image]: + for sub_block in block['blocks']: + bbox = sub_block['bbox'] + page_block_list.append(bbox) + elif block['type'] in [BlockType.Table]: + sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']]) + for sub_block in sorted_blocks: + bbox = sub_block['bbox'] + page_block_list.append(bbox) + layout_bbox_list.append(page_block_list) pdf_docs = fitz.open('pdf', pdf_bytes) @@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): for i, page in enumerate(pdf_docs): draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True) - draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color ! + # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color ! draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True) draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True) draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True) - draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True) + # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True) draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True) draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True), @@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): for page in pdf_info: page_line_list = [] for block in page['preproc_blocks']: - if block['type'] in ['text', 'title', 'interline_equation']: + if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]: for line in block['lines']: bbox = line['bbox'] index = line['index'] page_line_list.append({'index': index, 'bbox': bbox}) - if block['type'] in ['table', 'image']: - bbox = block['bbox'] - index = block['index'] - page_line_list.append({'index': index, 'bbox': bbox}) - # for line in block['lines']: - # bbox = line['bbox'] - # index = line['index'] - # page_line_list.append({'index': index, 'bbox': bbox}) + if block['type'] in [BlockType.Image, BlockType.Table]: + for sub_block in block['blocks']: + if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]: + for line in sub_block['virtual_lines']: + bbox = line['bbox'] + index = line['index'] + page_line_list.append({'index': index, 'bbox': bbox}) + elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]: + for line in sub_block['lines']: + bbox = line['bbox'] + index = line['index'] + page_line_list.append({'index': index, 'bbox': bbox}) sorted_bboxes = sorted(page_line_list, key=lambda x: x['index']) layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes) pdf_docs = fitz.open('pdf', pdf_bytes)