opendatalab · strongerfly · Aug 26, 2024 · Aug 26, 2024 · Sep 2, 2024 · Sep 2, 2024
diff --git a/README.md b/README.md
@@ -30,6 +30,7 @@
 </div>
 
 # Changelog
+- 2024/08/30: Version 0.7.1 released, added Paddle TableMaster as a table recognition option
 - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
 - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
 - 2024/07/05: Initial open-source release
@@ -157,7 +158,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
     <tr>
         <td colspan="2">Recommended Configuration 16G+ VRAM</td>
         <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
-        16G or more can enable layout, formula recognition, and OCR acceleration simultaneously</td>
+        16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
+        24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
+        </td>
     </tr>
 </table>
 
@@ -171,7 +174,7 @@ In non-mainline environments, due to the diversity of hardware and software conf
 ```bash
 conda create -n MinerU python=3.10
 conda activate MinerU
-pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
 ```
 #### 2. Download model weight files
 
@@ -200,6 +203,7 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
   // other config
   "models-dir": "D:/models",
   "table-config": {
+        "model": "TableMaster", // Table recognition model to use; the alternative value is "struct_eqtable"
         "is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
         "max_time": 400
     }
@@ -311,13 +315,7 @@ TODO
 - Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
 - Enabling OCR may produce better results in PDFs with a high density of formulas
 - If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
-- **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
 
-| Table Size     | Parsing Time        | 
-|---------------|----------------------------| 
-| 6\*5 55kb     | 37s                   | 
-| 16\*12 284kb  | 3m18s                 | 
-| 44\*7 559kb   | 4m12s                 | 
 
 # FAQ
 [FAQ in Chinese](docs/FAQ_zh_cn.md)

diff --git a/README_ja-JP.md b/README_ja-JP.md
@@ -116,13 +116,13 @@ pip install detectron2 --extra-index-url https://wheels.myhloli.com
 >CUDA/MPSによる加速については、[CUDAまたはMPSによる加速](#4-CUDAまたはMPSによる加速)を参照してください。
 
 ```bash
-pip install magic-pdf[full]==0.6.2b1
+pip install -U magic-pdf[full]
 ```
 
 > ❗️❗️❗️
 > 私たちは0.6.2 ベータ版を事前にリリースし、私たちのログに記載されている多くの問題に対処しました。しかし、このビルドはまだ完全なQAテストを経ておらず、最終的なリリース品質を表していません。問題に遭遇した場合は、問題を通じて速やかに報告するか、0.6.1バージョンに戻ることをお願いします。
 > ```bash
-> pip install magic-pdf[full-cpu]==0.6.1
+> pip install -U magic-pdf[full]
 > ```
 
 

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -33,6 +33,7 @@
 
 
 # 更新记录
+- 2024/08/30 0.7.1发布，集成了paddle tablemaster表格识别功能
 - 2024/08/09 0.7.0b1发布，简化安装步骤提升易用性，加入表格识别功能
 - 2024/08/01 0.6.2b1发布，优化了依赖冲突问题和安装文档
 - 2024/07/05 首次开源
@@ -163,7 +164,9 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
     <tr>
         <td colspan="2">推荐配置 16G+显存</td>
         <td colspan="2">3090/3090ti/4070tisuper/4080/4090<br>
-        16G及以上可以同时开启layout，公式识别和ocr加速</td>
+        16G及以上可以同时开启layout，公式识别和ocr加速<br>
+        24G及以上可以同时开启layout，公式识别，ocr加速和表格识别
+        </td>
     </tr>
 </table>
 
@@ -179,7 +182,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 ```bash
 conda create -n MinerU python=3.10
 conda activate MinerU
-pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 #### 2. 下载模型权重文件
 
@@ -208,6 +211,7 @@ cp magic-pdf.template.json ~/magic-pdf.json
   // other config
   "models-dir": "D:/models",
   "table-config": {
+        "model": "TableMaster", // 使用structEqTable请修改为'struct_eqtable'
         "is_table_recog_enable": false, // 表格识别功能默认是关闭的，如果需要修改此处的值
         "max_time": 400
     }
@@ -321,14 +325,6 @@ TODO
 - 漫画书、艺术图册、小学教材、习题尚不能很好解析
 - 在一些公式密集的PDF上强制启用OCR效果会更好
 - 如果您要处理包含大量公式的pdf,强烈建议开启OCR功能。使用pymuPDF提取文字的时候会出现文本行互相重叠的情况导致公式插入位置不准确。
-- **表格识别**目前处于测试阶段，识别速度较慢，识别准确度有待提升。以下是我们在Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090环境下的一些性能测试结果，可供参考。
-
-| 表格大小     | 解析耗时        | 
-|---------------|----------------------------| 
-| 6\*5 55kb     | 37s                   | 
-| 16\*12 284kb  | 3m18s                 | 
-| 44\*7 559kb   | 4m12s                 | 
-
 
 
 # FAQ

diff --git a/demo/magic_pdf_parse_main.py b/demo/magic_pdf_parse_main.py
@@ -67,10 +67,11 @@ def pdf_parse_main(
         pdf_name = os.path.basename(pdf_path).split(".")[0]
         pdf_path_parent = os.path.dirname(pdf_path)
 
+        # Strip leading/trailing whitespace from the PDF name before building the output path
         if output_dir:
-            output_path = os.path.join(output_dir, pdf_name)
+            output_path = os.path.join(output_dir, pdf_name.strip())
         else:
-            output_path = os.path.join(pdf_path_parent, pdf_name)
+            output_path = os.path.join(pdf_path_parent, pdf_name.strip())
 
         output_image_path = os.path.join(output_path, 'images')
 

diff --git a/docs/README_Ubuntu_CUDA_Acceleration_en_US.md b/docs/README_Ubuntu_CUDA_Acceleration_en_US.md
@@ -48,7 +48,7 @@
 
 ### 5. Install Applications
    ```sh
-   pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
    ```
 ❗ After installation, make sure to check the version of `magic-pdf` using the following command:
    ```sh

diff --git a/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md b/docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md
@@ -43,7 +43,7 @@ conda activate MinerU
 ```
 ## 5. 安装应用
 ```bash
-pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 > ❗️下载完成后，务必通过以下命令确认magic-pdf的版本是否正确
 > 

diff --git a/docs/README_Windows_CUDA_Acceleration_en_US.md b/docs/README_Windows_CUDA_Acceleration_en_US.md
@@ -19,7 +19,7 @@ Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86
 
 ### 4. Install Applications
    ```
-   pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
    ```
    >❗️After installation, verify the version of `magic-pdf`:
    >  ```bash

diff --git a/docs/README_Windows_CUDA_Acceleration_zh_CN.md b/docs/README_Windows_CUDA_Acceleration_zh_CN.md
@@ -20,7 +20,7 @@ conda activate MinerU
 ```
 ## 4. 安装应用
 ```bash
-pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 > ❗️下载完成后，务必通过以下命令确认magic-pdf的版本是否正确
 > 

diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md
@@ -44,6 +44,21 @@ The structure of the model folder is as follows, including configuration files a
 │       ├── spiece.model
 │       ├── tokenizer.json
 │       └── tokenizer_config.json 
+│   └── TableMaster
+│       ├── ch_PP-OCRv3_det_infer
+│       │   ├── inference.pdiparams
+│       │   ├── inference.pdiparams.info
+│       │   └── inference.pdmodel
+│       ├── ch_PP-OCRv3_rec_infer
+│       │   ├── inference.pdiparams
+│       │   ├── inference.pdiparams.info
+│       │   └── inference.pdmodel
+│       ├── table_structure_tablemaster_infer
+│       │   ├── inference.pdiparams
+│       │   ├── inference.pdiparams.info
+│       │   └── inference.pdmodel
+│       ├── ppocr_keys_v1.txt
+│       └── table_master_structure_dict.txt
 └── README.md
 ```
 #### 2. Check whether the model file is fully downloaded.

diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md
@@ -74,6 +74,21 @@ print(f"模型文件下载路径为：{model_dir}/models")
 │       ├── spiece.model
 │       ├── tokenizer.json
 │       └── tokenizer_config.json 
+│   └── TableMaster
+│       ├── ch_PP-OCRv3_det_infer
+│       │   ├── inference.pdiparams
+│       │   ├── inference.pdiparams.info
+│       │   └── inference.pdmodel
+│       ├── ch_PP-OCRv3_rec_infer
+│       │   ├── inference.pdiparams
+│       │   ├── inference.pdiparams.info
+│       │   └── inference.pdmodel
+│       ├── table_structure_tablemaster_infer
+│       │   ├── inference.pdiparams
+│       │   ├── inference.pdiparams.info
+│       │   └── inference.pdmodel
+│       ├── ppocr_keys_v1.txt
+│       └── table_master_structure_dict.txt
 └── README.md
 ```
 

diff --git a/magic-pdf.template.json b/magic-pdf.template.json
@@ -6,6 +6,7 @@
     "models-dir":"/tmp/models",
     "device-mode":"cpu",
     "table-config": {
+        "model": "TableMaster",
         "is_table_recog_enable": false,
         "max_time": 400
     }

diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -132,6 +132,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                                     # if processed by table model
                                     if span.get('latex', ''):
                                         para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    elif span.get('html', ''):
+                                        para_text += f"\n\n{span['html']}\n\n"
                                     else:
                                         para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
@@ -256,6 +258,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
             if block['type'] == BlockType.TableBody:
                 if block["lines"][0]["spans"][0].get('latex', ''):
                     para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
+                elif block["lines"][0]["spans"][0].get('html', ''):
+                    para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
                 para_content['table_caption'] = merge_para_with_text(block)

diff --git a/magic_pdf/libs/Constants.py b/magic_pdf/libs/Constants.py
@@ -10,5 +10,31 @@
 # block中lines是否被删除
 LINES_DELETED = "lines_deleted"
 
+# struct eqtable
+STRUCT_EQTABLE = "struct_eqtable"
+
 # table recognition max time default value
-TABLE_MAX_TIME_VALUE = 400
+TABLE_MAX_TIME_VALUE = 400
+
+# pp_table_result_max_length
+TABLE_MAX_LEN = 480
+
+# pp table structure algorithm
+TABLE_MASTER = "TableMaster"
+
+# table master structure dict
+TABLE_MASTER_DICT = "table_master_structure_dict.txt"
+
+# table master dir
+TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
+
+# pp detect model dir
+DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
+
+# pp rec model dir
+REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
+
+# pp rec char dict path
+REC_CHAR_DICT = "ppocr_keys_v1.txt"
+
+
diff --git a/magic_pdf/libs/version.py b/magic_pdf/libs/version.py
@@ -1 +1 @@
-__version__ = "0.7.0b1"
+__version__ = "0.7.1"
diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py
@@ -562,8 +562,11 @@ def remove_duplicate_spans(spans):
                 elif category_id == 5:
                     # 获取table模型结果
                     latex = layout_det.get("latex", None)
+                    html = layout_det.get("html", None)
                     if latex:
                         span["latex"] = latex
+                    elif html:
+                        span["html"] = html
                     span["type"] = ContentType.Table
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]

diff --git a/magic_pdf/model/pdf_extract_kit.py b/magic_pdf/model/pdf_extract_kit.py
@@ -2,7 +2,7 @@
 import os
 import time
 
-from magic_pdf.libs.Constants import TABLE_MAX_TIME_VALUE
+from magic_pdf.libs.Constants import *
 
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
@@ -34,10 +34,18 @@
 from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
-
-
-def table_model_init(model_path, max_time, _device_='cpu'):
-    table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+from magic_pdf.model.ppTableModel import ppTableModel
+
+
+def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
+    if table_model_type == STRUCT_EQTABLE:
+        table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+    else:
+        config = {
+            "model_dir": model_path,
+            "device": _device_
+        }
+        table_model = ppTableModel(config)
     return table_model
 
 
@@ -104,9 +112,11 @@ def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
         # 初始化解析配置
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        # table config
         self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
         self.apply_table = self.table_config.get("is_table_recog_enable", False)
         self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
+        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
         self.apply_ocr = ocr
         logger.info(
             "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
@@ -141,10 +151,11 @@ def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
         if self.apply_ocr:
             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
 
-        # init structeqtable
+        # init table model
         if self.apply_table:
-            self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
-                                                max_time = self.table_max_time, _device_=self.device)
+            table_model_dir = self.configs["weights"][self.table_model_type]
+            self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
+                                                max_time=self.table_max_time, _device_=self.device)
         logger.info('DocAnalysis init done!')
 
     def __call__(self, image):
@@ -278,16 +289,28 @@ def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
                 new_image, _ = crop_img(res, pil_img)
                 single_table_start_time = time.time()
                 logger.info("------------------table recognition processing begins-----------------")
+                latex_code = None
+                html_code = None
                 with torch.no_grad():
-                    latex_code = self.table_model.image2latex(new_image)[0]
+                    if self.table_model_type == STRUCT_EQTABLE:
+                        latex_code = self.table_model.image2latex(new_image)[0]
+                    else:
+                        html_code = self.table_model.img2html(new_image)
                 run_time = time.time() - single_table_start_time
                 logger.info(f"------------table recognition processing ends within {run_time}s-----")
                 if run_time > self.table_max_time:
                     logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
                 # 判断是否返回正常
-                expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
-                if latex_code and expected_ending:
-                    res["latex"] = latex_code
+
+                if latex_code:
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
+                        'end{table}')
+                    if expected_ending:
+                        res["latex"] = latex_code
+                    else:
+                        logger.warning(f"------------table recognition processing fails----------")
+                elif html_code:
+                    res["html"] = html_code
                 else:
                     logger.warning(f"------------table recognition processing fails----------")
             table_cost = round(time.time() - table_start, 2)

diff --git a/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py b/magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py
@@ -12,7 +12,6 @@ def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'
             self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
 
     def image2latex(self, image) -> str:
-        #
         table_latex = self.model.forward(image)
         return table_latex