From 3ae82077d20edaa1f242dcec8ce9a81f1803bae4 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 15 Jul 2024 12:18:20 +0200 Subject: [PATCH 01/15] Add repo, absolute URLs --- README.md | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9e0e20..d716832 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ You can limit the CPU threads used by `docling` by setting the environment varia ## Contributing -Please read [Contributing to Docling](./CONTRIBUTING.md) for details. +Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details. ## References diff --git a/pyproject.toml b/pyproject.toml index a603ba1..5f0c489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,8 @@ description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" readme = "README.md" +repository = "https://github.com/DS4SD/docling" +homepage = "https://github.com/DS4SD/docling" keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"] classifiers = [ "License :: OSI Approved :: MIT License", From 5acb7b51cff07756e9a7f3dd2366f0b6f8c53a55 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:21:13 +0200 Subject: [PATCH 02/15] Optimizations for table extraction quality, configurable options for cell matching --- docling/datamodel/base_models.py | 28 +++++++++-- docling/models/table_structure_model.py | 54 ++++++++++++++++----- docling/pipeline/standard_model_pipeline.py | 2 +- examples/convert.py | 2 - 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index dd9795a..8b6796d 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,3 +1,4 @@ +import copy from enum import Enum, auto from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union @@ -47,6 +48,15 @@ def width(self): def height(self): return abs(self.t - self.b) + def scaled(self, scale: float) -> "BoundingBox": + out_bbox = copy.deepcopy(self) + out_bbox.l *= scale + out_bbox.r *= scale + out_bbox.t *= scale + out_bbox.b *= scale + + return out_bbox + def as_tuple(self): if self.coord_origin == CoordOrigin.TOPLEFT: return (self.l, self.t, self.r, self.b) @@ -180,8 +190,7 @@ class TableStructurePrediction(BaseModel): table_map: Dict[int, TableElement] = {} -class TextElement(BasePageElement): - ... +class TextElement(BasePageElement): ... class FigureData(BaseModel): @@ -242,6 +251,17 @@ class DocumentStream(BaseModel): stream: BytesIO +class TableStructureOptions(BaseModel): + do_cell_matching: bool = ( + True + # True: Matches predictions back to PDF cells. Can break table output if PDF cells + # are merged across table columns. + # False: Let table structure model define the text cells, ignore PDF cells. 
+ ) + + class PipelineOptions(BaseModel): - do_table_structure: bool = True - do_ocr: bool = False + do_table_structure: bool = True # True: perform table structure extraction + do_ocr: bool = False # True: perform OCR, replace programmatic PDF text + + table_structure_options: TableStructureOptions = TableStructureOptions() diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 8ee4bda..132b141 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,7 +1,10 @@ -from typing import Iterable +import copy +import random +from typing import Iterable, List import numpy from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor +from PIL import ImageDraw from docling.datamodel.base_models import ( BoundingBox, @@ -28,6 +31,21 @@ def __init__(self, config): self.tm_model_type = self.tm_config["model"]["type"] self.tf_predictor = TFPredictor(self.tm_config) + self.scale = 2.0 # Scale up table input images to 144 dpi + + def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): + image = page._backend.get_page_image() + draw = ImageDraw.Draw(image) + + for table_element in tbl_list: + x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="red") + + for tc in table_element.table_cells: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="blue") + + image.show() def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -36,16 +54,17 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: return for page in page_batch: + page.predictions.tablestructure = TableStructurePrediction() # dummy in_tables = [ ( cluster, [ - round(cluster.bbox.l), - round(cluster.bbox.t), - round(cluster.bbox.r), - round(cluster.bbox.b), + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, ], ) for cluster in page.predictions.layout.clusters @@ -65,20 +84,29 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: ): # Only allow non empty stings (spaces) into the cells of a table if len(c.text.strip()) > 0: - tokens.append(c.model_dump()) + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled(scale=self.scale) + + tokens.append(new_cell.model_dump()) - iocr_page = { - "image": numpy.asarray(page.image), + page_input = { "tokens": tokens, - "width": page.size.width, - "height": page.size.height, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, } + # add image to page input. 
+ if self.scale == 1.0: + page_input["image"] = numpy.asarray(page.image) + else: # render new page image on the fly at desired scale + page_input["image"] = numpy.asarray( + page._backend.get_page_image(scale=self.scale) + ) table_clusters, table_bboxes = zip(*in_tables) if len(table_bboxes): tf_output = self.tf_predictor.multi_table_predict( - iocr_page, table_bboxes, do_matching=self.do_cell_matching + page_input, table_bboxes, do_matching=self.do_cell_matching ) for table_cluster, table_out in zip(table_clusters, tf_output): @@ -91,6 +119,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) + tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) # Retrieving cols/rows, after post processing: @@ -111,4 +140,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: page.predictions.tablestructure.table_map[table_cluster.id] = tbl + # For debugging purposes: + # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) + yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index 07c0113..33fee75 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -34,7 +34,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): "artifacts_path": artifacts_path / StandardModelPipeline._table_model_path, "enabled": pipeline_options.do_table_structure, - "do_cell_matching": False, + "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, } ), ] diff --git a/examples/convert.py b/examples/convert.py index 89b3726..26a38c5 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -46,8 +46,6 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - # Path("/Users/cau/Downloads/Issue-36122.pdf"), - # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"), Path("./test/data/2206.01062.pdf"), Path("./test/data/2203.01017v2.pdf"), Path("./test/data/2305.03393v1.pdf"), From 78b154fde7d6307b111bf33d443d3dea935198ae Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 15 Jul 2024 12:18:20 +0200 Subject: [PATCH 03/15] Add repo, absolute URLs Signed-off-by: Christoph Auer --- README.md | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9e0e20..d716832 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ You can limit the CPU threads used by `docling` by setting the environment varia ## Contributing -Please read [Contributing to Docling](./CONTRIBUTING.md) for details. +Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details. 
## References diff --git a/pyproject.toml b/pyproject.toml index a603ba1..5f0c489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,8 @@ description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" readme = "README.md" +repository = "https://github.com/DS4SD/docling" +homepage = "https://github.com/DS4SD/docling" keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"] classifiers = [ "License :: OSI Approved :: MIT License", From 6c016001947d7bcdfb2c0f29e102e967aad75b61 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:21:13 +0200 Subject: [PATCH 04/15] Optimizations for table extraction quality, configurable options for cell matching Signed-off-by: Christoph Auer --- docling/datamodel/base_models.py | 28 +++++++++-- docling/models/table_structure_model.py | 54 ++++++++++++++++----- docling/pipeline/standard_model_pipeline.py | 2 +- examples/convert.py | 2 - 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index dd9795a..8b6796d 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,3 +1,4 @@ +import copy from enum import Enum, auto from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union @@ -47,6 +48,15 @@ def width(self): def height(self): return abs(self.t - self.b) + def scaled(self, scale: float) -> "BoundingBox": + out_bbox = copy.deepcopy(self) + out_bbox.l *= scale + out_bbox.r *= scale + out_bbox.t *= scale + out_bbox.b *= scale + + return out_bbox + def as_tuple(self): if self.coord_origin == CoordOrigin.TOPLEFT: return (self.l, self.t, self.r, self.b) @@ -180,8 +190,7 @@ class TableStructurePrediction(BaseModel): table_map: Dict[int, TableElement] = {} -class TextElement(BasePageElement): - ... +class TextElement(BasePageElement): ... class FigureData(BaseModel): @@ -242,6 +251,17 @@ class DocumentStream(BaseModel): stream: BytesIO +class TableStructureOptions(BaseModel): + do_cell_matching: bool = ( + True + # True: Matches predictions back to PDF cells. Can break table output if PDF cells + # are merged across table columns. + # False: Let table structure model define the text cells, ignore PDF cells. 
+ ) + + class PipelineOptions(BaseModel): - do_table_structure: bool = True - do_ocr: bool = False + do_table_structure: bool = True # True: perform table structure extraction + do_ocr: bool = False # True: perform OCR, replace programmatic PDF text + + table_structure_options: TableStructureOptions = TableStructureOptions() diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 8ee4bda..132b141 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,7 +1,10 @@ -from typing import Iterable +import copy +import random +from typing import Iterable, List import numpy from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor +from PIL import ImageDraw from docling.datamodel.base_models import ( BoundingBox, @@ -28,6 +31,21 @@ def __init__(self, config): self.tm_model_type = self.tm_config["model"]["type"] self.tf_predictor = TFPredictor(self.tm_config) + self.scale = 2.0 # Scale up table input images to 144 dpi + + def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): + image = page._backend.get_page_image() + draw = ImageDraw.Draw(image) + + for table_element in tbl_list: + x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="red") + + for tc in table_element.table_cells: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="blue") + + image.show() def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -36,16 +54,17 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: return for page in page_batch: + page.predictions.tablestructure = TableStructurePrediction() # dummy in_tables = [ ( cluster, [ - round(cluster.bbox.l), - round(cluster.bbox.t), - round(cluster.bbox.r), - round(cluster.bbox.b), + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, ], ) for cluster in page.predictions.layout.clusters @@ -65,20 +84,29 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: ): # Only allow non empty stings (spaces) into the cells of a table if len(c.text.strip()) > 0: - tokens.append(c.model_dump()) + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled(scale=self.scale) + + tokens.append(new_cell.model_dump()) - iocr_page = { - "image": numpy.asarray(page.image), + page_input = { "tokens": tokens, - "width": page.size.width, - "height": page.size.height, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, } + # add image to page input. 
+ if self.scale == 1.0: + page_input["image"] = numpy.asarray(page.image) + else: # render new page image on the fly at desired scale + page_input["image"] = numpy.asarray( + page._backend.get_page_image(scale=self.scale) + ) table_clusters, table_bboxes = zip(*in_tables) if len(table_bboxes): tf_output = self.tf_predictor.multi_table_predict( - iocr_page, table_bboxes, do_matching=self.do_cell_matching + page_input, table_bboxes, do_matching=self.do_cell_matching ) for table_cluster, table_out in zip(table_clusters, tf_output): @@ -91,6 +119,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) + tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) # Retrieving cols/rows, after post processing: @@ -111,4 +140,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: page.predictions.tablestructure.table_map[table_cluster.id] = tbl + # For debugging purposes: + # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) + yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index 07c0113..33fee75 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -34,7 +34,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): "artifacts_path": artifacts_path / StandardModelPipeline._table_model_path, "enabled": pipeline_options.do_table_structure, - "do_cell_matching": False, + "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, } ), ] diff --git a/examples/convert.py b/examples/convert.py index 89b3726..26a38c5 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -46,8 +46,6 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - # Path("/Users/cau/Downloads/Issue-36122.pdf"), - # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"), Path("./test/data/2206.01062.pdf"), Path("./test/data/2203.01017v2.pdf"), Path("./test/data/2305.03393v1.pdf"), From f652bad2d1e531fc24933356aea5a907836e1ca6 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:43:05 +0200 Subject: [PATCH 05/15] docs: Update links, add GH repository to metadata (#1) * Add repo, absolute URLs Signed-off-by: Christoph Auer * Bump version Signed-off-by: Christoph Auer --------- Signed-off-by: Christoph Auer Co-authored-by: Christoph Auer Signed-off-by: Christoph Auer --- README.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d716832..9c4217a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- Docling + Docling

# Docling diff --git a/pyproject.toml b/pyproject.toml index 5f0c489..133bb74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.0" +version = "0.1.1" description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From a5113eb78e7278794a8a9dbc0cdfabc7ca2aabd4 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Mon, 15 Jul 2024 14:59:53 +0200 Subject: [PATCH 06/15] doc: More documentation updates (#2) * Update README.md Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Update Dockerfile Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Bump version Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --------- Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Christoph Auer --- Dockerfile | 3 +-- README.md | 4 ++-- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index bef7be5..b2138a6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,8 +6,7 @@ RUN apt-get update \ && apt-get install -y libgl1 libglib2.0-0 curl wget git \ && apt-get clean -RUN --mount=type=ssh \ - pip install --no-cache-dir https://github.com/DS4SD/docling.git +RUN pip install --no-cache-dir docling ENV HF_HOME=/tmp/ ENV TORCH_HOME=/tmp/ diff --git a/README.md b/README.md index 9c4217a..1367cfd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # Docling -Dockling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package. +Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package. ## Features * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast @@ -30,7 +30,7 @@ poetry install ## Usage -For basic usage, see the [convert.py](examples/convert.py) example module. Run with: +For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. 
Run with: ``` python examples/convert.py diff --git a/pyproject.toml b/pyproject.toml index 133bb74..1883ae9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.1" +version = "0.1.2" description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From cf37ace24cdd22f029285353b44491f462c7bc70 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:05:04 +0200 Subject: [PATCH 07/15] ci: Add Github Actions (#4) * add Github Actions Signed-off-by: Michele Dolfi * apply styling Signed-off-by: Michele Dolfi * Update .github/actions/setup-poetry/action.yml Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> * add semantic-release config Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Christoph Auer --- .github/actions/setup-poetry/action.yml | 19 ++++++++ .github/scripts/release.sh | 39 ++++++++++++++++ .github/workflows/cd.yml | 59 +++++++++++++++++++++++++ .github/workflows/checks.yml | 16 +++++++ .github/workflows/ci.yml | 28 ++++++++++++ .github/workflows/pypi.yml | 21 +++++++++ .gitignore | 1 - pyproject.toml | 15 ++++++- 8 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 .github/actions/setup-poetry/action.yml create mode 100755 .github/scripts/release.sh create mode 100644 .github/workflows/cd.yml create mode 100644 .github/workflows/checks.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/pypi.yml diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml new file mode 100644 index 0000000..e9ce697 --- /dev/null +++ b/.github/actions/setup-poetry/action.yml @@ -0,0 +1,19 @@ +name: 'Set up Poetry and install' +description: 'Set up a specific version of Poetry and install dependencies using caching.' +inputs: + python-version: + description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax." + default: '3.11' +runs: + using: 'composite' + steps: + - name: Install poetry + run: pipx install poetry==1.8.3 + shell: bash + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + cache: 'poetry' + - name: Install dependencies + run: poetry install --all-extras + shell: bash diff --git a/.github/scripts/release.sh b/.github/scripts/release.sh new file mode 100755 index 0000000..6cac400 --- /dev/null +++ b/.github/scripts/release.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -e # trigger failure on error - do not remove! 
+set -x # display command on output + +if [ -z "${TARGET_VERSION}" ]; then + >&2 echo "No TARGET_VERSION specified" + exit 1 +fi +CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}" + +# update package version +poetry version "${TARGET_VERSION}" + +# collect release notes +REL_NOTES=$(mktemp) +poetry run semantic-release changelog --unreleased >> "${REL_NOTES}" + +# update changelog +TMP_CHGLOG=$(mktemp) +TARGET_TAG_NAME="v${TARGET_VERSION}" +RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}" +printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}" +cat "${REL_NOTES}" >> "${TMP_CHGLOG}" +if [ -f "${CHGLOG_FILE}" ]; then + printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}" +fi +mv "${TMP_CHGLOG}" "${CHGLOG_FILE}" + +# push changes +git config --global user.name 'github-actions[bot]' +git config --global user.email 'github-actions[bot]@users.noreply.github.com' +git add pyproject.toml "${CHGLOG_FILE}" +COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]" +git commit -m "${COMMIT_MSG}" +git push origin main + +# create GitHub release (incl. Git tag) +gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}" diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..b65c72c --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,59 @@ +name: "Run CD" + +on: + push: + branches: + - main + +env: + # disable keyring (https://github.com/actions/runner-images/issues/6185): + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + +jobs: + docs: + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + - name: Build and push docs + run: poetry run mkdocs gh-deploy --force + code-checks: + uses: ./.github/workflows/checks.yml + pre-release-check: + runs-on: ubuntu-latest + outputs: + TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # for fetching tags, required for semantic-release + - uses: ./.github/actions/setup-poetry + - name: Check version of potential release + id: version_check + run: | + TRGT_VERSION=$(poetry run semantic-release print-version) + echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT + echo "${TRGT_VERSION}" + - name: Check notes of potential release + run: poetry run semantic-release changelog --unreleased + release: + needs: [code-checks, pre-release-check] + if: needs.pre-release-check.outputs.TARGET_TAG_V != '' + environment: auto-release + runs-on: ubuntu-latest + concurrency: release + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.GH_PAT }} + fetch-depth: 0 # for fetching tags, required for semantic-release + - uses: ./.github/actions/setup-poetry + - name: Run release script + env: + GH_TOKEN: ${{ secrets.GH_PAT }} + TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }} + CHGLOG_FILE: CHANGELOG.md + run: ./.github/scripts/release.sh + shell: bash diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000..6e2ff36 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,16 @@ +on: + workflow_call: + +jobs: + run-checks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.11'] + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + with: + python-version: ${{ matrix.python-version }} + - name: Run styling check + run: poetry run pre-commit run --all-files diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..33fb37a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: "Run CI" + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + push: + branches: + - "**" + - "!main" + - "!gh-pages" + +env: + # disable keyring (https://github.com/actions/runner-images/issues/6185): + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + +jobs: + code-checks: + uses: ./.github/workflows/checks.yml + + # To enable when we add the ./docs + # build-docs: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - uses: ./.github/actions/setup-poetry + # - name: Build docs + # run: poetry run mkdocs build --verbose --clean + \ No newline at end of file diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..0d206b2 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,21 @@ +name: "Build and publish package" + +on: + release: + types: [published] + +permissions: + contents: read + +env: + # disable keyring (https://github.com/actions/runner-images/issues/6185): + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + +jobs: + build-and-publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + - name: Build and publish + run: poetry publish --build --no-interaction --username=__token__ --password=${{ secrets.PYPI_TOKEN }} diff --git a/.gitignore b/.gitignore index bb3e254..800f568 100644 --- a/.gitignore +++ b/.gitignore @@ -413,7 +413,6 @@ tags [Ll]ib [Ll]ib64 [Ll]ocal -[Ss]cripts pyvenv.cfg pip-selfcheck.json diff --git a/pyproject.toml b/pyproject.toml index 1883ae9..35dc361 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.2" +version = "0.1.2" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" @@ -72,3 +72,16 @@ python_version = "3.11" [tool.flake8] max-line-length = 88 extend-ignore = ["E203", "E501"] + +[tool.semantic_release] +# for default values check: +# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg + +version_source = "tag_only" +branch = "main" + +# configure types which should trigger minor and patch version bumps respectively +# (note that they must be a subset of the configured allowed types): +parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test" +parser_angular_minor_types = "feat" +parser_angular_patch_types = "fix,perf" From 1a7a07e931f60a5d5de5bbdaaa6669ba41af9f5d Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:14:44 +0200 Subject: [PATCH 08/15] disable docs build (#5) Signed-off-by: Christoph Auer --- .github/workflows/cd.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index b65c72c..0524898 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -10,15 +10,17 @@ env: PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring jobs: - docs: - permissions: - contents: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: ./.github/actions/setup-poetry - - name: Build and push docs - run: poetry run mkdocs gh-deploy 
--force + # To be enabled when we add docs + # docs: + # permissions: + # contents: write + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - uses: ./.github/actions/setup-poetry + # - name: Build and push docs + # run: poetry run mkdocs gh-deploy --force + code-checks: uses: ./.github/workflows/checks.yml pre-release-check: From 4e97a9ddfa822086f07dd82e5eb49215ccee6dad Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:34:42 +0200 Subject: [PATCH 09/15] feat: build with ci (#6) Signed-off-by: Michele Dolfi Signed-off-by: Christoph Auer From e4674852c202e7693086a101d76f321cf78bf754 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 11:37:14 +0000 Subject: [PATCH 10/15] chore: bump version to 0.2.0 [skip ci] Signed-off-by: Christoph Auer --- CHANGELOG.md | 5 +++++ pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..deb4568 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +## [v0.2.0](https://github.com/DS4SD/docling/releases/tag/v0.2.0) - 2024-07-16 + +### Feature + +* Build with ci ([#6](https://github.com/DS4SD/docling/issues/6)) ([`b1479cf`](https://github.com/DS4SD/docling/commit/b1479cf4ecf8a586703b31c7cf6917b3293c6a85)) diff --git a/pyproject.toml b/pyproject.toml index 35dc361..a648a70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.2" # DO NOT EDIT, updated automatically +version = "0.2.0" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From f3d65577a4a4e60775cc9e7b17fbaaba91a736d7 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 16 Jul 2024 14:15:09 +0200 Subject: [PATCH 11/15] docs: Add setup with pypi to Readme (#7) Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Christoph Auer --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1367cfd..f70c015 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,19 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co ## Setup -You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer). +For general usage, you can simply install `docling` through `pip` from the pypi package index. +``` +pip install docling +``` + +**Notes**: +* Works on macOS and Linux environments. Windows platforms are currently not tested. + +### Development setup -Once you have `poetry` installed, create an environment and install the package: +To develop for `docling`, you need Python 3.11 and `poetry`. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer). + +Once you have `poetry` installed and cloned this repo, create an environment and install `docling` from the repo root: ```bash poetry env use $(which python3.11) @@ -24,10 +34,6 @@ poetry shell poetry install ``` -**Notes**: -* Works on macOS and Linux environments. Windows platforms are currently not tested. - - ## Usage For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. 
Run with: From 5e26e8d6e360c913bd7329a37dece0822ec01be8 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Wed, 17 Jul 2024 14:03:26 +0200 Subject: [PATCH 12/15] feat: enable python 3.12 support by updating glm (#8) * update deepsearch-glm for python 3.12 support Signed-off-by: Michele Dolfi * enable python 3.12 in ci tests Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi Signed-off-by: Christoph Auer --- .github/workflows/checks.yml | 2 +- poetry.lock | 91 +++++++++++------------------------- pyproject.toml | 2 +- 3 files changed, 30 insertions(+), 65 deletions(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 6e2ff36..69a4178 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.11'] + python-version: ['3.11', '3.12'] steps: - uses: actions/checkout@v3 - uses: ./.github/actions/setup-poetry diff --git a/poetry.lock b/poetry.lock index 25ce193..b59f027 100644 --- a/poetry.lock +++ b/poetry.lock @@ -74,17 +74,6 @@ tests = ["attrs[tests-no-zope]", "zope-interface"] tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] -[[package]] -name = "bashlex" -version = "0.18" -description = "Python parser for bash" -optional = false -python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4" -files = [ - {file = "bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa"}, - {file = "bashlex-0.18.tar.gz", hash = "sha256:5bb03a01c6d5676338c36fd1028009c8ad07e7d61d8a1ce3f513b7fff52796ee"}, -] - [[package]] name = "black" version = "24.4.2" @@ -131,17 +120,6 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] -[[package]] -name = "bracex" -version = "2.4" -description = "Bash style brace expander." -optional = false -python-versions = ">=3.8" -files = [ - {file = "bracex-2.4-py3-none-any.whl", hash = "sha256:efdc71eff95eaff5e0f8cfebe7d01adf2c8637c8c92edaf63ef348c241a82418"}, - {file = "bracex-2.4.tar.gz", hash = "sha256:a27eaf1df42cf561fed58b7a8f3fdf129d1ea16a81e1fadd1d17989bc6384beb"}, -] - [[package]] name = "build" version = "1.2.1" @@ -371,32 +349,6 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] -[[package]] -name = "cibuildwheel" -version = "2.19.2" -description = "Build Python wheels on CI with minimal configuration." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "cibuildwheel-2.19.2-py3-none-any.whl", hash = "sha256:02ead5d7e3e81fe2ee0afb78746b1494af6b37afc1e32fae12f9c9a28c14e369"}, - {file = "cibuildwheel-2.19.2.tar.gz", hash = "sha256:d331c81c505106ee585333b871718cf0516ac10d55c4dda2c00c8a7405743cab"}, -] - -[package.dependencies] -bashlex = "!=0.13" -bracex = "*" -certifi = "*" -filelock = "*" -packaging = ">=20.9" -platformdirs = "*" - -[package.extras] -bin = ["click", "packaging (>=21.0)", "pip-tools", "pygithub", "pyyaml", "requests", "rich (>=9.6)"] -dev = ["build", "click", "jinja2", "packaging (>=21.0)", "pip-tools", "pygithub", "pytest (>=6)", "pytest-timeout", "pytest-xdist", "pyyaml", "requests", "rich (>=9.6)", "tomli-w", "validate-pyproject"] -docs = ["jinja2 (>=3.1.2)", "mkdocs (==1.3.1)", "mkdocs-include-markdown-plugin (==2.8.0)", "mkdocs-macros-plugin", "pymdown-extensions"] -test = ["build", "jinja2", "pytest (>=6)", "pytest-timeout", "pytest-xdist", "tomli-w", "validate-pyproject"] -uv = ["uv"] - [[package]] name = "cleo" version = "2.1.0" @@ -643,32 +595,45 @@ files = [ [[package]] name = "deepsearch-glm" -version = "0.18.4" +version = "0.19.0" description = "Graph Language Models" optional = false python-versions = "<4.0,>=3.8" files = [ - {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ad88c5bf3c203174ef81e0699405aec0f5386130cbc6a975b165f81887bc1a52"}, - {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:21d51a0671f0713d23be57030287a0f907f4a5f0627a45ea07e2caf54129a71a"}, - {file = "deepsearch_glm-0.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fc853941ea751a15f65e83f9bee9f988d0ecac4b28fac067b2aab49e15edb74"}, - {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cab5e577cf724343f2a5987ff4488c69e86a2dbca8cb0359c9243a07c6cd7d69"}, - {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:dda02391306d657a884b12f21cc3d1228663f940ec6001c833893dd2844bcc25"}, - {file = "deepsearch_glm-0.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dccd4286a93ee1a216acba27e1fc76f5d14e280d968998cfeae11a00ad1b6cb"}, - {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:cf38368bc72eab673459ea0fc96c02b1f3ae120df2d9443e1a63e010764ac1e9"}, - {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:d3fd83ea3b2bce11bac1d710f12547728f4dd48bfaa8bd472366ef144469d52c"}, - {file = "deepsearch_glm-0.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fb4bfd43ac3b996cdd151c35e94fa399953ee3952d7e86390a825880ece95f3"}, - {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:57cb67e435cacb6c4a6b6a9109d943267c493ebbba252a88ca40909976f60225"}, - {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:edc399939b6464f96600d2f23796ae2641d668fb794b77199e87abdef77f8853"}, - {file = "deepsearch_glm-0.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00ad8d932e7f0d1be4fd99fc0d4c8d50cb1ff10764f146b6ecb310a1379123d4"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:d420c7eb4e27b64cdc33c0beba159147fc4be14e141133f0f6ef080465b2529c"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:8af4583ea6d914e87d6db96cae1d73272af6fe85193e67406f0c700064e794c2"}, + {file = 
"deepsearch_glm-0.19.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:772e6e245b4d77d9df84af07693f9c19bc2f3dc6de4cb44deaf5fdd4a6c8e68d"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:6a0c29f8cf8a1ee392c68985f8952a01b43dd8f2c5a1476b890f2c90d7ecbc96"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb4f34bb5e45df2790eb6bbaf5caa625393d903da502b086de65df9ce4e3fff2"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:320195914e96b8197e53665594c4480b86f3fc4cacd5e6782befb2bb94494a40"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:7221851c304ef364a13eeffa940a7c15592e9d5b0050b97904221a65be33f3ab"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:b9b9e7102cf4355be8458569c7a44133b54446ae623923772db6942ce0fb2e87"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:fa8d8d718149cfecd724a0eca246a3bd57588dffb757f204b629a35623d8f946"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae251cf69b43d945fbf2cd41a89ba12312cd319a1c28d41c99d35cc476376b5"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:ad9a1fbf76e2561bc37e238ee9dd320b4b9cd49e61c55613e3977eedadee52bc"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:2030aec8ce751927fe20ca1788e125e9b0c37f994c30062e59c4d7b7a87cbb64"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:e57611b5d6cc2da91901e4b39fab6c9131dffe8766f43c20093bff75a0039100"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:efbcf157cd6bd2dd6138312cef5df378598fd67e6c3f6f0b63ed3342c1de7f49"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9febce49f365fcc5ada1ded720d387c51328ee470d4fcc89044b0684e074e699"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_13_6_arm64.whl", hash = "sha256:04a29ba5e942f32659ae1a65cfe5e90e93d50e058d53b4763fe13df93f30492f"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_13_6_x86_64.whl", hash = "sha256:a0024f42d6711f574dcab52ef2914a55f31b4fd804d3ad20ca7f211498e8a19b"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:513e5f1de14f0b12c916a52118083094a9ced439e4800d3442b2dd04f3cdbead"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:6d3dd07a549b8cd4408308b0b6b8ca65397ce7e8c819d050d8b2deb03cd1977e"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7be341a85ce1ff164963a6d58b50955599dc33b34040975c972a798ae0f6f12c"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:b6c3c0d1295666a8a68f76262c020ffdc6de64cdd95671bf24c0592fa1317533"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:cf290fe3824bd0de01b7c1d681aa14c89c5e60c6735fa471e04a985e55aead44"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:92943c495646660aef99ba64a7e3b77ffeca4866e96044f8be5e14dfa7ee660e"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:a7c89d6fae4ed9dc960f9ee9734e91d321222080bf439e1d89e8c67270afc282"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:fb6398a34f5afac6282c4a5b7ea5a89f27fcf4c0adac43af27ecbac9e2731ce3"}, ] [package.dependencies] -cibuildwheel = ">=2.17.0,<3.0.0" deepsearch-toolkit = ">=0.31.0" matplotlib = ">=3.7.1,<4.0.0" networkx = ">=3.1,<4.0" netwulf = ">=0.1.5,<0.2.0" numerize = ">=0.12,<0.13" +numpy = {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\""} pandas = ">=1.5.1" pybind11 = ">=2.10.4,<3.0.0" python-dotenv = ">=1.0.0,<2.0.0" @@ -4862,4 +4827,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "1f0c0eb64cdce7e5c21670841d5dea047b3f918a3041bb22bb9c615b9085da20" +content-hash = "dc19329559f190dfe687b4ee272eb6dac66b3d9fe0398c95c2572e8c63fa23ac" diff --git a/pyproject.toml b/pyproject.toml index a648a70..03e2b1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ python = "^3.11" pydantic = "^2.0.0" docling-core = "^0.2.0" docling-ibm-models = "^0.2.0" -deepsearch-glm = ">=0.18.4,<1" +deepsearch-glm = ">=0.19.0,<1" deepsearch-toolkit = ">=0.47.0,<1" filetype = "^1.2.0" pypdfium2 = "^4.30.0" From 86c2a7fc1e9906443f0a8eccd6828e88fe09179b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jul 2024 12:11:15 +0000 Subject: [PATCH 13/15] chore: bump version to 0.3.0 [skip ci] Signed-off-by: Christoph Auer --- CHANGELOG.md | 10 ++++++++++ pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index deb4568..75b26cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## [v0.3.0](https://github.com/DS4SD/docling/releases/tag/v0.3.0) - 2024-07-17 + +### Feature + +* Enable python 3.12 support by updating glm ([#8](https://github.com/DS4SD/docling/issues/8)) ([`fb72688`](https://github.com/DS4SD/docling/commit/fb72688ff7413083c864fe62d2dbfc420c1e5268)) + +### Documentation + +* Add setup with pypi to Readme ([#7](https://github.com/DS4SD/docling/issues/7)) ([`2803222`](https://github.com/DS4SD/docling/commit/2803222ee1708481c779d435dbf1c031929d3cf6)) + ## [v0.2.0](https://github.com/DS4SD/docling/releases/tag/v0.2.0) - 2024-07-16 ### Feature diff --git a/pyproject.toml b/pyproject.toml index 03e2b1b..7a39ac3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.2.0" # DO NOT EDIT, updated automatically +version = "0.3.0" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From 32905ab9591417458f6904075625a13c63db1e92 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:38:16 +0200 Subject: [PATCH 14/15] Add documentation Signed-off-by: Christoph Auer --- README.md | 21 ++++++++++++++++++++- docling/models/page_assemble_model.py | 12 ------------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f70c015..b89b03d 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,9 @@ python examples/convert.py ``` The output of the above command will be written to `./scratch`. 
-### Enable or disable pipeline features +### Adjust pipeline features + +**Control pipeline options** You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter` ```python @@ -54,6 +56,23 @@ doc_converter = DocumentConverter( ) ``` +**Control table extraction options** + +You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself. +This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one. + + +```python + +pipeline_options = PipelineOptions(do_table_structure=True) +pipeline_options.table_structure_options.do_cell_matching = True + +doc_converter = DocumentConverter( + artifacts_path=artifacts_path, + pipeline_options=pipeline_options, # Controls if OCR is applied (ignores programmatic content) +) +``` + ### Impose limits on the document size You can limit the file size and number of pages which should be allowed to process per document. diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4ed0832..2b9db54 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -19,18 +19,6 @@ class PageAssembleModel: def __init__(self, config): self.config = config - # self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)') - - # def sanitize_text_poor(self, lines): - # text = '\n'.join(lines) - # - # # treat line wraps. - # sanitized_text = self.line_wrap_pattern.sub('', text) - # - # sanitized_text = sanitized_text.replace('\n', ' ') - # - # return sanitized_text - def sanitize_text(self, lines): if len(lines) <= 1: return " ".join(lines) From 60302c312ce0b35ff2dfac219dc8546fa1a8fe21 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:51:28 +0200 Subject: [PATCH 15/15] Documentation improvements Signed-off-by: Christoph Auer --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bcbe7ac..470936a 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,11 @@ This can improve output quality if you find that multiple columns in extracted t ```python pipeline_options = PipelineOptions(do_table_structure=True) -pipeline_options.table_structure_options.do_cell_matching = True +pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model doc_converter = DocumentConverter( artifacts_path=artifacts_path, - pipeline_options=pipeline_options, # Controls if OCR is applied (ignores programmatic content) + pipeline_options=pipeline_options, ) ```
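
Note on the table-quality patches (02/04) above: TableFormer now runs on a 2x (~144 dpi) rendering of the page, so table bounding boxes and PDF cell tokens are scaled up with the new `BoundingBox.scaled()` helper before prediction, and the predicted cells are scaled back with `1 / self.scale`. The sketch below only illustrates that round trip; it is not part of the patch set, and it assumes `BoundingBox` and `CoordOrigin` are importable from `docling.datamodel.base_models` as shown in the diff and that `BoundingBox` can be constructed directly from its `l`/`t`/`r`/`b` fields.

```python
# Illustrative sketch only (not part of the patches): the scale-up / scale-down
# round trip used by TableStructureModel above. Import path and constructor
# usage are assumptions based on the diff, not verified against the package.
from docling.datamodel.base_models import BoundingBox, CoordOrigin

scale = 2.0  # mirrors self.scale = 2.0 ("144 dpi") in table_structure_model.py

# A PDF text cell in original page coordinates.
pdf_cell_bbox = BoundingBox(l=10.0, t=20.0, r=110.0, b=40.0, coord_origin=CoordOrigin.TOPLEFT)

hi_res_bbox = pdf_cell_bbox.scaled(scale)   # passed to the model together with the 2x page image
restored = hi_res_bbox.scaled(1 / scale)    # how predicted cells are mapped back to PDF space

assert hi_res_bbox.width == 2 * pdf_cell_bbox.width
assert (restored.l, restored.t, restored.r, restored.b) == (10.0, 20.0, 110.0, 40.0)
```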