From 3ae82077d20edaa1f242dcec8ce9a81f1803bae4 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 15 Jul 2024 12:18:20 +0200 Subject: [PATCH 01/15] Add repo, absolute URLs --- README.md | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9e0e20..d716832 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ You can limit the CPU threads used by `docling` by setting the environment varia ## Contributing -Please read [Contributing to Docling](./CONTRIBUTING.md) for details. +Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details. ## References diff --git a/pyproject.toml b/pyproject.toml index a603ba1..5f0c489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,8 @@ description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" readme = "README.md" +repository = "https://github.com/DS4SD/docling" +homepage = "https://github.com/DS4SD/docling" keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"] classifiers = [ "License :: OSI Approved :: MIT License", From 5acb7b51cff07756e9a7f3dd2366f0b6f8c53a55 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:21:13 +0200 Subject: [PATCH 02/15] Optimizations for table extraction quality, configurable options for cell matching --- docling/datamodel/base_models.py | 28 +++++++++-- docling/models/table_structure_model.py | 54 ++++++++++++++++----- docling/pipeline/standard_model_pipeline.py | 2 +- examples/convert.py | 2 - 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index dd9795a..8b6796d 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,3 +1,4 @@ +import copy from enum import Enum, auto from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union @@ -47,6 +48,15 @@ def width(self): def height(self): return abs(self.t - self.b) + def scaled(self, scale: float) -> "BoundingBox": + out_bbox = copy.deepcopy(self) + out_bbox.l *= scale + out_bbox.r *= scale + out_bbox.t *= scale + out_bbox.b *= scale + + return out_bbox + def as_tuple(self): if self.coord_origin == CoordOrigin.TOPLEFT: return (self.l, self.t, self.r, self.b) @@ -180,8 +190,7 @@ class TableStructurePrediction(BaseModel): table_map: Dict[int, TableElement] = {} -class TextElement(BasePageElement): - ... +class TextElement(BasePageElement): ... class FigureData(BaseModel): @@ -242,6 +251,17 @@ class DocumentStream(BaseModel): stream: BytesIO +class TableStructureOptions(BaseModel): + do_cell_matching: bool = ( + True + # True: Matches predictions back to PDF cells. Can break table output if PDF cells + # are merged across table columns. + # False: Let table structure model define the text cells, ignore PDF cells. 
+ ) + + class PipelineOptions(BaseModel): - do_table_structure: bool = True - do_ocr: bool = False + do_table_structure: bool = True # True: perform table structure extraction + do_ocr: bool = False # True: perform OCR, replace programmatic PDF text + + table_structure_options: TableStructureOptions = TableStructureOptions() diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 8ee4bda..132b141 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,7 +1,10 @@ -from typing import Iterable +import copy +import random +from typing import Iterable, List import numpy from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor +from PIL import ImageDraw from docling.datamodel.base_models import ( BoundingBox, @@ -28,6 +31,21 @@ def __init__(self, config): self.tm_model_type = self.tm_config["model"]["type"] self.tf_predictor = TFPredictor(self.tm_config) + self.scale = 2.0 # Scale up table input images to 144 dpi + + def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): + image = page._backend.get_page_image() + draw = ImageDraw.Draw(image) + + for table_element in tbl_list: + x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="red") + + for tc in table_element.table_cells: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="blue") + + image.show() def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -36,16 +54,17 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: return for page in page_batch: + page.predictions.tablestructure = TableStructurePrediction() # dummy in_tables = [ ( cluster, [ - round(cluster.bbox.l), - round(cluster.bbox.t), - round(cluster.bbox.r), - round(cluster.bbox.b), + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, ], ) for cluster in page.predictions.layout.clusters @@ -65,20 +84,29 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: ): # Only allow non empty stings (spaces) into the cells of a table if len(c.text.strip()) > 0: - tokens.append(c.model_dump()) + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled(scale=self.scale) + + tokens.append(new_cell.model_dump()) - iocr_page = { - "image": numpy.asarray(page.image), + page_input = { "tokens": tokens, - "width": page.size.width, - "height": page.size.height, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, } + # add image to page input. 
+ if self.scale == 1.0: + page_input["image"] = numpy.asarray(page.image) + else: # render new page image on the fly at desired scale + page_input["image"] = numpy.asarray( + page._backend.get_page_image(scale=self.scale) + ) table_clusters, table_bboxes = zip(*in_tables) if len(table_bboxes): tf_output = self.tf_predictor.multi_table_predict( - iocr_page, table_bboxes, do_matching=self.do_cell_matching + page_input, table_bboxes, do_matching=self.do_cell_matching ) for table_cluster, table_out in zip(table_clusters, tf_output): @@ -91,6 +119,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) + tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) # Retrieving cols/rows, after post processing: @@ -111,4 +140,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: page.predictions.tablestructure.table_map[table_cluster.id] = tbl + # For debugging purposes: + # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) + yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index 07c0113..33fee75 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -34,7 +34,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): "artifacts_path": artifacts_path / StandardModelPipeline._table_model_path, "enabled": pipeline_options.do_table_structure, - "do_cell_matching": False, + "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, } ), ] diff --git a/examples/convert.py b/examples/convert.py index 89b3726..26a38c5 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -46,8 +46,6 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - # Path("/Users/cau/Downloads/Issue-36122.pdf"), - # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"), Path("./test/data/2206.01062.pdf"), Path("./test/data/2203.01017v2.pdf"), Path("./test/data/2305.03393v1.pdf"), From 78b154fde7d6307b111bf33d443d3dea935198ae Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 15 Jul 2024 12:18:20 +0200 Subject: [PATCH 03/15] Add repo, absolute URLs Signed-off-by: Christoph Auer --- README.md | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c9e0e20..d716832 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ You can limit the CPU threads used by `docling` by setting the environment varia ## Contributing -Please read [Contributing to Docling](./CONTRIBUTING.md) for details. +Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details. 
## References diff --git a/pyproject.toml b/pyproject.toml index a603ba1..5f0c489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,8 @@ description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" readme = "README.md" +repository = "https://github.com/DS4SD/docling" +homepage = "https://github.com/DS4SD/docling" keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"] classifiers = [ "License :: OSI Approved :: MIT License", From 6c016001947d7bcdfb2c0f29e102e967aad75b61 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:21:13 +0200 Subject: [PATCH 04/15] Optimizations for table extraction quality, configurable options for cell matching Signed-off-by: Christoph Auer --- docling/datamodel/base_models.py | 28 +++++++++-- docling/models/table_structure_model.py | 54 ++++++++++++++++----- docling/pipeline/standard_model_pipeline.py | 2 +- examples/convert.py | 2 - 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index dd9795a..8b6796d 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,3 +1,4 @@ +import copy from enum import Enum, auto from io import BytesIO from typing import Any, Dict, List, Optional, Tuple, Union @@ -47,6 +48,15 @@ def width(self): def height(self): return abs(self.t - self.b) + def scaled(self, scale: float) -> "BoundingBox": + out_bbox = copy.deepcopy(self) + out_bbox.l *= scale + out_bbox.r *= scale + out_bbox.t *= scale + out_bbox.b *= scale + + return out_bbox + def as_tuple(self): if self.coord_origin == CoordOrigin.TOPLEFT: return (self.l, self.t, self.r, self.b) @@ -180,8 +190,7 @@ class TableStructurePrediction(BaseModel): table_map: Dict[int, TableElement] = {} -class TextElement(BasePageElement): - ... +class TextElement(BasePageElement): ... class FigureData(BaseModel): @@ -242,6 +251,17 @@ class DocumentStream(BaseModel): stream: BytesIO +class TableStructureOptions(BaseModel): + do_cell_matching: bool = ( + True + # True: Matches predictions back to PDF cells. Can break table output if PDF cells + # are merged across table columns. + # False: Let table structure model define the text cells, ignore PDF cells. 
+ ) + + class PipelineOptions(BaseModel): - do_table_structure: bool = True - do_ocr: bool = False + do_table_structure: bool = True # True: perform table structure extraction + do_ocr: bool = False # True: perform OCR, replace programmatic PDF text + + table_structure_options: TableStructureOptions = TableStructureOptions() diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 8ee4bda..132b141 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,7 +1,10 @@ -from typing import Iterable +import copy +import random +from typing import Iterable, List import numpy from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor +from PIL import ImageDraw from docling.datamodel.base_models import ( BoundingBox, @@ -28,6 +31,21 @@ def __init__(self, config): self.tm_model_type = self.tm_config["model"]["type"] self.tf_predictor = TFPredictor(self.tm_config) + self.scale = 2.0 # Scale up table input images to 144 dpi + + def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): + image = page._backend.get_page_image() + draw = ImageDraw.Draw(image) + + for table_element in tbl_list: + x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="red") + + for tc in table_element.table_cells: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="blue") + + image.show() def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -36,16 +54,17 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: return for page in page_batch: + page.predictions.tablestructure = TableStructurePrediction() # dummy in_tables = [ ( cluster, [ - round(cluster.bbox.l), - round(cluster.bbox.t), - round(cluster.bbox.r), - round(cluster.bbox.b), + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, ], ) for cluster in page.predictions.layout.clusters @@ -65,20 +84,29 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: ): # Only allow non empty stings (spaces) into the cells of a table if len(c.text.strip()) > 0: - tokens.append(c.model_dump()) + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled(scale=self.scale) + + tokens.append(new_cell.model_dump()) - iocr_page = { - "image": numpy.asarray(page.image), + page_input = { "tokens": tokens, - "width": page.size.width, - "height": page.size.height, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, } + # add image to page input. 
+ if self.scale == 1.0: + page_input["image"] = numpy.asarray(page.image) + else: # render new page image on the fly at desired scale + page_input["image"] = numpy.asarray( + page._backend.get_page_image(scale=self.scale) + ) table_clusters, table_bboxes = zip(*in_tables) if len(table_bboxes): tf_output = self.tf_predictor.multi_table_predict( - iocr_page, table_bboxes, do_matching=self.do_cell_matching + page_input, table_bboxes, do_matching=self.do_cell_matching ) for table_cluster, table_out in zip(table_clusters, tf_output): @@ -91,6 +119,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) + tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) # Retrieving cols/rows, after post processing: @@ -111,4 +140,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: page.predictions.tablestructure.table_map[table_cluster.id] = tbl + # For debugging purposes: + # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) + yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index 07c0113..33fee75 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -34,7 +34,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): "artifacts_path": artifacts_path / StandardModelPipeline._table_model_path, "enabled": pipeline_options.do_table_structure, - "do_cell_matching": False, + "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, } ), ] diff --git a/examples/convert.py b/examples/convert.py index 89b3726..26a38c5 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -46,8 +46,6 @@ def main(): logging.basicConfig(level=logging.INFO) input_doc_paths = [ - # Path("/Users/cau/Downloads/Issue-36122.pdf"), - # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"), Path("./test/data/2206.01062.pdf"), Path("./test/data/2203.01017v2.pdf"), Path("./test/data/2305.03393v1.pdf"), From f652bad2d1e531fc24933356aea5a907836e1ca6 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:43:05 +0200 Subject: [PATCH 05/15] docs: Update links, add GH repository to metadata (#1) * Add repo, absolute URLs Signed-off-by: Christoph Auer * Bump version Signed-off-by: Christoph Auer --------- Signed-off-by: Christoph Auer Co-authored-by: Christoph Auer Signed-off-by: Christoph Auer --- README.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d716832..9c4217a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- Docling + Docling

# Docling diff --git a/pyproject.toml b/pyproject.toml index 5f0c489..133bb74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.0" +version = "0.1.1" description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From a5113eb78e7278794a8a9dbc0cdfabc7ca2aabd4 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Mon, 15 Jul 2024 14:59:53 +0200 Subject: [PATCH 06/15] doc: More documentation updates (#2) * Update README.md Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Update Dockerfile Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Bump version Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --------- Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Christoph Auer --- Dockerfile | 3 +-- README.md | 4 ++-- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index bef7be5..b2138a6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,8 +6,7 @@ RUN apt-get update \ && apt-get install -y libgl1 libglib2.0-0 curl wget git \ && apt-get clean -RUN --mount=type=ssh \ - pip install --no-cache-dir https://github.com/DS4SD/docling.git +RUN pip install --no-cache-dir docling ENV HF_HOME=/tmp/ ENV TORCH_HOME=/tmp/ diff --git a/README.md b/README.md index 9c4217a..1367cfd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # Docling -Dockling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package. +Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package. ## Features * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast @@ -30,7 +30,7 @@ poetry install ## Usage -For basic usage, see the [convert.py](examples/convert.py) example module. Run with: +For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. 
Run with: ``` python examples/convert.py diff --git a/pyproject.toml b/pyproject.toml index 133bb74..1883ae9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.1" +version = "0.1.2" description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From cf37ace24cdd22f029285353b44491f462c7bc70 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:05:04 +0200 Subject: [PATCH 07/15] ci: Add Github Actions (#4) * add Github Actions Signed-off-by: Michele Dolfi * apply styling Signed-off-by: Michele Dolfi * Update .github/actions/setup-poetry/action.yml Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> * add semantic-release config Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Christoph Auer --- .github/actions/setup-poetry/action.yml | 19 ++++++++ .github/scripts/release.sh | 39 ++++++++++++++++ .github/workflows/cd.yml | 59 +++++++++++++++++++++++++ .github/workflows/checks.yml | 16 +++++++ .github/workflows/ci.yml | 28 ++++++++++++ .github/workflows/pypi.yml | 21 +++++++++ .gitignore | 1 - pyproject.toml | 15 ++++++- 8 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 .github/actions/setup-poetry/action.yml create mode 100755 .github/scripts/release.sh create mode 100644 .github/workflows/cd.yml create mode 100644 .github/workflows/checks.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/pypi.yml diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml new file mode 100644 index 0000000..e9ce697 --- /dev/null +++ b/.github/actions/setup-poetry/action.yml @@ -0,0 +1,19 @@ +name: 'Set up Poetry and install' +description: 'Set up a specific version of Poetry and install dependencies using caching.' +inputs: + python-version: + description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax." + default: '3.11' +runs: + using: 'composite' + steps: + - name: Install poetry + run: pipx install poetry==1.8.3 + shell: bash + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + cache: 'poetry' + - name: Install dependencies + run: poetry install --all-extras + shell: bash diff --git a/.github/scripts/release.sh b/.github/scripts/release.sh new file mode 100755 index 0000000..6cac400 --- /dev/null +++ b/.github/scripts/release.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -e # trigger failure on error - do not remove! 
+set -x # display command on output + +if [ -z "${TARGET_VERSION}" ]; then + >&2 echo "No TARGET_VERSION specified" + exit 1 +fi +CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}" + +# update package version +poetry version "${TARGET_VERSION}" + +# collect release notes +REL_NOTES=$(mktemp) +poetry run semantic-release changelog --unreleased >> "${REL_NOTES}" + +# update changelog +TMP_CHGLOG=$(mktemp) +TARGET_TAG_NAME="v${TARGET_VERSION}" +RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}" +printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}" +cat "${REL_NOTES}" >> "${TMP_CHGLOG}" +if [ -f "${CHGLOG_FILE}" ]; then + printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}" +fi +mv "${TMP_CHGLOG}" "${CHGLOG_FILE}" + +# push changes +git config --global user.name 'github-actions[bot]' +git config --global user.email 'github-actions[bot]@users.noreply.github.com' +git add pyproject.toml "${CHGLOG_FILE}" +COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]" +git commit -m "${COMMIT_MSG}" +git push origin main + +# create GitHub release (incl. Git tag) +gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}" diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..b65c72c --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,59 @@ +name: "Run CD" + +on: + push: + branches: + - main + +env: + # disable keyring (https://github.com/actions/runner-images/issues/6185): + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + +jobs: + docs: + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + - name: Build and push docs + run: poetry run mkdocs gh-deploy --force + code-checks: + uses: ./.github/workflows/checks.yml + pre-release-check: + runs-on: ubuntu-latest + outputs: + TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # for fetching tags, required for semantic-release + - uses: ./.github/actions/setup-poetry + - name: Check version of potential release + id: version_check + run: | + TRGT_VERSION=$(poetry run semantic-release print-version) + echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT + echo "${TRGT_VERSION}" + - name: Check notes of potential release + run: poetry run semantic-release changelog --unreleased + release: + needs: [code-checks, pre-release-check] + if: needs.pre-release-check.outputs.TARGET_TAG_V != '' + environment: auto-release + runs-on: ubuntu-latest + concurrency: release + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.GH_PAT }} + fetch-depth: 0 # for fetching tags, required for semantic-release + - uses: ./.github/actions/setup-poetry + - name: Run release script + env: + GH_TOKEN: ${{ secrets.GH_PAT }} + TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }} + CHGLOG_FILE: CHANGELOG.md + run: ./.github/scripts/release.sh + shell: bash diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000..6e2ff36 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,16 @@ +on: + workflow_call: + +jobs: + run-checks: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.11'] + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + with: + python-version: ${{ matrix.python-version }} + - name: Run styling check + run: poetry run pre-commit run --all-files diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..33fb37a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: "Run CI" + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + push: + branches: + - "**" + - "!main" + - "!gh-pages" + +env: + # disable keyring (https://github.com/actions/runner-images/issues/6185): + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + +jobs: + code-checks: + uses: ./.github/workflows/checks.yml + + # To enable when we add the ./docs + # build-docs: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - uses: ./.github/actions/setup-poetry + # - name: Build docs + # run: poetry run mkdocs build --verbose --clean + \ No newline at end of file diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..0d206b2 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,21 @@ +name: "Build and publish package" + +on: + release: + types: [published] + +permissions: + contents: read + +env: + # disable keyring (https://github.com/actions/runner-images/issues/6185): + PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring + +jobs: + build-and-publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-poetry + - name: Build and publish + run: poetry publish --build --no-interaction --username=__token__ --password=${{ secrets.PYPI_TOKEN }} diff --git a/.gitignore b/.gitignore index bb3e254..800f568 100644 --- a/.gitignore +++ b/.gitignore @@ -413,7 +413,6 @@ tags [Ll]ib [Ll]ib64 [Ll]ocal -[Ss]cripts pyvenv.cfg pip-selfcheck.json diff --git a/pyproject.toml b/pyproject.toml index 1883ae9..35dc361 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.2" +version = "0.1.2" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" @@ -72,3 +72,16 @@ python_version = "3.11" [tool.flake8] max-line-length = 88 extend-ignore = ["E203", "E501"] + +[tool.semantic_release] +# for default values check: +# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg + +version_source = "tag_only" +branch = "main" + +# configure types which should trigger minor and patch version bumps respectively +# (note that they must be a subset of the configured allowed types): +parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test" +parser_angular_minor_types = "feat" +parser_angular_patch_types = "fix,perf" From 1a7a07e931f60a5d5de5bbdaaa6669ba41af9f5d Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:14:44 +0200 Subject: [PATCH 08/15] disable docs build (#5) Signed-off-by: Christoph Auer --- .github/workflows/cd.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index b65c72c..0524898 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -10,15 +10,17 @@ env: PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring jobs: - docs: - permissions: - contents: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: ./.github/actions/setup-poetry - - name: Build and push docs - run: poetry run mkdocs gh-deploy 
--force + # To be enabled when we add docs + # docs: + # permissions: + # contents: write + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - uses: ./.github/actions/setup-poetry + # - name: Build and push docs + # run: poetry run mkdocs gh-deploy --force + code-checks: uses: ./.github/workflows/checks.yml pre-release-check: From 4e97a9ddfa822086f07dd82e5eb49215ccee6dad Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:34:42 +0200 Subject: [PATCH 09/15] feat: build with ci (#6) Signed-off-by: Michele Dolfi Signed-off-by: Christoph Auer From e4674852c202e7693086a101d76f321cf78bf754 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jul 2024 11:37:14 +0000 Subject: [PATCH 10/15] chore: bump version to 0.2.0 [skip ci] Signed-off-by: Christoph Auer --- CHANGELOG.md | 5 +++++ pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..deb4568 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +## [v0.2.0](https://github.com/DS4SD/docling/releases/tag/v0.2.0) - 2024-07-16 + +### Feature + +* Build with ci ([#6](https://github.com/DS4SD/docling/issues/6)) ([`b1479cf`](https://github.com/DS4SD/docling/commit/b1479cf4ecf8a586703b31c7cf6917b3293c6a85)) diff --git a/pyproject.toml b/pyproject.toml index 35dc361..a648a70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.1.2" # DO NOT EDIT, updated automatically +version = "0.2.0" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From f3d65577a4a4e60775cc9e7b17fbaaba91a736d7 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 16 Jul 2024 14:15:09 +0200 Subject: [PATCH 11/15] docs: Add setup with pypi to Readme (#7) Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Christoph Auer --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1367cfd..f70c015 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,19 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co ## Setup -You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer). +For general usage, you can simply install `docling` through `pip` from the pypi package index. +``` +pip install docling +``` + +**Notes**: +* Works on macOS and Linux environments. Windows platforms are currently not tested. + +### Development setup -Once you have `poetry` installed, create an environment and install the package: +To develop for `docling`, you need Python 3.11 and `poetry`. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer). + +Once you have `poetry` installed and cloned this repo, create an environment and install `docling` from the repo root: ```bash poetry env use $(which python3.11) @@ -24,10 +34,6 @@ poetry shell poetry install ``` -**Notes**: -* Works on macOS and Linux environments. Windows platforms are currently not tested. - - ## Usage For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. 
Run with: From 5e26e8d6e360c913bd7329a37dece0822ec01be8 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Wed, 17 Jul 2024 14:03:26 +0200 Subject: [PATCH 12/15] feat: enable python 3.12 support by updating glm (#8) * update deepsearch-glm for python 3.12 support Signed-off-by: Michele Dolfi * enable python 3.12 in ci tests Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi Signed-off-by: Christoph Auer --- .github/workflows/checks.yml | 2 +- poetry.lock | 91 +++++++++++------------------------- pyproject.toml | 2 +- 3 files changed, 30 insertions(+), 65 deletions(-) diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 6e2ff36..69a4178 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.11'] + python-version: ['3.11', '3.12'] steps: - uses: actions/checkout@v3 - uses: ./.github/actions/setup-poetry diff --git a/poetry.lock b/poetry.lock index 25ce193..b59f027 100644 --- a/poetry.lock +++ b/poetry.lock @@ -74,17 +74,6 @@ tests = ["attrs[tests-no-zope]", "zope-interface"] tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] -[[package]] -name = "bashlex" -version = "0.18" -description = "Python parser for bash" -optional = false -python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4" -files = [ - {file = "bashlex-0.18-py2.py3-none-any.whl", hash = "sha256:91d73a23a3e51711919c1c899083890cdecffc91d8c088942725ac13e9dcfffa"}, - {file = "bashlex-0.18.tar.gz", hash = "sha256:5bb03a01c6d5676338c36fd1028009c8ad07e7d61d8a1ce3f513b7fff52796ee"}, -] - [[package]] name = "black" version = "24.4.2" @@ -131,17 +120,6 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] -[[package]] -name = "bracex" -version = "2.4" -description = "Bash style brace expander." -optional = false -python-versions = ">=3.8" -files = [ - {file = "bracex-2.4-py3-none-any.whl", hash = "sha256:efdc71eff95eaff5e0f8cfebe7d01adf2c8637c8c92edaf63ef348c241a82418"}, - {file = "bracex-2.4.tar.gz", hash = "sha256:a27eaf1df42cf561fed58b7a8f3fdf129d1ea16a81e1fadd1d17989bc6384beb"}, -] - [[package]] name = "build" version = "1.2.1" @@ -371,32 +349,6 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] -[[package]] -name = "cibuildwheel" -version = "2.19.2" -description = "Build Python wheels on CI with minimal configuration." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "cibuildwheel-2.19.2-py3-none-any.whl", hash = "sha256:02ead5d7e3e81fe2ee0afb78746b1494af6b37afc1e32fae12f9c9a28c14e369"}, - {file = "cibuildwheel-2.19.2.tar.gz", hash = "sha256:d331c81c505106ee585333b871718cf0516ac10d55c4dda2c00c8a7405743cab"}, -] - -[package.dependencies] -bashlex = "!=0.13" -bracex = "*" -certifi = "*" -filelock = "*" -packaging = ">=20.9" -platformdirs = "*" - -[package.extras] -bin = ["click", "packaging (>=21.0)", "pip-tools", "pygithub", "pyyaml", "requests", "rich (>=9.6)"] -dev = ["build", "click", "jinja2", "packaging (>=21.0)", "pip-tools", "pygithub", "pytest (>=6)", "pytest-timeout", "pytest-xdist", "pyyaml", "requests", "rich (>=9.6)", "tomli-w", "validate-pyproject"] -docs = ["jinja2 (>=3.1.2)", "mkdocs (==1.3.1)", "mkdocs-include-markdown-plugin (==2.8.0)", "mkdocs-macros-plugin", "pymdown-extensions"] -test = ["build", "jinja2", "pytest (>=6)", "pytest-timeout", "pytest-xdist", "tomli-w", "validate-pyproject"] -uv = ["uv"] - [[package]] name = "cleo" version = "2.1.0" @@ -643,32 +595,45 @@ files = [ [[package]] name = "deepsearch-glm" -version = "0.18.4" +version = "0.19.0" description = "Graph Language Models" optional = false python-versions = "<4.0,>=3.8" files = [ - {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ad88c5bf3c203174ef81e0699405aec0f5386130cbc6a975b165f81887bc1a52"}, - {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:21d51a0671f0713d23be57030287a0f907f4a5f0627a45ea07e2caf54129a71a"}, - {file = "deepsearch_glm-0.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fc853941ea751a15f65e83f9bee9f988d0ecac4b28fac067b2aab49e15edb74"}, - {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cab5e577cf724343f2a5987ff4488c69e86a2dbca8cb0359c9243a07c6cd7d69"}, - {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:dda02391306d657a884b12f21cc3d1228663f940ec6001c833893dd2844bcc25"}, - {file = "deepsearch_glm-0.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dccd4286a93ee1a216acba27e1fc76f5d14e280d968998cfeae11a00ad1b6cb"}, - {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:cf38368bc72eab673459ea0fc96c02b1f3ae120df2d9443e1a63e010764ac1e9"}, - {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:d3fd83ea3b2bce11bac1d710f12547728f4dd48bfaa8bd472366ef144469d52c"}, - {file = "deepsearch_glm-0.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fb4bfd43ac3b996cdd151c35e94fa399953ee3952d7e86390a825880ece95f3"}, - {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:57cb67e435cacb6c4a6b6a9109d943267c493ebbba252a88ca40909976f60225"}, - {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:edc399939b6464f96600d2f23796ae2641d668fb794b77199e87abdef77f8853"}, - {file = "deepsearch_glm-0.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00ad8d932e7f0d1be4fd99fc0d4c8d50cb1ff10764f146b6ecb310a1379123d4"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:d420c7eb4e27b64cdc33c0beba159147fc4be14e141133f0f6ef080465b2529c"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:8af4583ea6d914e87d6db96cae1d73272af6fe85193e67406f0c700064e794c2"}, + {file = 
"deepsearch_glm-0.19.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:772e6e245b4d77d9df84af07693f9c19bc2f3dc6de4cb44deaf5fdd4a6c8e68d"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:6a0c29f8cf8a1ee392c68985f8952a01b43dd8f2c5a1476b890f2c90d7ecbc96"}, + {file = "deepsearch_glm-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb4f34bb5e45df2790eb6bbaf5caa625393d903da502b086de65df9ce4e3fff2"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:320195914e96b8197e53665594c4480b86f3fc4cacd5e6782befb2bb94494a40"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:7221851c304ef364a13eeffa940a7c15592e9d5b0050b97904221a65be33f3ab"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:b9b9e7102cf4355be8458569c7a44133b54446ae623923772db6942ce0fb2e87"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:fa8d8d718149cfecd724a0eca246a3bd57588dffb757f204b629a35623d8f946"}, + {file = "deepsearch_glm-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae251cf69b43d945fbf2cd41a89ba12312cd319a1c28d41c99d35cc476376b5"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:ad9a1fbf76e2561bc37e238ee9dd320b4b9cd49e61c55613e3977eedadee52bc"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:2030aec8ce751927fe20ca1788e125e9b0c37f994c30062e59c4d7b7a87cbb64"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:e57611b5d6cc2da91901e4b39fab6c9131dffe8766f43c20093bff75a0039100"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:efbcf157cd6bd2dd6138312cef5df378598fd67e6c3f6f0b63ed3342c1de7f49"}, + {file = "deepsearch_glm-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9febce49f365fcc5ada1ded720d387c51328ee470d4fcc89044b0684e074e699"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_13_6_arm64.whl", hash = "sha256:04a29ba5e942f32659ae1a65cfe5e90e93d50e058d53b4763fe13df93f30492f"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_13_6_x86_64.whl", hash = "sha256:a0024f42d6711f574dcab52ef2914a55f31b4fd804d3ad20ca7f211498e8a19b"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:513e5f1de14f0b12c916a52118083094a9ced439e4800d3442b2dd04f3cdbead"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:6d3dd07a549b8cd4408308b0b6b8ca65397ce7e8c819d050d8b2deb03cd1977e"}, + {file = "deepsearch_glm-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7be341a85ce1ff164963a6d58b50955599dc33b34040975c972a798ae0f6f12c"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:b6c3c0d1295666a8a68f76262c020ffdc6de64cdd95671bf24c0592fa1317533"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:cf290fe3824bd0de01b7c1d681aa14c89c5e60c6735fa471e04a985e55aead44"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:92943c495646660aef99ba64a7e3b77ffeca4866e96044f8be5e14dfa7ee660e"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:a7c89d6fae4ed9dc960f9ee9734e91d321222080bf439e1d89e8c67270afc282"}, + {file = "deepsearch_glm-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:fb6398a34f5afac6282c4a5b7ea5a89f27fcf4c0adac43af27ecbac9e2731ce3"}, ] [package.dependencies] -cibuildwheel = ">=2.17.0,<3.0.0" deepsearch-toolkit = ">=0.31.0" matplotlib = ">=3.7.1,<4.0.0" networkx = ">=3.1,<4.0" netwulf = ">=0.1.5,<0.2.0" numerize = ">=0.12,<0.13" +numpy = {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\""} pandas = ">=1.5.1" pybind11 = ">=2.10.4,<3.0.0" python-dotenv = ">=1.0.0,<2.0.0" @@ -4862,4 +4827,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "1f0c0eb64cdce7e5c21670841d5dea047b3f918a3041bb22bb9c615b9085da20" +content-hash = "dc19329559f190dfe687b4ee272eb6dac66b3d9fe0398c95c2572e8c63fa23ac" diff --git a/pyproject.toml b/pyproject.toml index a648a70..03e2b1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ python = "^3.11" pydantic = "^2.0.0" docling-core = "^0.2.0" docling-ibm-models = "^0.2.0" -deepsearch-glm = ">=0.18.4,<1" +deepsearch-glm = ">=0.19.0,<1" deepsearch-toolkit = ">=0.47.0,<1" filetype = "^1.2.0" pypdfium2 = "^4.30.0" From 86c2a7fc1e9906443f0a8eccd6828e88fe09179b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jul 2024 12:11:15 +0000 Subject: [PATCH 13/15] chore: bump version to 0.3.0 [skip ci] Signed-off-by: Christoph Auer --- CHANGELOG.md | 10 ++++++++++ pyproject.toml | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index deb4568..75b26cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## [v0.3.0](https://github.com/DS4SD/docling/releases/tag/v0.3.0) - 2024-07-17 + +### Feature + +* Enable python 3.12 support by updating glm ([#8](https://github.com/DS4SD/docling/issues/8)) ([`fb72688`](https://github.com/DS4SD/docling/commit/fb72688ff7413083c864fe62d2dbfc420c1e5268)) + +### Documentation + +* Add setup with pypi to Readme ([#7](https://github.com/DS4SD/docling/issues/7)) ([`2803222`](https://github.com/DS4SD/docling/commit/2803222ee1708481c779d435dbf1c031929d3cf6)) + ## [v0.2.0](https://github.com/DS4SD/docling/releases/tag/v0.2.0) - 2024-07-16 ### Feature diff --git a/pyproject.toml b/pyproject.toml index 03e2b1b..7a39ac3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "0.2.0" # DO NOT EDIT, updated automatically +version = "0.3.0" # DO NOT EDIT, updated automatically description = "Docling PDF conversion package" authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Peter Staar "] license = "MIT" From 32905ab9591417458f6904075625a13c63db1e92 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:38:16 +0200 Subject: [PATCH 14/15] Add documentation Signed-off-by: Christoph Auer --- README.md | 21 ++++++++++++++++++++- docling/models/page_assemble_model.py | 12 ------------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f70c015..b89b03d 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,9 @@ python examples/convert.py ``` The output of the above command will be written to `./scratch`. 
-### Enable or disable pipeline features +### Adjust pipeline features + +**Control pipeline options** You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter` ```python @@ -54,6 +56,23 @@ doc_converter = DocumentConverter( ) ``` +**Control table extraction options** + +You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself. +This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one. + + +```python + +pipeline_options = PipelineOptions(do_table_structure=True) +pipeline_options.table_structure_options.do_cell_matching = True + +doc_converter = DocumentConverter( + artifacts_path=artifacts_path, + pipeline_options=pipeline_options, # Controls if OCR is applied (ignores programmatic content) +) +``` + ### Impose limits on the document size You can limit the file size and number of pages which should be allowed to process per document. diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4ed0832..2b9db54 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -19,18 +19,6 @@ class PageAssembleModel: def __init__(self, config): self.config = config - # self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)') - - # def sanitize_text_poor(self, lines): - # text = '\n'.join(lines) - # - # # treat line wraps. - # sanitized_text = self.line_wrap_pattern.sub('', text) - # - # sanitized_text = sanitized_text.replace('\n', ' ') - # - # return sanitized_text - def sanitize_text(self, lines): if len(lines) <= 1: return " ".join(lines) From 60302c312ce0b35ff2dfac219dc8546fa1a8fe21 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 17 Jul 2024 15:51:28 +0200 Subject: [PATCH 15/15] Documentation improvements Signed-off-by: Christoph Auer --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bcbe7ac..470936a 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,11 @@ This can improve output quality if you find that multiple columns in extracted t ```python pipeline_options = PipelineOptions(do_table_structure=True) -pipeline_options.table_structure_options.do_cell_matching = True +pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model doc_converter = DocumentConverter( artifacts_path=artifacts_path, - pipeline_options=pipeline_options, # Controls if OCR is applied (ignores programmatic content) + pipeline_options=pipeline_options, ) ```
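
Note on the table-quality patches (02/04) above: TableFormer now runs on a 2x (~144 dpi) rendering of the page, so table bounding boxes and PDF cell tokens are scaled up with the new `BoundingBox.scaled()` helper before prediction, and the predicted cells are scaled back with `1 / self.scale`. The sketch below only illustrates that round trip; it is not part of the patch set, and it assumes `BoundingBox` and `CoordOrigin` are importable from `docling.datamodel.base_models` as shown in the diff and that `BoundingBox` can be constructed directly from its `l`/`t`/`r`/`b` fields.

```python
# Illustrative sketch only (not part of the patches): the scale-up / scale-down
# round trip used by TableStructureModel above. Import path and constructor
# usage are assumptions based on the diff, not verified against the package.
from docling.datamodel.base_models import BoundingBox, CoordOrigin

scale = 2.0  # mirrors self.scale = 2.0 ("144 dpi") in table_structure_model.py

# A PDF text cell in original page coordinates.
pdf_cell_bbox = BoundingBox(l=10.0, t=20.0, r=110.0, b=40.0, coord_origin=CoordOrigin.TOPLEFT)

hi_res_bbox = pdf_cell_bbox.scaled(scale)   # passed to the model together with the 2x page image
restored = hi_res_bbox.scaled(1 / scale)    # how predicted cells are mapped back to PDF space

assert hi_res_bbox.width == 2 * pdf_cell_bbox.width
assert (restored.l, restored.t, restored.r, restored.b) == (10.0, 20.0, 110.0, 40.0)
```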