Skip to content

Commit

Permalink
fix: fix duplicate title and heading + add e2e tests for html and docx (#186)
Browse files Browse the repository at this point in the history

* add real e2e tests for html and docx

Signed-off-by: Peter Staar <[email protected]>

* updated the output of itxt

Signed-off-by: Peter Staar <[email protected]>

* reformatted the text

Signed-off-by: Peter Staar <[email protected]>

* fixed the tests

Signed-off-by: Peter Staar <[email protected]>

* fixed the tests (2)

Signed-off-by: Peter Staar <[email protected]>

* fixed the examples (1)

Signed-off-by: Peter Staar <[email protected]>

* fixed the output of the test

Signed-off-by: Peter Staar <[email protected]>

* updated the tests, moved the ground-truth

Signed-off-by: Peter Staar <[email protected]>

* moved the ground-truth data

Signed-off-by: Peter Staar <[email protected]>

* fixed the html tests

Signed-off-by: Peter Staar <[email protected]>

* restructure title fix (#187)

Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
Co-authored-by: Panos Vagenas <[email protected]>
  • Loading branch information
PeterStaar-IBM and vagenas authored Oct 30, 2024
1 parent dda2645 commit f542460
Show file tree
Hide file tree
Showing 49 changed files with 13,737 additions and 61 deletions.
50 changes: 25 additions & 25 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,31 +179,31 @@ def handle_header(self, element, idx, doc):
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
)

elif hlevel > self.level:

# add invisible group
for i in range(self.level + 1, hlevel):
self.parents[i] = doc.add_group(
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
)
self.level = hlevel

elif hlevel < self.level:

# remove the tail
for key, val in self.parents.items():
if key > hlevel:
self.parents[key] = None
self.level = hlevel

self.parents[hlevel] = doc.add_heading(
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
)
else:
if hlevel > self.level:

# add invisible group
for i in range(self.level + 1, hlevel):
self.parents[i] = doc.add_group(
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
)
self.level = hlevel

elif hlevel < self.level:

# remove the tail
for key, val in self.parents.items():
if key > hlevel:
self.parents[key] = None
self.level = hlevel

self.parents[hlevel] = doc.add_heading(
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
)

def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
Expand Down
8 changes: 4 additions & 4 deletions docs/examples/run_with_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
def main():
input_paths = [
Path("README.md"),
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/lorem_ipsum.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/html/wiki_duck.html"),
Path("tests/data/docx/word_sample.docx"),
Path("tests/data/docx/lorem_ipsum.docx"),
Path("tests/data/pptx/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Expand Down
27 changes: 5 additions & 22 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ torchvision = [
######################
python = "^3.10"
pydantic = "^2.0.0"
docling-core = "^2.2.1"
docling-core = "^2.2.3"
docling-ibm-models = "^2.0.1"
deepsearch-glm = "^0.26.1"
filetype = "^1.2.0"
Expand Down
File renamed without changes.
Binary file added tests/data/docx/unit_test_headers.docx
Binary file not shown.
Binary file added tests/data/docx/unit_test_lists.docx
Binary file not shown.
File renamed without changes.
12 changes: 12 additions & 0 deletions tests/data/groundtruth/docling_v2/example_01.html.itxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Introduction
item-2 at level 2: paragraph: This is the first paragraph of the introduction.
item-3 at level 2: section_header: Background
item-4 at level 3: paragraph: Some background information here.
item-5 at level 3: picture
item-6 at level 3: list: group list
item-7 at level 4: list_item: First item in unordered list
item-8 at level 4: list_item: Second item in unordered list
item-9 at level 3: ordered_list: group ordered list
item-10 at level 4: list_item: First item in ordered list
item-11 at level 4: list_item: Second item in ordered list
197 changes: 197 additions & 0 deletions tests/data/groundtruth/docling_v2/example_01.html.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
{
"schema_name": "DoclingDocument",
"version": "1.0.0",
"name": "example_01",
"origin": {
"mimetype": "text/html",
"binary_hash": 13782069548509991617,
"filename": "example_01.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
}
],
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/texts/2"
},
"children": [
{
"$ref": "#/texts/4"
},
{
"$ref": "#/texts/5"
}
],
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/texts/2"
},
"children": [
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
}
],
"name": "ordered list",
"label": "ordered_list"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
}
],
"label": "title",
"prov": [],
"orig": "Introduction",
"text": "Introduction"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [
{
"$ref": "#/texts/3"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
}
],
"label": "section_header",
"prov": [],
"orig": "Background",
"text": "Background",
"level": 2
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/2"
},
"children": [],
"label": "paragraph",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"label": "list_item",
"prov": [],
"orig": "First item in unordered list",
"text": "First item in unordered list",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"label": "list_item",
"prov": [],
"orig": "Second item in unordered list",
"text": "Second item in unordered list",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"label": "list_item",
"prov": [],
"orig": "First item in ordered list",
"text": "First item in ordered list",
"enumerated": true,
"marker": "1."
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"label": "list_item",
"prov": [],
"orig": "Second item in ordered list",
"text": "Second item in ordered list",
"enumerated": true,
"marker": "2."
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/texts/2"
},
"children": [],
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"pages": {}
}
15 changes: 15 additions & 0 deletions tests/data/groundtruth/docling_v2/example_01.html.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Introduction

This is the first paragraph of the introduction.

## Background

Some background information here.

<!-- image -->

- First item in unordered list
- Second item in unordered list

1. First item in ordered list
2. Second item in ordered list
Loading

0 comments on commit f542460

Please sign in to comment.