From f542460af3c7676e5f8dee3b6ce729b139560cd6 Mon Sep 17 00:00:00 2001 From: "Peter W. J. Staar" <91719829+PeterStaar-IBM@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:14:56 +0100 Subject: [PATCH] fix: fix duplicate title and heading + add e2e tests for html and docx (#186) * add real e2e tests for html and docx Signed-off-by: Peter Staar * updated the output of itxt Signed-off-by: Peter Staar * reformatted the text Signed-off-by: Peter Staar * fixed the tests Signed-off-by: Peter Staar * fixed the tests (2) Signed-off-by: Peter Staar * fixed the examples (1) Signed-off-by: Peter Staar * fixed the output of the test Signed-off-by: Peter Staar * updated the tests, moved the ground-truth Signed-off-by: Peter Staar * moved the ground-truth data Signed-off-by: Peter Staar * fixed the html tests Signed-off-by: Peter Staar * restructure title fix (#187) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Peter Staar Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/html_backend.py | 50 +- docs/examples/run_with_formats.py | 8 +- poetry.lock | 27 +- pyproject.toml | 2 +- tests/data/{ => docx}/lorem_ipsum.docx | Bin tests/data/docx/unit_test_headers.docx | Bin 0 -> 13903 bytes tests/data/docx/unit_test_lists.docx | Bin 0 -> 15769 bytes tests/data/{ => docx}/word_sample.docx | Bin .../docling_v2/example_01.html.itxt | 12 + .../docling_v2/example_01.html.json | 197 + .../groundtruth/docling_v2/example_01.html.md | 15 + .../docling_v2/example_02.html.itxt | 11 + .../docling_v2/example_02.html.json | 180 + .../groundtruth/docling_v2/example_02.html.md | 13 + .../docling_v2/example_03.html.itxt | 20 + .../docling_v2/example_03.html.json | 624 ++ .../groundtruth/docling_v2/example_03.html.md | 27 + .../docling_v2/example_04.html.itxt | 3 + .../docling_v2/example_04.html.json | 329 + .../groundtruth/docling_v2/example_04.html.md | 7 + .../docling_v2/lorem_ipsum.docx.itxt | 10 + .../docling_v2/lorem_ipsum.docx.json | 156 + .../docling_v2/lorem_ipsum.docx.md | 9 + .../docling_v2/unit_test_01.html.itxt | 9 + .../docling_v2/unit_test_01.html.json | 151 + .../docling_v2/unit_test_01.html.md | 13 + .../docling_v2/unit_test_headers.docx.itxt | 48 + .../docling_v2/unit_test_headers.docx.json | 703 ++ .../docling_v2/unit_test_headers.docx.md | 43 + .../docling_v2/unit_test_lists.docx.itxt | 61 + .../docling_v2/unit_test_lists.docx.json | 921 ++ .../docling_v2/unit_test_lists.docx.md | 48 + .../docling_v2/wiki_duck.html.itxt | 464 + .../docling_v2/wiki_duck.html.json | 7988 +++++++++++++++++ .../groundtruth/docling_v2/wiki_duck.html.md | 532 ++ .../docling_v2/word_sample.docx.itxt | 29 + .../docling_v2/word_sample.docx.json | 749 ++ .../docling_v2/word_sample.docx.md | 43 + tests/data/html/example_01.html | 17 + tests/data/html/example_02.html | 16 + tests/data/html/example_03.html | 66 + tests/data/html/example_04.html | 24 + tests/data/html/unit_test_01.html | 11 + tests/data/{ => html}/wiki_duck.html | 0 tests/data/{ => pptx}/powerpoint_sample.pptx | Bin tests/test_backend_html.py | 76 +- tests/test_backend_msword.py | 76 +- tests/test_e2e_conversion.py | 2 +- tests/test_legacy_format_transform.py | 8 +- 49 files changed, 13737 insertions(+), 61 deletions(-) rename tests/data/{ => docx}/lorem_ipsum.docx (100%) create mode 100644 tests/data/docx/unit_test_headers.docx create mode 100644 tests/data/docx/unit_test_lists.docx rename tests/data/{ => docx}/word_sample.docx (100%) create mode 100644 tests/data/groundtruth/docling_v2/example_01.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_01.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_01.html.md create mode 100644 tests/data/groundtruth/docling_v2/example_02.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_02.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_02.html.md create mode 100644 tests/data/groundtruth/docling_v2/example_03.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_03.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_03.html.md create mode 100644 tests/data/groundtruth/docling_v2/example_04.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_04.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_04.html.md create mode 100644 tests/data/groundtruth/docling_v2/lorem_ipsum.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json create mode 100644 tests/data/groundtruth/docling_v2/lorem_ipsum.docx.md create mode 100644 tests/data/groundtruth/docling_v2/unit_test_01.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/unit_test_01.html.json create mode 100644 tests/data/groundtruth/docling_v2/unit_test_01.html.md create mode 100644 tests/data/groundtruth/docling_v2/unit_test_headers.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/unit_test_headers.docx.json create mode 100644 tests/data/groundtruth/docling_v2/unit_test_headers.docx.md create mode 100644 tests/data/groundtruth/docling_v2/unit_test_lists.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/unit_test_lists.docx.json create mode 100644 tests/data/groundtruth/docling_v2/unit_test_lists.docx.md create mode 100644 tests/data/groundtruth/docling_v2/wiki_duck.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/wiki_duck.html.json create mode 100644 tests/data/groundtruth/docling_v2/wiki_duck.html.md create mode 100644 tests/data/groundtruth/docling_v2/word_sample.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/word_sample.docx.json create mode 100644 tests/data/groundtruth/docling_v2/word_sample.docx.md create mode 100644 tests/data/html/example_01.html create mode 100644 tests/data/html/example_02.html create mode 100644 tests/data/html/example_03.html create mode 100644 tests/data/html/example_04.html create mode 100644 tests/data/html/unit_test_01.html rename tests/data/{ => html}/wiki_duck.html (100%) rename tests/data/{ => pptx}/powerpoint_sample.pptx (100%) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7fd69cff..7d14c2eb 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -179,31 +179,31 @@ def handle_header(self, element, idx, doc): self.parents[self.level] = doc.add_text( parent=self.parents[0], label=DocItemLabel.TITLE, text=text ) - - elif hlevel > self.level: - - # add invisible group - for i in range(self.level + 1, hlevel): - self.parents[i] = doc.add_group( - name=f"header-{i}", - label=GroupLabel.SECTION, - parent=self.parents[i - 1], - ) - self.level = hlevel - - elif hlevel < self.level: - - # remove the tail - for key, val in self.parents.items(): - if key > hlevel: - self.parents[key] = None - self.level = hlevel - - self.parents[hlevel] = doc.add_heading( - parent=self.parents[hlevel - 1], - text=text, - level=hlevel, - ) + else: + if hlevel > self.level: + + # add invisible group + for i in range(self.level + 1, hlevel): + self.parents[i] = doc.add_group( + name=f"header-{i}", + label=GroupLabel.SECTION, + parent=self.parents[i - 1], + ) + self.level = hlevel + + elif hlevel < self.level: + + # remove the tail + for key, val in self.parents.items(): + if key > hlevel: + self.parents[key] = None + self.level = hlevel + + self.parents[hlevel] = doc.add_heading( + parent=self.parents[hlevel - 1], + text=text, + level=hlevel, + ) def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py index a3b62b2d..7bd27de5 100644 --- a/docs/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -20,10 +20,10 @@ def main(): input_paths = [ Path("README.md"), - Path("tests/data/wiki_duck.html"), - Path("tests/data/word_sample.docx"), - Path("tests/data/lorem_ipsum.docx"), - Path("tests/data/powerpoint_sample.pptx"), + Path("tests/data/html/wiki_duck.html"), + Path("tests/data/docx/word_sample.docx"), + Path("tests/data/docx/lorem_ipsum.docx"), + Path("tests/data/pptx/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/2206.01062.pdf"), Path("tests/data/test_01.asciidoc"), diff --git a/poetry.lock b/poetry.lock index b832ed3e..b1fa58b7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -894,13 +894,13 @@ files = [ [[package]] name = "docling-core" -version = "2.2.1" +version = "2.2.3" description = "A python library to define and validate data types in Docling." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.2.1-py3-none-any.whl", hash = "sha256:65ed05331f387410950e10d7d2347eae770ab7dc4b5a632715aaa7c66c158cb5"}, - {file = "docling_core-2.2.1.tar.gz", hash = "sha256:4893369fe2aac9dff26c85a4ff87990f2e1645d9e16473ac7309e3459a3c4219"}, + {file = "docling_core-2.2.3-py3-none-any.whl", hash = "sha256:3080c0fb916dbc6a445b1c69a0a71922a902c61205b5dc434cd4bb727a72166c"}, + {file = "docling_core-2.2.3.tar.gz", hash = "sha256:c6e622e61792a3edebf34f560d91f12abfa5e97afcaf7930f3b4d6a310de8f7a"}, ] [package.dependencies] @@ -3622,43 +3622,31 @@ python-versions = ">=3.9" files = [ {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, - {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, - {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, - {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, - {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, - {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, - {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, - {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, - {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, - {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, - {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, @@ -5596,11 +5584,6 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, - {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -7180,4 +7163,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "48127a4b7e05f31a1c9f2c6f9b0ac8da61ac67309a8dd020b41e7ec82ecff38e" +content-hash = "0fa3ccd2d6b5d23f83e5abe3f5ae72ad7b0835045f4acd92388bb22c168011b0" diff --git a/pyproject.toml b/pyproject.toml index 26c34f55..9634708d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ torchvision = [ ###################### python = "^3.10" pydantic = "^2.0.0" -docling-core = "^2.2.1" +docling-core = "^2.2.3" docling-ibm-models = "^2.0.1" deepsearch-glm = "^0.26.1" filetype = "^1.2.0" diff --git a/tests/data/lorem_ipsum.docx b/tests/data/docx/lorem_ipsum.docx similarity index 100% rename from tests/data/lorem_ipsum.docx rename to tests/data/docx/lorem_ipsum.docx diff --git a/tests/data/docx/unit_test_headers.docx b/tests/data/docx/unit_test_headers.docx new file mode 100644 index 0000000000000000000000000000000000000000..6fce0d3b6711a77cfea89878b89657d2d2e89eb9 GIT binary patch literal 13903 zcmeHuWmH_twr&H#J$U09+}+*Xg1ZNT1$TFM2?Prc!QBD`cXzko?(jPM>~puA{l**P z{k><6(bc_H)in006{*wfsqIEieFp2oeB51Hgc5i`ahr zWcu-wzN(v@siQ8vtBo~L?i+B*F92{*`Tt)3i(8;Jap>cBMijBzqzB}<1~sGo+!AW= z2>xU`DI8QbuQ$GSbWKT33`?s zGz)XvbUr3H6dQ?*t3m_pzSgX*Z8);*V2q@7-SHzTa0;1unE@CYcYtW>q!@KF_b3*E zz$Z+W_1m^ph?G@wJ1T`bXj~+{FK^)!s^{3eK1z~cn=Cj^X_zO;1{&0&hyCihZWd33y2)Io3Y*b#mw;hQTr<}%e*atAFGwt<)fDC0H%W}AKeVeS@SXIYnv~E z`2D*}k2ZG@0Kn@j1VHgGLy{m4*l-HcGkMU6g9i;seFsx(M+W*|>3@gf|KjoYr&lkF z@3QJ_9$&&=*fKYLF zp{XOCXgk!RH#>#!mLybT)5c^qd0F;qbGND9EML-?W@fDAcl?*$dyr5nw+h)~wavu*J>P6}ki&rvDrtRmu6!xKIE9>pK7d71Ss$whkr?Cbq^- zHXyt8%U}&_Eqq+!ME4>zcdn+gFJpSlN#8CmBn5)EkX9)=`xXgV`T zT9=|hB=_tr4~eXH%W!3Pw3-}jOQ||3Z#Ss8LVQJwlMu_?j#xTa69`c7)X!o1S8D@p(DFcf-V*phk0Ji9>PY^PtaLI;gfBOf#IGP&P62N+4HKo9;DsM8il^ z;^n7^W6X-6W=HY;(neEC^45v&xXJJZeRc{_=PEY@^ZiE6Q5isvR4Z7GRPP4+tedFk z8-h1NAQWH#mBbYxNB}_$ac2fKmJ3GG-6>EsnD{d`9)56Oblf~6nP+5A2r)ig;gPE! zRzbh%joKV7@QWj~PuvlXZKiq{3gAGT@Bn zMEZh6GPD(#oBVHsp_QhHv8=+!Fah{rJXEh8FFL8P&r0DaSxcK;B4_%%HA7(W~MFpX*O}RM8Z&VMw~=2pqSBbkygv{ zrF|z?CFMrFwRXpyGWm9d9ZbWAo6iI?>TnGr1*>mf+qjc>@bg5d6(Oa-bu!sDmSkO+|JE8 zi={FX+X+8z3)Jy@$6J-@5tGu{l4eU>wZpG{q+jN%8;>VfVc313Nja}nY~TE-5(DQl ziF^pblv;<#cpP28T-!ckKrIQx%gR626SeZl6hs{S0&O!?E0G^*h|=f`K&Dd`UFl?3 zJGf7@aZQzph@hrTL37RPhP?0!MyVCo;ZJ@tfJ34K7qBDVvXWn|<*XciCuuoVM&B(d zEu|TzrK(`GXn5>fjbC~OWoZ}Iv}KRbdfphwALKWpI_^YgG`Ghtu(s(FMB*-hUWZAv zzHARb{y`cgywR@q#;vc%^R4En_Y_XDhs>}u z9)`goK?{hpL};i~*FCsOM_@U_eeLnr(PD5mqMV9p8*0h38F-eEM3-d%``|v~Fju|x zac6rcI%O3&{zmmhI|`ZU3se8o7)llNn$8^?Alqy!pieCIDmpI}T4!{`mODCY)T;i0 z(G!iLLe5(L2k+5RYEv0sp7TOonnG&?#ZdR#E|E#+UUvz7daQi3_zq27up50gQa z5MGbkw3!S{pS6iwy5PD=TLaSY^-woIDh_#*#pj)$LhIV9HpO!o0V-}nz}XSxxW3%C z=e#;Fg>N@1XRqP=d%XGOu{xYirY*hP<#UGk479?ts_M}RQ09=`VNr9&nQ>M+V)+CF zxvPAqxddgzzCkkC3BF#<|d@57)t%pO*AvdkV%tte=BX_1Q)7h{YST zaO`)e3-^4VN20H>MHC1jrK8x11kY|EC3u6~b)mDgc7%h-xiM<0r&pHNcxG0a0?%@y zf)$mP=1x$P?|`rMCOQNqVB1HwP@URpahX&V(U_Lo4>8jh!4<;cYxOj<^+IG?+qsS` z;G>UA%Olo5S&~_k8l@|c1z`gDVR!%-u-`1nzuc`q+mnC0USObA5Xd6^@4m|7hd{0e zXcZ9j5|HXe$K$0aylpK_yfpg^5F-#MqJ#T7rd70^JKha!H<#DEvxoSfLLF& z9?q{yLsYu<_`bPJpV#|sxm-d5guB6XIxWCqqZEfUiB5Eup05&0?Ihk%iV6-&ctY1_ z3WmBCYHS-XAl_pD^CY9?sNi#A<#w^&A!JFn5mUKWXww9vNFaq8P|mDo$)98UFCaQ_Izr<&=H9iJa4%WuE)cY%d?7h8+RH5QnKGy$;z0B?$gpzE9Qk+;9K zvZoUjjkXs5_~kX(LB?-jc*V$Y)A>R1LSyB7gKp&%Ev_q~ zG{Fb$JLwMH+XpDU0C4}j1@>ljTUINXD4(^_jHI7GMcu2X zY(m_nZfMkI;wqOW__cbTZyudr*7S9J=5uqnxx@Mv^DL;bePgg~zN7bgcfGde1NQQ8 z9WtlTQMKiU`TBE@LI3%sucH_Xs@yia&Hd$Q$3_47eo6(tb!43qo^TVz-VXk=KM!jn ze66gn_|Y+0Fj$XfnDe$Hwh+0o(5;xk7h5_aKTI9yH0Rbnljd(=m18l8-zC5@;H|+{ z5riRJ1*r}c&2*>a*xt{=UG_I9jP7bK++ad7%@7 zO=Ev1HbHHPsWO~{;5AE}DhAv#?F(A74WPtCT1W@o*ik-Le?v`tVYha)L;gfrReYL* zhDLP@MaG^Kifz`4v zIKoDX=?%#jo$wrONVz;otR z^0_szT61fAgPkUW-W2WW9E+WgGX^P+SB7?csCl=#;(l$8&F%5NI=!jPjJ_17r60P; zY!X~tAEKPzqc)lGcGo-{mg_CN|4kFUSL5Tes?4wX=4bcHbz?%ZO|;pc8q^wqM3-m2^2xWZC8Lur8Onh0+O_hEN7G`8_*E^w zHW=N~IXZp)`U)7zg$Cldv|%!(`q-jM?lIz=_s+z$>6n;^1^g*NFIw(m1oD8A59e0o zhGgAx!B?X9RB`%lx8M0bo69;bWSP3j^L&7$MQL?tw$>+R3ob?FMYgn{EBH&sV4WK!8Xiz@nfU8q( z&@P(R5$l+sou8<;F>^cEnm%_T05Wg?X#m^z7~jFY0RWES|Fi&i zH2w6+@}s%qFXLCMz64r;qj>2XympOWayM(V!#ahNMRQPX))y%sv5+gXqw2kh58uLk zJmtw39cmfCw2x?&oX$*R>E0MtJY5yAXLlu&R-wWa^2CIMAdwC$o~QM%ye-|J#kL4| zBQ6V94Ra$3eLCjzdb~nE)q|}#1@Q%iduxYe9;jk&STsbpA(=gp4g;Vm=43uAW+#^T znNu86u+7F?`z12nD^+xYY~Ny=#3HeuJDj+%AYM!TN4iOMiG@^iN`qqM`BFNq>4F_e z6k4du5w7|NV#kRwjfrrZtrK0*f8dWU1}92 z>z-GV2pMXYqGI7N5Wf2{;|Y=@>U_F^g*MXvD3z7DyxT=j#Iwif)Gjpu-BLzT5;&{< zrp66Li)*^2%pm~b%%qtjvI$k4*}wCceu=Wj@Xj@?rYm0$3iKdMaV0JZDntJL)O z9ZV`!VD1GlsneM}NGaZpk5)k15i&qlDBB71W7&0SD}I~-F-pku%ZH!APx+2Gcs~KwMMZ;8-KSV2cS-O zny4l~$thee$9GjmwR$QA`OO&E(pw{$6D1^Yc#1`JY>CKf4tp1#A}GNbS5Z&w=}(Y@ ztPTw&ARO+{>*Zu>mzP$Wc5M0h9X?K>RIfA1!6HM89v8?3Y@MR5IKALN=t2`oEa z!Pkls=F?H$IQQ>VB<+)2MJK>R(iU{=nTLz+fuR4)+pRh$quj)%O#u zU(zXzwF`e&uCM&yR>_@B!K}uWGp%CMkXymLd%-mc;oZ<-M5z+qNJ`%Mnl9np>(!;t zKTaG+RZ`QV$N<1%0RRB{`j_YI_{q)M^tWl_NO#F$sUOu#LG1;sw{W|QMk&8YrrcR3 zzI+~IsZ@7xVUL&*y$T!*3P5Ms!SC1|WQ`C?P;0>@)ig)~c9S-AFqQgo+573<630#7 zEj>uelL`C9j`Pg*=$#Punm$7N@=7ZgzN_ZwrmvXPFay*WFYeRRl>Y0*)|OkcfCTU5 z8^O%a49^i=BxCRQprO+`)@72PW6NA=oktTs4LF>$wFzicUm3Q!Z@nAB80yx<4!XbW zn%LMdF))~9dg?v?Yssz{*(H?Rd`gW%! zavCaj+aJmz5^3ZSMT?6GY!h1Qbr1UUYOJrrMI+_Y4K4VaMsSn#A)Yk`dgu+_>pkod zo@|ks)<_b;B3}fexx#6@FzhP7C=uQ#EYgP%355DRlTBlgy=e2BA5xBO&t0*z>5p(+ z?@@$Z4twyLrx6Cb;&~ijZ(9Kn=;4YD1u3;^?KJ5SHQk60$K=A^B z{IzM`o{2N@`7LgTXQNy<83ys8k)5*{yZk+~Cm|GA%m;x;EW=J8{j)on9|2a6l^oPZ zsfgay(Y<>U1ES~nrd=)XvMWtc&^(MO%wKC*L$@pE3}+gdUtPf1S<#kFI@V>y9c&vB zbDCrPIpZ-kZE8nS56i1c^>cv0Y>JG} zEWds*71@iki9tnKd_@3G1mdvfx=p=EvDtzitb!i2;?%xeI%Dm|!JS{L^?ma=KH-+$ zh7DVNP7S`TD#^YPXJ%75O)fa%H!Tf%cQ`ik?V8DWsY%-3{4{zqRh>ks{4gn^zr+!{ z@=;+Q3YgR=%}e5z}{u9s+kG1MKJl<`j`V3%|U*sry9efOWF6I45i0KqKkYGQsYCE=Y$sD&mQ+M6vVV{ z#&HiQ!g9Ccpg)C9@ygX!&XD`$Qy!4ry9~|vo8}BAu@f~D{)~{pX;rYE@epi4g*OJS zuC{4bGhXbgqnw`ddtIHxC&{wx^o#@a$lbDN2N>nZ&ulv`7uGDfU8&;N@6s%4Q$%-& zA)WEx;+e22zeOPqGd05wA?s5Nx>b1LmC*o>e_8peqtBP4U2&BgAgE13&mVp# z@(rl$`f6lJc_^%!+e`{pnuJ($eS~~+i|mw*7L(6>HN^e?TB2@RZT6Gha|$djD1ix= zvv-Yskzt%iYl99UGU^S(K`1pvp?gNR5P!O$GK`IbQ5{ z-62~DHT$ZCk0@UIrpQm}9_L^qd!JSTF>-JW9XlzTiji7C#X!}Y-Tmwskes_jj%+fQQRat)*m>j<9+s6 zsw<>u2fa~MMK#gMVU=vkxz{O&Mn23rrxkm!ySr?#8}aLtDRrt0r5@Fj=x0@=+74c= zf6_0k-s?N_j`Nrj(>$*1mPK;W!i0ScUO0iu*!48}XlBX!*b`vaM#~Ln`!zMP8J8wz zm685qtR+*0`di%OK72aEjD(fs8EUugi~Kxt>&x1_g2{tm(+q;u>Z_ayRQbiH!r^n; zQJ2YGu>yPJ8k=ZUE%v5$sSGR|$=;SLFSmL?^^_co^W(?OswHDp6N#?=f#^xi_s*g0 ze$PKTf-m}`Vbe?6E`GwI+|H0Rt870xeco?={sCKf!r|nF0$<3vrAIni^cYqeMQNPf zH@fiyX@cuyo|qe&h)1+g(u7SSh>S;iZ{8$iA20t1W#0la2By&h^4NX?Zm~s_CUo;{ zxqBJ-5z6Nj2xpjDMdCMFMSSXT&M=9JeoYCA#Bf?gDByIeFn*ZSI=&l$aL4w-K{xcm zL6pBf7CFMdLw+w)vbDHsbwZ{Z>W@hsCa%&Vlvi4$uOZwl&6=gTX~}M8{`^qxP2+-ZHH}}g zpM+of*aq`n>SCR3)s<8EUYsM=4l^i>T1i{Jbk2T;9JL57yF;g8Z9_%k^nRS&ks*-V z9ImrZmrh8|bb0r}NhWqXYKgb)Bdq%8t@?fKhBqNy_S)lW&9wB=s3D{V%a~?XKSUzj zv}-HUx~V=!Ot6V?cH6`?nZdkkq$8L^h~dM@Y$%W68J)^+ZAT}nZ8ApO=!qWRO^z)k z!4on3L_QKdp2Wu^{2q7=L&;yx=xTJVXc2s@_)%3P&9K-Rs{|iRhE3Ei7ueaTf|!$8 z91P0o?hGvek{A>_Bab@2c~=4!$uom8m}UjDBhJQ~XjmK!qftT(p8PWl+Jxp$Svt%A zlVugUF3RR>UkDWD0tsC%g3@G8w+i+BO=vKTyeb5?O9^oz%M40Ml$Dy;!WUq#-3VR- zOexMLQa_uBj$H)~w}}kfhd%4#;Pq5Gp$fQB`vv9Ctf05JIfBu(tR?+&SNnZEtWG2c zr(UEq1Ucf2Hk5F|1gy)SKX>6=b34=E)KKOsh<|D^RTNY<@{CrAG6u$ zwXJfS_rMn5w&Z{PE&!&`E&P%r2a;1jQ3Co8Xdqz&(*Mq^`&HA?UowBI$>bOEx6I$_ z$qtl~JcB7Zh%ac8xNU8`rPbapd+mnp=%E^m-={(5UCi+ojk;L3-@1PPR=aIZ;fWv1 zks{kyuSJ(?T)~&k_h$&9{{@@cq{dAT%jrFHhFZwTf!0bc^YVl$?8i|IWH8Tb2b!7h zkzq>DB<2vT1VhlI?A9qrO4bbNO?ux{Ps?@c?xIkQ*^>J%S>XeWZZ|d-1|}AG(Ad=< z@NJMYEBPaCX4vn=&bnEy>83cw;V#oWzow@8O^B{|^zL#gFhCWG*M)goP4!hOEyU-H z)=oTdbKXb{9jd~UZh9#KGV9QLr~tfv~6*u*Dbni)L0lL>Jrb%E=vxh{hI(H&w~mo+{@6D zrOABxW5Vj}RJORf5A&SM_}$dL$917mG~G+>(1)81TnRIq`txe(y7EHX$nti>iuIiH zD0}F&qGZ6MOVU(xA-W&Fd-x=LTVdeC>xY$=cJHYv(&%ym!;0bD^U$qVq!>kNH?OL^ zLgf9FR`z?}0ToM6gNs40PWhJj?|Kkj=V?z3$8RolrIX;7gD&vfQo{4H2<(im0W8_9 z1a=oI?t)AM3{+2ij^0)~%Vqo)oC02InIdLM2YPj0pGaPvHty+q(@%#7Ytp=cSz~%! zW4)^F`ThrK*S-17YrM=W${Wm;{M6Yid3v6;BTYNcl9=tX3szYAH{N^Iql=T~Y9Tgs zBL~XmGdbKIs{9eGGn?HZ8r;))ZZd5rmS;VziSs@qdHZNn9rEfo+2Wq`ro=Z$$TJDd z4L4&~GK6I}#oRmwkeu^JkuQrU?;hAxsZxydUA4KyXRkAR#soVz~ zzu2g9HF#J$yNgtYvuOL3DnZxVner|JOOCqsULoaJ3z}~%@~%Qdl~oiU3BT4V$e~IV z)4wiI;-e~AQ76Ai^e&xll~>5FCWt4{^{bg+p~&q+M{KFM)ypdPN|}u<+ow*AN-`!< zm)$tI{&w-xbv%g@7a5I&K=%BoMG@NT7)&aR9&v$^i#WGOcAqBc2gf_1IV)?j%(rm{ z(f(^e^m?3mYjR=qrSp_rq`5uAPc$>V?Qmuf8|X`pvb%`gsxc6-HbnIZtyzB0oJF(;LMd9<09p`1U#Bv;V^3FdNS| zULoZ+Tw{-l>CcceC%OcPg+?ZDmXx;$GXu14)YANW72WBTc`b)@S|P znR%?PZ+`kVFY(;Ck|(7_>6L;ct9_D(52uVXvKY0 zaKp%aB6^bffmXoJb{u_ueqnL|-#}(+AYOmC3w1Boq9$ith&F@kbfWH~B5;8F)M3eo zey3D=c*BQh9cmKN*Ur7t@T33deQG^ynfC;TH0HZ|MJ0!?+H5sRVPD?r)?kb*`;nX4 zH*L>GVrExozx*Sqt=-$%NDmY|9S7y2|9k20Vrr!Fr)|T`DJ2CE!4R9Q|K z9Ly*g*>z_mJ58v#i2K3XLq;3#?yyb16GtzV;jnJWtLbp@-oae%2A1FA6C^I8BZyyE zRJ(+J^lZb{Xsj9foyu@rsOtu0-of6B&DIRDOrKc%f!QR#k+G)QKrQk*@{)+=8wo-c zlmvs!mAYv917gE>AL~OuQSFGLxyAj)y)EACYKSccrEj!#fzXNC@bvZf%K^lwx<#Mx zqqQ8vX1~0zlYk{w6qtQ-Rkv_45^pe%otZT5WJW4Z6A_~mJIHH|D!YfIc=kQVVNCjV zXugb(AfKiyQR+QE+`mk}8-zF=9`@KR}fiB+{s6CF+&23fMVG#7PYK+!- zSxXz)wYzg4NAWbqr{npL{qa2eN$v&IAA_Jk=G%YwhZ%_bt7>RuZTf2ycRQggI)f1< z_%!JOss9!cN|-$&xvY65YFY+)<^~MU*k8?>GzKlc;dDOMT3u;o`~w;l!YgC;?yE6D zHd0B{&3Av+X0n0Eco4UTy10S-{=U9lJ01pa3gbf*COH=zkwWQMHo;>q`=U_bB@nJx zW|DIb14K4K4euKMBy3#6uFe$F;EYRS=5yT4^HH@gbGcu=**H_VnTYhqsxXriU6U)5 zXJMeTA%A_YSnkAlKazay3#OK>^^9gWB*0z$xKl;;CksWIzF*?@tZ(rO!bl`8Bzp8a zR@D+O=xlH%p^tX7g5gTqfc%KHxSZkYo4 zP|0mCkZ-H&RSn5j88);%g8#t~eac6YtN=cJ0#9Y2YLtT92*O30q>?{&&dH9M>;fa{ z(e%YLslR&-4TrPg-CKF_E0UZK3+Fz(#KvHyYyY@$2gG~%VhhriV-O`36-1N(u_F{6 zZ0#HwjBOo$Qx*_0^}p;0P}4=m>w=~*^x#vuCuEsZ2_D7VA?z?<1sO3|-g!kmbdk00 z6an^=M@tkUR@2#uC-2_z!4k!kaTX->H+c+q;ao6MFbgHg0uHo``t%@VGQ0k|ZOt4* zVfpob*0wgOz%VgGLqm87OpL}(8cQsA7GiBtG?kw{m<}gUS>x?Z^wy&jh0Y>~SUl}u z+$Pqtj)L@D4qC9mUscG=rKq^2Y8o`@h#-Ry9LD!8SVQ-N#J)APa4Wia$2EOV@GG@g zI@-@u!PiLM`GPl7v?h>CcsXb4HKTYiNCQSDmmjKR2xTeEkQ1}~a)|qeeTV+M>RV^k zw*&B}tgg3Q4Qx}hG<`01)bKQ)vrkM364nFNpq_cy7u);`Sx)Ru2%XCX`9DAOJ=wfR z!$c~1z_L_D(@oEm`=}0qRS+VRf zxus}y7$w=C-7LkTL|@klg(;m9NaYmaDSri67n{buN6^skEQP1Oh@kq`?ypAb9~*|* zZ~h*uQylwa8;4f7Xz5Oj78I6lQuSWIT3H4(dpmfdVZa67e(fIQH>N;n8xK-1p&$N5 z0cUdCQ===s2Bbk<7zKMtjHrIkDBm2)v68pv72{U5mmem}Tf{tLo+u=U^_{HP^(3q9 z_+K$xru`7Biuzv-42jmW^vF`>v(0Rxe!A}2=4r22yrz3Lr=ovC!sT&D^tfYr0!2yx z6+;CBrvv#u|9tZ1Up@0j`VZ%D6lMM@;IHQd{(u4i2_TdIr&9xe2mYNv{RgxL zo%-+azq02300RKe2*1PsFC4nROZq#L>JM2gpzzQ?@~ZwW;_tEOKSZ!%{x0Irk?Fs~ z{~nS018xKSH~fF%Q-25l-KYHnY(f1y_-`KW-zEIr3;RRD9nJ3&{^FDU9sgIm{0ABU tpr8i;{=-WD9sbt|^Uv^M#y`P-pFS03AVKQ!>r4tfpbIp`y0ZMb`#(tr{|*2E literal 0 HcmV?d00001 diff --git a/tests/data/docx/unit_test_lists.docx b/tests/data/docx/unit_test_lists.docx new file mode 100644 index 0000000000000000000000000000000000000000..8032f93c249bffb4e4ad59440e5903e0e52ca59b GIT binary patch literal 15769 zcmeIZWpo_LvMtTW_uJ z@4cnfsqTv0kzLi15mA{tdSuHb(YZw9b|m_&K1!zkq)doXCg$wNKoIl{ z!?<|XKKGG2M{0~OT=s*5ppy#;kXSlS^<*Rd1Y2{`WLEZ5l2njhJyh7C{rfLj$#yAn z@ut(RCU|EtxNy8LMN|VIj?DX)ICno za+18oovfs%l*=ZqCcNT%XiiWFDd-TU)tv-&mp^xoATzhGT}C58RRC4lp9!mW-CUca zbc}KdP!4AV7yFDD)XrxhhRg$)H{O=8HPXJu%yLBX;+H2pL{;Cr>;5=`$4V$)qXZ$)$BtbFk z7p8_9I)Mou^AMao)``4FDSWqA2xCS_F)?dE{4+PxR%zir*^}u<3d7vIxwwWKLTd_3`xtzz=LBlCI$PFQ=~@EUA3^Hh=Lf^-*^LPV05H!20DStm zDNfdQhIEG31`d`VG3<}vwX3lbncIlwfoJwcFzkljz9Ou}0t~`LlR=bFV8ZJSOep{d z1XMiY{<;Vn%dapeqtK9f7DTw{>~g+$x@Rfm^>#`yLFy!-RM>ZA2Tj-cJ(fC`P$ubm zbSEUWjs0^1fN{7?N9O&Pp6*Qb@a$`1GTtUi(2;W#5EdBhP{M;+0m8IhTIrje3d zk=USHkxWq~dGv-c5yf3)TemdP5CHq-fX=M#%PF{tQ7o+BSz-_Hx81CrgHf5_y z1Pgw3j)*GDP!AdiE}S6E3eq$GAt?SHf$y{2gDJVOS?QhkpalPL2@kO=<}}d3qM5iU zUPxA;|9woanYi0l{5gcPJgJnDi-w}hV$q7Z)t=t1rIM#Yr_o{_n^_`j6f|-bF6oz~ z4VfzOx@G)bc3w;$4VO=37?E!A+jh+*i9ah|3#HvGpDfneTP)OxQ`8ztxK<=_Q_HNL z>Jy-xj`y|6sA;|_Cm4nf=&R99+vUPP7MqsXP5G_zh;?dZ-cGC~P1=<3kXWv}ey;Z> z-FGrbGWj-=CFb9$%goYS5D)^@!|NtxhuXCgDVt*)q-jdd%L7;j(UsS4RK59}(^Hjm zdNR*myjAv{aqFg3HcX@Lio2Xt-nxD)d;nQyWp@TG>XFHygRWPs$QEU+ zvpH%4LC^)N^XzK+{VM3OIk&VQGBs-WJxyWS|0VR6SZEcr5&-&DL205rD=EB=C6nC z$gnH*HpEyQQ?FR;49vG>%k!J)``kHeoPYwW8AXWpIGScxw3Nww_Gx8;x`z&dX?f z#QW6MNE{T_NM=gE8TGe9kMtSRa^d1EKCsnz$^`MZ;a_H%<)o`K^E(N#8{w-68fw`JUHSse0qDxwE7 z3?C-sZf}rP@5qk}DBfADpwQz5dE4J&(L2r|8++GLXG@b)y>8arJ0_bsp*_w5No%IF z)tQ$vS{Rho5Mrp&?~Dr&k;N8G{n} zS&XYc*_G(>K{hTxdOr~%AJh$_Ux0zrr_68ptdTc>Q)%faMzC9J@O!N8^bSnel%A(X z0H-+^y*$&X52&{r(K~}pl`av>Gx=7CHY>N2M?Actenu#(N_GeuBL#t@=n*l&EE>{r z_!imzdA~5O$KfL6k&;1Gq4(llaO7}Sa>tYLO$`m zc$E8T`PO1o`!)>AC~4#cIijxJ1C_lGrNSp8`(CqTtcl~MWBi#&JZX{jN$*qf#tUMj zKgfKo##8hBRV`7lL&RPL5Rd_b5WznZOB=+Yg1T699Pk4~$v`FvqQUf;nJ|n5FZEML zUKd+=C4Ow_c}syLN#gkVS$uB%GPwn{Av^eE2p*jOhyf1+=kC?H)DH^8Ur2?%WXvl! z9F)IJ1w)KR556XVYqiIHHQXPYmX$N6o431&$Mgwz#}~QkC(Y8Om+H*Z25OuAXbL%B z;39$r17|OBCj$GAryrLY80-QKju`anujB~q(91ioYjYC>qeNXcx6kr9o7AU?(F`3d zD;Gb;(S1g?4ppCC)NX361dePBbLD3V2YKq!rq#kFqRz`TZeZ8!0H|NWm7#T8CX;j< z?~{NGc7IqQd{ci1L6CbMn52;e1+sae?Xsr;a&L^|4E3ovv;bchY>>nESg{qw)?KOA z+Jrk;Yq6Qkyi4!uvL_U=0Dj~w<*LF`M^B3>n&+h0d-!}B?c4?vQkmsQ807|Q6Lk?Hz?^PGzdb>E!D)DbAouY&jX+}Gdi%OlVJ*w&p4Z+qy*9pCz6;IIb$rC!e~-#P+rx0cKH_yf z2mpW$fCTz8D*spN`p?+>-)St+$J6CU@Bi6HS=`tkPgnlu0quT^ZZuq$LQpaG6h4|7 z_dtejuR^y5dD7|~9wL%RLEtlfkxcp~?E!PH*X5{OKd#$YfNO}s0t;#!tkr?=`tG*C zlF7`>i^~Lnh+G$U@12g>6l1%i&!4(}M(ekFN1CeIo|VGa-Gtyw`X}{s(ol#mTnQ*= z)rho<&yYCHzz$Y2E>)myI^Ggh>0@8;E!CSnLF-u7EtnpPg58UwuwD~ca&f3Eul>Q-2?~TPx)z&b!aoy!3Sg^9Bi$9p(uX-iN!nEQnMqQ_nQYMV8Ebh+FuGyqhx+Hu%XwO ze<)YP>8Cbj+C%u;RsEZB`Hm87D>zA1!YK=U>Z|r#=Qp`4)wS*ht;!i{EN3`L+;18W zk{w$2PvF{qz`nUltj)^S%;r)zn2h;`6VTT4=2mSD{)I*Qq|Hc=;=6t7kWmy^;@*z5 zDD1%cpK>X%r8K$F;{9;<8GK+Mdg*gC9b2@Ei9ss2U5}&c*;fVVi88)NJSrz`gFGZ} zsn(`rDU`@M(|&)r-PrH~dV9JJT9ECi z-f=;D|8+p8^ZGW>QH%~=Zk^TU`gXGCr1SbX^A)CbY?B-YZyV9p1}4Loi#Z;q_KUaZ z$tiIlP_J5un(tA8^_NS1MV3P`7Kz65Tn9PB?Ink$X{!!KE=PWTG-nld?&9iKF>x*rnmoNsa9umH|D)z@dyu9TR#OJ-{XI#<1mWhoO zCnL-nh+xbVrj!mp>d3xT$R(oP8EjsNu!!ED-tv%Ynfn?{OomTHq-?n~ z+C$F9c?zSm1$A+aV?qWVP3j0Brv3@$_HM1T$y*(4O9l%24>*RC8Y6?fZ~gTcg8E;g z!ABAGtE{CO6nbsED0AxL(9$&a3ib=6*W^~oT?Mf-z5>0nB}9pA>zm_pU3ipawEEYm z?d)!`Ql-%vA-`Orv+}Sc_ zl+(J^Ceq*Un}kAfAi?jf)F{-hnt(Vg>G?SY&IX9$0*bi^R_?e zqE3$7$$v~N2NlN#o%*8dhyE=FUhgBzP3n?BG(^ z`-}&PVSZ!?&eFFiNOQ_ktfeauonnQP>SEF>k_Db7UkV?rq*bh;LDz6}s4D6yt6XYN z*jPa6RY$+5)t2~$(dSIdCc{`nF`vccthKCc&xv z=+vQAY?_=9Qfkid%;%+A7H!1ozS-;e)uWNJt}@(AHLn?-)?DxPt$lZ|qXR-!eP3@3 zi+N;BrhJmR0U|)Ma)z9_rA{VW<#^uBPrG5+6dspjQ`nE{)UH9{m<_s4p+Tc)R#T*7 zih7nRbD?5(CBifR)zs`;)79MlaBJ$q*?(UBW3O^^)B^*AVoeywIQ??O5a<1yE)Ya>zZHM)%=O`s0H?3VN3n<+agU@O7q=R_ zz2otHNuRf=bkYwSGSG~PTRPvkuQzZz7~t!fbfk#d#8ax?W)zdQL%qVRTrA! zLg7q;ga+$bjTLa8xO*g)=Bz*a3rF(+b%n=9KAl<{Ig_;rbfP-mhCh56AhE;&uZ@`D zuH$p=7#0XUsx?qHPJ}>5bfJwX%;DJ{PGrT=uj?`{6guLZgN$&5bS%n;K8hhAuOqzm z$O`{Dxo@IbBiHxF$mu3(U>Sz8Oyp7`J{C;3qU)T)RYSWG+g<>Sb(m=g98%D+a%JFR z*Fe?8j}c(=0~SI~0t7kZEdXQ+;b-rJXvOsp^ymz%WKJD+6F(sZy%?e6uEEf?EmaV? z3-=azaUgj@sxQSUdK`U_FSxN8{ICU437^w|t>vZXK!#aERes#~>$)LNm~Fa8fexdh zr9tm?)^Fw`rK`93zW2=mlcso!exn)HJXTIK7WEYg`E2o!*3_^MHXk$R+xli$>vMS) zctG3X)T>tjqp%5)aTvj=kS=(EbpPg^>2j(qx`bZSt-&5U9AzBj1 z+!n)Rk3>N-mkQa{r=`Yk*_y&sv!)LO4>`MxIUEG(N$=9$SL9uK6Y(;S^bRsSneqk_ zzqgZ4)wgt7y)0DWbxJJV7Bw+~&I<``hMvW)LZ)QGd09{@fm|%g)vFc2#C9gsn$RM- zQt^a#N#ul4=pl|MCNqon=rqV|Lw3ze9EhK&O$V&SlX6SsvvV&>e@n_#xv>9kPp>61 zUcy%2gg*ANzPN9DNc+lb$Y7t_h^N*lU;9gBnjG5IzU?OCvBrsyiY{H+&HiJ(rAnZ` zutYcHZShfKF#(qbWL}jTaRgMrukH;S^IOCh?r{mTOV}PBpNSah3^f~ObHET{J_Sk> z*r=nb0-_qt<$T!@W{B3nig~a$#pojf8Lfx4?q%Vo!`u%DpAut3Ra7=pq(LNw61VO$ z&H1H%^#q%r83GP!?(RsWvRbb2A--tLx%ap!%B;^x(qpLhq-`XV~)1|&^(8&T@fL>E<5=$(ZJXd2d0eny=HQ_TLw){ zEzig;`5c&bMeH-TWAUDb3-iRbwzJI@)uO zu2nP&cgKtltN;;0qC93^X+iJWq~_)@td#gES!)JSNEq7B0zHGLIRvz?bdhWF%o2W| z!jqj*f{NKf6HAx6oidK55#BAIJKt`b9ufZ2b6r-7MR3-KKXS$a0FeGWA-8g{)HkyG z@JN3+q|xLRs~uLvz;l=z99W>XFW+m4v5u*0B((AtX+y0-wP%nlKWz#8Ajh-1H+q8f zC~Fs#7>Yd|43XeC+b6Acd0=NsC-6Gx8DQ!1BiRMh|E}M(5Xs`rtVfX(7IJ&Ab3<7V z9spnlOO|J^X058?U4MD#37ShzGz4!-XOD(e4_9$vtY##*seTGrWMe$v>j59O0=tr4MU}`ii5QQTHp%&28*#w z#C7J|ii>}gec&kH&Olk4poAkg2}4}|D?0F1Tr3^tIi{7C;+NP}OI{;hahyzE+9q|7 zBK$Z88UJ9VyB>09*6@p@uRZ=GtR69I?2$hlIR4XY4`EXY2_hd0sW7V^MX%?w++r^! z2affEPTg_B(T?kDy_pJsSws1%m0?d^XMpLvUB=?v$hn#VlDG&eOPUb%&Y)>bc554s zxK1keDCXc*9e^6jYQNl0<>#z+^YYoT!6J~u=td?^SKVGEvXlXMXAdpGoTU#61nJ6l{wsbq$*C!feL<_x-{@>R_OK*EnTfKu8SrHH}6_z zL1#~cAoJNP9_5%&W~Lq%6T{Lves5S5%PDd~W_z2(ZB|3N^~dejoTq@Fj2lyvF10~x zx6KNY1KxEs=VhA7=wG|Qn8mkzKrn3s_Mj2miVPcav9!FRBCn4e@xgwUG9(ujip;K?U27pN9eIAPoNSHjD_oBa@>3F&Jj5m zgGXDK=k^m0T(vm>o9wbjYAneuEmAujJ17LoNUU&BF$Ct2@D8&TVLaWh@L9fBO3Z|9 zxpVQnXOx1=?-l&YRE<$liv?;f0F-W@i*J>Q z544V<^GfR*Y|O{ z`^N#Qr)S?1#4n}O%aU3pFjHzvzijkoPxkWHb1F-*nJk`rt0EH1!qkD}ZqQFvYbQ$_ zZ@b!QnLALHr4(Xo@)LQz7GRGioo_7rWBaB$f z&|R<)CAz@0NpreypEhHXHZ^oxUxJNMwmGuQnc%1y(_7h^Z_Vv`j<3JpOzm~0>=*1d zIkc}Xit-!Ipnrd}F_SB3NoFuSGpy*;;C;`7kuhpZpjty6=>Nr?om5NAj(<41<$M6( z2}2h`Lg$HN12{K#x)l z3^6FZG78XL}HW#nJ-Ekj-1Xn1N7PfVmM{9?Q3deKW$M=>!I#Oe~d_=NyeKQVyXQ{ z1bzAfF=)wT1cbRlBSj$W9Y7?Yv-d!jJ^ZVmEk5-zyPkJXN<6!sT#xPX7dU*?T>MB; z5x!kKGwq&XUvAaSP3TDYJerXuU!yQi!U4F8Mt?W$;YaPK1H7{xVxynp_)rK}{>aYI zs&91r3UBgwk8#ViLHK;ZKCi^HsKjp?ye7xw6T1sHtSmZXY_|ubA=jgB+$Je_flfJT zIps#=sUBV!CR_X`Tht?TKciXBGzSqP++L_FE!&~}Y;5o~+&9gr9leaE`ZmR_hu^5# z!oz45YQAEv5Tb3MuQ-27_jAi?&aVh~ex(gYvr8wf%{)1oZ{d;BXBR)>3| zbPq8q!Lh!LqcN+@BcnSWI8f9#zHoHCPA{E{2dREP^XE!7%9CU`&zi`-gQ+3mOB|!F z7P72LLqudZ15%UspUlC#l?!@vjg0S3K&;HjtA-t$UqtP!8{x8>qkP$8(bOz!$C8iB zt4npVF)>d|voG5*IoDN$;yhTs*Hi||+|Nb(x(o$&vm^Rwzv8lj$T@AP&Ib2a2g5}e zOd(+o5TY_iia?oShvnwlQhSE?S2t1slgYZLq0gs#7+&5vuC5J0vaPNKfqpb%fR6@f z1J23G-Kl2{XMD}ZNzD4=R2ODcX_ERn1i?@C^YR*(*j?LLds$jpx7=z~-Qt@>a9P|N zXGunJ;LkWqqdtvnsS~sJYq&YWzdX4ojEoOR?5%OvM|Sg^23%ee#D+rMs3Add33P`= zo(xkyp=ZfFrWbPGapW7!_=xI_wmG+@e|p#tU5nGZWXI2J2aX{okB1+de-sQQCbs!S zay^shwSXjwhQ}MI_TChWzdpwq%KmBI1QY$M>Z?5n*bEv%V4n5BPL*^7Y}qZ<-MBe(FCs@tJVThvXiA;V^tm)*@0F(vsZ%=H*pSgQkmO=> z!j6OLC@1mPhG7pHvDtOU60bb+BjQJ=kvU(Z?BN7f{ARpgVNw{avKDi0{0*OA3^3Q%+th04uMU+F&(C>1 zZq8y8zA)|eP6D(^Tr#PL=%q<6tUIokHq1DkDPlJtQcP=;g!c%*9C47a4Ve{?5D7wz zj6Vkv56A`F%f4|-sbWt4SS!%f;fc2=?4{7CxXJP3*C3?j4Sf)*!c=g6*Eb_S7F5h> zCITu=fctrSf^c?^;E;tJmB)B9!l`mARyV6O|J~*_2@30@fee+lb&h_MqF;n-eDhoL z&c!@&=&hly%@lO#&&9Oz5#w2Z8_MdE$|26EBx;7D1`HWj0f^JOiPh%_iCGDWk@ytF)$7FXFCClk5GLzF&!lQhq0D7&QEcUtW_)-Wy|E`V$Ct3?y z+$8ICbjPoJOhh3Wmc)cG>++ZI-QEd z@U#ixpwp)g6HqXP2*)&1dlwA}q~owgD14&`PC`wKZ|sz@UI$Dy6%y3Lo}Uzj)lf*F zQ*=4eB{?=?pw=-(BCz*o^tP*UzSnniJZ(AUV7liz;fJp4KYbjG;mq2`g~` zhej_gZY^<+(xvArFPFsPx;8g|`Y6yS4R^ifCVT3W%<@a&=q2^I)AYVbzOBJe%Sc6a z)}~E~G;~YxzLpygmwG_Wj5L$uv(ih-h`Sl8fCP3jaG>VWuXq1}XTmI0ewt@k7w1NTTf4oey1^an=I?j z>U4V_^!ANf?kK!7i$!l##!LMyHfZOVgoZL^0E+xbk()&&_7CD)~!zHMK zc?wC+TTbt+e=27hcq(V5D3qdC?1)~110=;FY?Fi8+4vPMJH0sYL(K`J_)&C^LT#+Ygz?ERn1xouJg4~pXbE~p=8yOJ zm`OBCf9R3^(34ME^6?#zKa>qg{%#MP$l&aDcPM%#ky3op7oX_mKkix$PCtT&7I zuOK|%D;A|`)w^D%^GC)srJ%7R^|c(v)hR_Nt8r8WAop84s=4m)5cyX^6A)(H5eOny zizIk?3%b-M?JC9da?QGjh))JAi38WnFn;>?TU$#*Q%hXPtjbR~mI&#UykU29tdAlW zJxsSWGi;O4*D3A=$;m!b!fS4Q`y8@#;Dw@fA)e+l1C{bivDxFbQ%{`icVZ*Qo4eyq z#%Ze^m!geuBflbS?5>p_McH*qT2B_n9t?Bbg{SnN~5+{@xgIDefPQoZLOu{QykX39 z+dV<5oU^$uQf+5u7ro5!i(X^7hsZM>GRk*ZqVBXt1b6TVb8(CfcM~^Kcx89RoLstK z?29MiZ_8(7Pb|^d?UsG&PzfX#fk(>sa<>^rk-R6BObjevAN@|>EEPE#+{_$Zg(^dt zG<-_sA?j_6xK}YtPPz`>!K9hZ-RprzREr071YW~f08JseP1HS`6O>nnFt#1 zSvuP)Bb!x&8;h&u^K*)cG-m(>uBGB$JG0y)X+FB_kTN+U!GKWt%huU#)zvTO$pmsN z1Y|JMD!j!`v6=KQL`oKMqVMc}3(Yfz^~*N7Ygs%v+jUpZ7%k)J&T_t;auCgFOnWRhLu zGZ`bK(eZDSmTU48ECEw3RhNc?%Aa>`o5kyf)ZSJ+15HgtZ>Sxt>3EcdtBPAdp$*HxtztF5Za9V9(Hbs+J#b*_n@LDwKBIQ-j5Z*iz-XJfyd%vkTX4(*%|bgck>F@oij;7 zO4vSs!=q@N*4Y%v=3LshFgKzke|-nkqK+q&q9-jb|prO`kw zxSK+6QlRS&ana7!gT>kyu1tqO^oh|huaW*|jjmGoZTK}lRTUv@HMkg^)Qz%m>JwZ; zx0l7SkFZ8q(Zcd!cpT3 zP>Bg!7ZNMF4#G3XPjj!jl^U}aLY#7aHK zZ+1=RY~NJAK*+6V_de{aieW0a4nR|ul;AnNh;i?Ql2d*vsepXNOiyoYlYX|`J2Bgt z467puk&zP(!vD5)lh^h|2&=n7OO__j8&hKvwTIKZv_m)GZQTI5^SYKgyla2&F^2SI zf=AQ+KmHyrW+?Qw9~}2vsDG?18-J9dDC+5382w4)P0+ATqet=HAUoy5U&o>)P+=*a zFG#qn396gP#4+$?u^_@DBYyWBAkE9mQ%WvqqbBddes*8P8`!`V74Ja_hg3j zh>|E#pk;%39~hY#o4xNuOo3N{Fq);lo<@xwm zfx;~g#(yYuru#=!ey~D>)E|nQ_%3&?gV>#jB)P_XEdnArBm;FSSHA;ntG$t z`PWHnJYldiY#{VJU^w5KV6&$ z-yiYkOKdJ<#lfh%D{v^#JlVU6pe?7QyBRXYwt5jx_*0SLMj3zk*y4-_Z}w$}6ohYM z!);VIH|z%POCLm*(0d43Gpv@C>N8)uv3O{VKNA+ef^jt$aq#wBun@4oHf!75@ zMxfJvcHU6k^HA{}3{yWr#tjmhjoD=DSGH3NYnjoYS+;%t?9$6I5G5;HDT+(am=bx$ zC2EX^VVfjB*0et8w`@LM>u?nr0fm{&K{YQrJrXEKF_ZM&(0yY1Sc|yFYipaVz3;RG zz4fOSXI2<#?A*o%?by`{j0@Fn>M7DK@>M?bubOc7)`8XD1QCxL3Ya2xemaEe40G;` z@7&*Ec7zb>0@NnDZE#yrgKF@D-lqntfkrky+`+#)41pt=0Q(PzarR-PpFSvUA3QcW zJ8K(zIsy1XCLjn>WV#3y1f&EaU#D zagcV?y*NK*{Y-1Owf5*dzB1DwZ8AWJ0I!=tc^fYbA__d26EA69uxrVJfG55g{cY%4 zp(j4QGe*tPD+xL-c)EK!5uBRR#z$qE_LH$>eK2YEYzPg?DjavVhrQOaZ?5cVASSJ& zI|#F}#TOWUS`IsPn81Rs#3m9HoDx49T!`?&0 z9+)fZp9H_8Q%A zgL3*1#7UgC9YOmoAbP<@L(?8kReQrAkF{MxsOz06se#0_}2AW zrE9yj>m4PR97ExZR|d{`>tKP^?y#^w)4`oD(yaoawQ zbcGR_55i?D(ItZX*{m5weo4J(5$blQlsUA?$T~4vq~#L$RELR>Dde`O4A;-~%adN4 z+o{wO9HH6qitD4iL8jVi3d>`Mb~@|vL>IKUD#_0#wi$Tz`zkLtpqakcpJV%zWBenI zTP21nKZn(HDfdFv(Kt||tH(eFrZ#;Pn>EO^*O?-xq4xQ1i3yg zSE(`+Mr4%teQ~!1vhd$pcFWBdkEiKaU~D`riRbae7;kpb-67(aI+1G!%YbhMum;)` z(yV#Gm(dNXB03 zYdVCUk(z-XOFe2ba@m{ + +- First item in unordered list +- Second item in unordered list + +1. First item in ordered list +2. Second item in ordered list \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_02.html.itxt b/tests/data/groundtruth/docling_v2/example_02.html.itxt new file mode 100644 index 00000000..93f0352e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_02.html.itxt @@ -0,0 +1,11 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Introduction + item-2 at level 2: paragraph: This is the first paragraph of the introduction. + item-3 at level 2: section_header: Background + item-4 at level 3: paragraph: Some background information here. + item-5 at level 3: list: group list + item-6 at level 4: list_item: First item in unordered list + item-7 at level 4: list_item: Second item in unordered list + item-8 at level 3: ordered_list: group ordered list + item-9 at level 4: list_item: First item in ordered list + item-10 at level 4: list_item: Second item in ordered list \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_02.html.json b/tests/data/groundtruth/docling_v2/example_02.html.json new file mode 100644 index 00000000..0dbff9f3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_02.html.json @@ -0,0 +1,180 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "example_02", + "origin": { + "mimetype": "text/html", + "binary_hash": 17361433184833793580, + "filename": "example_02.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + } + ], + "name": "ordered list", + "label": "ordered_list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "label": "title", + "prov": [], + "orig": "Introduction", + "text": "Introduction" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is the first paragraph of the introduction.", + "text": "This is the first paragraph of the introduction." + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + } + ], + "label": "section_header", + "prov": [], + "orig": "Background", + "text": "Background", + "level": 2 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Some background information here.", + "text": "Some background information here." + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "First item in unordered list", + "text": "First item in unordered list", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Second item in unordered list", + "text": "Second item in unordered list", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "First item in ordered list", + "text": "First item in ordered list", + "enumerated": true, + "marker": "1." + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Second item in ordered list", + "text": "Second item in ordered list", + "enumerated": true, + "marker": "2." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_02.html.md b/tests/data/groundtruth/docling_v2/example_02.html.md new file mode 100644 index 00000000..2b0a2ceb --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_02.html.md @@ -0,0 +1,13 @@ +# Introduction + +This is the first paragraph of the introduction. + +## Background + +Some background information here. + +- First item in unordered list +- Second item in unordered list + +1. First item in ordered list +2. Second item in ordered list \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_03.html.itxt b/tests/data/groundtruth/docling_v2/example_03.html.itxt new file mode 100644 index 00000000..5fce9389 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_03.html.itxt @@ -0,0 +1,20 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Example Document + item-2 at level 2: section_header: Introduction + item-3 at level 3: paragraph: This is the first paragraph of the introduction. + item-4 at level 2: section_header: Background + item-5 at level 3: paragraph: Some background information here. + item-6 at level 3: list: group list + item-7 at level 4: list_item: First item in unordered list + item-8 at level 5: list: group list + item-9 at level 6: list_item: Nested item 1 + item-10 at level 6: list_item: Nested item 2 + item-11 at level 4: list_item: Second item in unordered list + item-12 at level 3: ordered_list: group ordered list + item-13 at level 4: list_item: First item in ordered list + item-14 at level 5: ordered_list: group ordered list + item-15 at level 6: list_item: Nested ordered item 1 + item-16 at level 6: list_item: Nested ordered item 2 + item-17 at level 4: list_item: Second item in ordered list + item-18 at level 2: section_header: Data Table + item-19 at level 3: table with [4x3] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json new file mode 100644 index 00000000..206048da --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_03.html.json @@ -0,0 +1,624 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "example_03", + "origin": { + "mimetype": "text/html", + "binary_hash": 17768514429310008971, + "filename": "example_03.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/3" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/8" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/5" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/3" + }, + "children": [ + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/12" + } + ], + "name": "ordered list", + "label": "ordered_list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/texts/9" + }, + "children": [ + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "name": "ordered list", + "label": "ordered_list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/13" + } + ], + "label": "title", + "prov": [], + "orig": "Example Document", + "text": "Example Document" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/2" + } + ], + "label": "section_header", + "prov": [], + "orig": "Introduction", + "text": "Introduction", + "level": 2 + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is the first paragraph of the introduction.", + "text": "This is the first paragraph of the introduction." + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/2" + } + ], + "label": "section_header", + "prov": [], + "orig": "Background", + "text": "Background", + "level": 2 + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/3" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Some background information here.", + "text": "Some background information here." + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/groups/1" + } + ], + "label": "list_item", + "prov": [], + "orig": "First item in unordered list", + "text": "First item in unordered list", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Nested item 1", + "text": "Nested item 1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Nested item 2", + "text": "Nested item 2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Second item in unordered list", + "text": "Second item in unordered list", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/3" + } + ], + "label": "list_item", + "prov": [], + "orig": "First item in ordered list", + "text": "First item in ordered list", + "enumerated": true, + "marker": "1" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Nested ordered item 1", + "text": "Nested ordered item 1", + "enumerated": true, + "marker": "1." + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Nested ordered item 2", + "text": "Nested ordered item 2", + "enumerated": true, + "marker": "2." + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Second item in ordered list", + "text": "Second item in ordered list", + "enumerated": true, + "marker": "2." + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "label": "section_header", + "prov": [], + "orig": "Data Table", + "text": "Data Table", + "level": 2 + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/13" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 2, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 2, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 2, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 4, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 2, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 2, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 2, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_03.html.md b/tests/data/groundtruth/docling_v2/example_03.html.md new file mode 100644 index 00000000..13fbb30c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_03.html.md @@ -0,0 +1,27 @@ +# Example Document + +## Introduction + +This is the first paragraph of the introduction. + +## Background + +Some background information here. + +- First item in unordered list + - Nested item 1 + - Nested item 2 +- Second item in unordered list + +1 First item in ordered list + 1. Nested ordered item 1 + 2. Nested ordered item 2 +2. Second item in ordered list + +## Data Table + +| Header 1 | Header 2 | Header 3 | +|--------------|--------------|--------------| +| Row 1, Col 1 | Row 1, Col 2 | Row 1, Col 3 | +| Row 2, Col 1 | Row 2, Col 2 | Row 2, Col 3 | +| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_04.html.itxt b/tests/data/groundtruth/docling_v2/example_04.html.itxt new file mode 100644 index 00000000..4124360d --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_04.html.itxt @@ -0,0 +1,3 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Data Table with Rowspan and Colspan + item-2 at level 2: table with [4x3] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_04.html.json b/tests/data/groundtruth/docling_v2/example_04.html.json new file mode 100644 index 00000000..c7d6af05 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_04.html.json @@ -0,0 +1,329 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "example_04", + "origin": { + "mimetype": "text/html", + "binary_hash": 2846345769602286603, + "filename": "example_04.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "label": "title", + "prov": [], + "orig": "Data Table with Rowspan and Colspan", + "text": "Data Table with Rowspan and Colspan" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 4, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Header 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 1, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 1, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 1 & 2, Col 1 (rowspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Row 2, Col 2 & 3 (colspan)", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Row 3, Col 1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Row 3, Col 2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Row 3, Col 3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_04.html.md b/tests/data/groundtruth/docling_v2/example_04.html.md new file mode 100644 index 00000000..e620a999 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_04.html.md @@ -0,0 +1,7 @@ +# Data Table with Rowspan and Colspan + +| Header 1 | Header 2 & 3 (colspan) | Header 2 & 3 (colspan) | +|----------------------------|----------------------------|----------------------------| +| Row 1 & 2, Col 1 (rowspan) | Row 1, Col 2 | Row 1, Col 3 | +| Row 1 & 2, Col 1 (rowspan) | Row 2, Col 2 & 3 (colspan) | Row 2, Col 2 & 3 (colspan) | +| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.itxt b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.itxt new file mode 100644 index 00000000..2513a58d --- /dev/null +++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.itxt @@ -0,0 +1,10 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Lorem ipsum dolor sit amet, cons ... quam non, sodales sem. Nulla facilisi. + item-2 at level 1: paragraph: + item-3 at level 1: paragraph: Duis condimentum dui eget ullamc ... cus tempor, et tristique ante aliquet. + item-4 at level 1: paragraph: + item-5 at level 1: paragraph: Maecenas id neque pharetra, elei ... ulla faucibus eu. Donec ut nisl metus. + item-6 at level 1: paragraph: + item-7 at level 1: paragraph: Duis ac tellus sed turpis feugia ... pellentesque rhoncus, blandit eu nisl. + item-8 at level 1: paragraph: + item-9 at level 1: paragraph: Nunc vehicula mattis erat ac con ... udin, vehicula turpis eu, tempus nibh. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json new file mode 100644 index 00000000..02bd2123 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json @@ -0,0 +1,156 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "lorem_ipsum", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 14540608742338341240, + "filename": "lorem_ipsum.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi.", + "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi." + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet.", + "text": "Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet." + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus.", + "text": "Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus." + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl.", + "text": "Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl." + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh.", + "text": "Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh." + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.md b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.md new file mode 100644 index 00000000..25a7c30c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.md @@ -0,0 +1,9 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin elit mi, fermentum vitae dolor facilisis, porttitor mollis quam. Cras quam massa, venenatis faucibus libero vel, euismod sollicitudin ipsum. Aliquam semper sapien leo, ac ultrices nibh mollis congue. Cras luctus ultrices est, ut scelerisque eros euismod ut. Curabitur ac tincidunt felis, non scelerisque lectus. Praesent sollicitudin vulputate est id consequat. Vestibulum pharetra ligula sit amet varius porttitor. Sed eros diam, gravida non varius at, scelerisque in libero. Ut auctor finibus mauris sit amet ornare. Sed facilisis leo at urna rhoncus, in facilisis arcu eleifend. Sed tincidunt lacinia fermentum. Cras non purus fringilla, semper quam non, sodales sem. Nulla facilisi. + +Duis condimentum dui eget ullamcorper maximus. Nulla tortor lectus, hendrerit at diam fermentum, euismod ornare orci. Integer ac mauris sed augue ultricies pellentesque. Etiam condimentum turpis a risus dictum, sed tempor arcu vestibulum. Quisque at venenatis tellus. Morbi id lobortis elit. In gravida metus at ornare suscipit. Donec euismod nibh sit amet commodo porttitor. Integer commodo sit amet nisi vel accumsan. Donec lacinia posuere porta. Pellentesque vulputate porta risus, vel consectetur nisl gravida sit amet. Nam scelerisque enim sodales lacus tempor, et tristique ante aliquet. + +Maecenas id neque pharetra, eleifend lectus a, vehicula sapien. Aliquam erat volutpat. Ut arcu erat, blandit id elementum at, aliquet pretium mauris. Nulla at semper orci. Nunc sed maximus metus. Duis eget tristique arcu. Phasellus fringilla augue est, ut bibendum est bibendum vitae. Nam et urna interdum, egestas velit a, consectetur metus. Pellentesque facilisis vehicula orci, eu posuere justo imperdiet non. Vestibulum tincidunt orci ac lorem consequat semper. Fusce semper sollicitudin orci, id lacinia nulla faucibus eu. Donec ut nisl metus. + +Duis ac tellus sed turpis feugiat aliquam sed vel justo. Fusce sit amet volutpat massa. Duis tristique finibus metus quis tincidunt. Etiam dapibus fringilla diam at pharetra. Vivamus dolor est, hendrerit ac ligula nec, pharetra lacinia sapien. Phasellus at malesuada orci. Maecenas est justo, mollis non ultrices ut, sagittis commodo odio. Integer viverra mauris pellentesque bibendum vestibulum. Sed eu felis mattis, efficitur justo non, finibus lorem. Phasellus viverra diam et sapien imperdiet interdum. Cras a convallis libero. Integer maximus dui vel lorem hendrerit, sit amet convallis ligula lobortis. Duis eu lacus elementum, scelerisque nunc eget, dignissim libero. Suspendisse mi quam, vehicula sit amet pellentesque rhoncus, blandit eu nisl. + +Nunc vehicula mattis erat ac consectetur. Etiam pharetra mauris ut tempor pellentesque. Sed vel libero vitae ante tempus sagittis vel sit amet dolor. Etiam faucibus viverra sodales. Pellentesque ullamcorper magna libero, non malesuada dui bibendum quis. Donec sed dolor non sem luctus volutpat. Morbi vel diam ut urna euismod gravida a id lectus. Vestibulum vel mauris eu tellus hendrerit dapibus. Etiam scelerisque lacus vel ante ultricies vulputate. In ullamcorper malesuada justo, vel scelerisque nisl lacinia at. Donec sodales interdum ipsum, ac bibendum ipsum pharetra interdum. Vivamus condimentum ac ante vel aliquam. Ut consectetur eu nibh nec gravida. Vestibulum accumsan, purus at mollis rutrum, sapien tortor accumsan purus, vitae fermentum urna mauris ut lacus. Fusce vitae leo sollicitudin, vehicula turpis eu, tempus nibh. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_01.html.itxt b/tests/data/groundtruth/docling_v2/unit_test_01.html.itxt new file mode 100644 index 00000000..c87bb5ff --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_01.html.itxt @@ -0,0 +1,9 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Title + item-2 at level 2: section_header: section-1 + item-3 at level 3: section_header: section-1.1 + item-4 at level 2: section_header: section-2 + item-5 at level 3: section: group header-3 + item-6 at level 4: section_header: section-2.0.1 + item-7 at level 3: section_header: section-2.2 + item-8 at level 3: section_header: section-2.3 \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_01.html.json b/tests/data/groundtruth/docling_v2/unit_test_01.html.json new file mode 100644 index 00000000..fa126177 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_01.html.json @@ -0,0 +1,151 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "unit_test_01", + "origin": { + "mimetype": "text/html", + "binary_hash": 11574357959810932112, + "filename": "unit_test_01.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/3" + }, + "children": [ + { + "$ref": "#/texts/4" + } + ], + "name": "header-3", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/3" + } + ], + "label": "title", + "prov": [], + "orig": "Title", + "text": "Title" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/2" + } + ], + "label": "section_header", + "prov": [], + "orig": "section-1", + "text": "section-1", + "level": 2 + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "section-1.1", + "text": "section-1.1", + "level": 3 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + } + ], + "label": "section_header", + "prov": [], + "orig": "section-2", + "text": "section-2", + "level": 2 + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "section-2.0.1", + "text": "section-2.0.1", + "level": 4 + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/3" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "section-2.2", + "text": "section-2.2", + "level": 3 + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/3" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "section-2.3", + "text": "section-2.3", + "level": 3 + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_01.html.md b/tests/data/groundtruth/docling_v2/unit_test_01.html.md new file mode 100644 index 00000000..38cbbbcf --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_01.html.md @@ -0,0 +1,13 @@ +# Title + +## section-1 + +### section-1.1 + +## section-2 + +#### section-2.0.1 + +### section-2.2 + +### section-2.3 \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.itxt new file mode 100644 index 00000000..7b6b7543 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.itxt @@ -0,0 +1,48 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Test Document + item-2 at level 2: paragraph: + item-3 at level 2: section_header: Section 1 + item-4 at level 3: paragraph: + item-5 at level 3: paragraph: Paragraph 1.1 + item-6 at level 3: paragraph: + item-7 at level 3: paragraph: Paragraph 1.2 + item-8 at level 3: paragraph: + item-9 at level 3: section_header: Section 1.1 + item-10 at level 4: paragraph: + item-11 at level 4: paragraph: Paragraph 1.1.1 + item-12 at level 4: paragraph: + item-13 at level 4: paragraph: Paragraph 1.1.2 + item-14 at level 4: paragraph: + item-15 at level 3: section_header: Section 1.2 + item-16 at level 4: paragraph: + item-17 at level 4: paragraph: Paragraph 1.1.1 + item-18 at level 4: paragraph: + item-19 at level 4: paragraph: Paragraph 1.1.2 + item-20 at level 4: paragraph: + item-21 at level 4: section_header: Section 1.2.3 + item-22 at level 5: paragraph: + item-23 at level 5: paragraph: Paragraph 1.2.3.1 + item-24 at level 5: paragraph: + item-25 at level 5: paragraph: Paragraph 1.2.3.1 + item-26 at level 5: paragraph: + item-27 at level 5: paragraph: + item-28 at level 2: section_header: Section 2 + item-29 at level 3: paragraph: + item-30 at level 3: paragraph: Paragraph 2.1 + item-31 at level 3: paragraph: + item-32 at level 3: paragraph: Paragraph 2.2 + item-33 at level 3: paragraph: + item-34 at level 3: section: group header-2 + item-35 at level 4: section_header: Section 2.1.1 + item-36 at level 5: paragraph: + item-37 at level 5: paragraph: Paragraph 2.1.1.1 + item-38 at level 5: paragraph: + item-39 at level 5: paragraph: Paragraph 2.1.1.1 + item-40 at level 5: paragraph: + item-41 at level 3: section_header: Section 2.1 + item-42 at level 4: paragraph: + item-43 at level 4: paragraph: Paragraph 2.1.1 + item-44 at level 4: paragraph: + item-45 at level 4: paragraph: Paragraph 2.1.2 + item-46 at level 4: paragraph: + item-47 at level 4: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json new file mode 100644 index 00000000..c76d241a --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json @@ -0,0 +1,703 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "unit_test_headers", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 15606343257915737103, + "filename": "unit_test_headers.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/27" + }, + "children": [ + { + "$ref": "#/texts/33" + } + ], + "name": "header-2", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/27" + } + ], + "label": "title", + "prov": [], + "orig": "Test Document", + "text": "Test Document" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/14" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1", + "text": "Section 1", + "level": 1 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1", + "text": "Paragraph 1.1" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2", + "text": "Paragraph 1.2" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.1", + "text": "Section 1.1", + "level": 2 + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.1", + "text": "Paragraph 1.1.1" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.2", + "text": "Paragraph 1.1.2" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.2", + "text": "Section 1.2", + "level": 2 + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.1", + "text": "Paragraph 1.1.1" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.2", + "text": "Paragraph 1.1.2" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/texts/14" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.2.3", + "text": "Section 1.2.3", + "level": 3 + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2.3.1", + "text": "Paragraph 1.2.3.1" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2.3.1", + "text": "Paragraph 1.2.3.1" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/39" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2", + "text": "Section 2", + "level": 1 + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/texts/27" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/texts/27" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1", + "text": "Paragraph 2.1" + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/texts/27" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/texts/27" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.2", + "text": "Paragraph 2.2" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/texts/27" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2.1.1", + "text": "Section 2.1.1", + "level": 3 + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1.1", + "text": "Paragraph 2.1.1.1" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1.1", + "text": "Paragraph 2.1.1.1" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/texts/27" + }, + "children": [ + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2.1", + "text": "Section 2.1", + "level": 2 + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1", + "text": "Paragraph 2.1.1" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.2", + "text": "Paragraph 2.1.2" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.md b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.md new file mode 100644 index 00000000..d4c8accd --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.md @@ -0,0 +1,43 @@ +# Test Document + +## Section 1 + +Paragraph 1.1 + +Paragraph 1.2 + +### Section 1.1 + +Paragraph 1.1.1 + +Paragraph 1.1.2 + +### Section 1.2 + +Paragraph 1.1.1 + +Paragraph 1.1.2 + +#### Section 1.2.3 + +Paragraph 1.2.3.1 + +Paragraph 1.2.3.1 + +## Section 2 + +Paragraph 2.1 + +Paragraph 2.2 + +#### Section 2.1.1 + +Paragraph 2.1.1.1 + +Paragraph 2.1.1.1 + +### Section 2.1 + +Paragraph 2.1.1 + +Paragraph 2.1.2 \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.itxt new file mode 100644 index 00000000..edf6335c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.itxt @@ -0,0 +1,61 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section_header: Test Document + item-3 at level 3: paragraph: + item-4 at level 3: paragraph: + item-5 at level 3: paragraph: Paragraph 2.1.1 + item-6 at level 3: paragraph: + item-7 at level 3: paragraph: Paragraph 2.1.2 + item-8 at level 3: paragraph: + item-9 at level 3: section: group header-2 + item-10 at level 4: section_header: Test 1: + item-11 at level 5: list: group list + item-12 at level 6: list_item: List item 1 + item-13 at level 6: list_item: List item 2 + item-14 at level 6: list_item: List item 3 + item-15 at level 5: paragraph: + item-16 at level 4: section_header: Test 2: + item-17 at level 5: list: group list + item-18 at level 6: list_item: List item a + item-19 at level 6: list_item: List item b + item-20 at level 6: list_item: List item c + item-21 at level 5: paragraph: + item-22 at level 4: section_header: Test 3: + item-23 at level 5: list: group list + item-24 at level 6: list_item: List item 1 + item-25 at level 6: list_item: List item 2 + item-26 at level 6: list: group list + item-27 at level 7: list_item: List item 1.1 + item-28 at level 7: list_item: List item 1.2 + item-29 at level 7: list_item: List item 1.3 + item-30 at level 6: list_item: List item 3 + item-31 at level 5: paragraph: + item-32 at level 4: section_header: Test 4: + item-33 at level 5: list: group list + item-34 at level 6: list_item: List item 1 + item-35 at level 6: list: group list + item-36 at level 7: list_item: List item 1.1 + item-37 at level 6: list_item: List item 2 + item-38 at level 5: paragraph: + item-39 at level 4: section_header: Test 5: + item-40 at level 5: list: group list + item-41 at level 6: list_item: List item 1 + item-42 at level 6: list: group list + item-43 at level 7: list_item: List item 1.1 + item-44 at level 7: list: group list + item-45 at level 8: list_item: List item 1.1.1 + item-46 at level 6: list_item: List item 3 + item-47 at level 5: paragraph: + item-48 at level 4: section_header: Test 6: + item-49 at level 5: list: group list + item-50 at level 6: list_item: List item 1 + item-51 at level 6: list_item: List item 2 + item-52 at level 6: list: group list + item-53 at level 7: list_item: List item 1.1 + item-54 at level 7: list_item: List item 1.2 + item-55 at level 7: list: group list + item-56 at level 8: list_item: List item 1.2.1 + item-57 at level 6: list_item: List item 3 + item-58 at level 5: paragraph: + item-59 at level 5: paragraph: + item-60 at level 5: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json new file mode 100644 index 00000000..1410586c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json @@ -0,0 +1,921 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "unit_test_lists", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 13601004233111293776, + "filename": "unit_test_lists.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/texts/36" + } + ], + "name": "header-2", + "label": "section" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/7" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/texts/12" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/texts/17" + }, + "children": [ + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/texts/23" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/texts/25" + }, + "children": [ + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/groups/7" + }, + { + "$ref": "#/texts/28" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/groups/6" + }, + "children": [ + { + "$ref": "#/texts/27" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/texts/30" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/groups/9" + }, + { + "$ref": "#/texts/34" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/groups/8" + }, + "children": [ + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/groups/10" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/texts/33" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/texts/36" + }, + "children": [ + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + }, + { + "$ref": "#/groups/12" + }, + { + "$ref": "#/texts/42" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/12", + "parent": { + "$ref": "#/groups/11" + }, + "children": [ + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/groups/13" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/13", + "parent": { + "$ref": "#/groups/12" + }, + "children": [ + { + "$ref": "#/texts/41" + } + ], + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/groups/1" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test Document", + "text": "Test Document", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1", + "text": "Paragraph 2.1.1" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.2", + "text": "Paragraph 2.1.2" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/texts/11" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test 1:", + "text": "Test 1:", + "level": 3 + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1", + "text": "List item 1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 2", + "text": "List item 2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 3", + "text": "List item 3", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/7" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/3" + }, + { + "$ref": "#/texts/16" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test 2:", + "text": "Test 2:", + "level": 3 + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item a", + "text": "List item a", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item b", + "text": "List item b", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item c", + "text": "List item c", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/texts/12" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/texts/24" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test 3:", + "text": "Test 3:", + "level": 3 + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1", + "text": "List item 1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 2", + "text": "List item 2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.1", + "text": "List item 1.1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.2", + "text": "List item 1.2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.3", + "text": "List item 1.3", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 3", + "text": "List item 3", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/texts/17" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/6" + }, + { + "$ref": "#/texts/29" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test 4:", + "text": "Test 4:", + "level": 3 + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1", + "text": "List item 1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.1", + "text": "List item 1.1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 2", + "text": "List item 2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/texts/25" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/texts/35" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test 5:", + "text": "Test 5:", + "level": 3 + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1", + "text": "List item 1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.1", + "text": "List item 1.1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.1.1", + "text": "List item 1.1.1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 3", + "text": "List item 3", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/texts/30" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/11" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test 6:", + "text": "Test 6:", + "level": 3 + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1", + "text": "List item 1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 2", + "text": "List item 2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.1", + "text": "List item 1.1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/groups/12" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.2", + "text": "List item 1.2", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/groups/13" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 1.2.1", + "text": "List item 1.2.1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "List item 3", + "text": "List item 3", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/texts/36" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/texts/36" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/texts/36" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.md b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.md new file mode 100644 index 00000000..b254cb16 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.md @@ -0,0 +1,48 @@ +## Test Document + +Paragraph 2.1.1 + +Paragraph 2.1.2 + +#### Test 1: + +- List item 1 +- List item 2 +- List item 3 + +#### Test 2: + +- List item a +- List item b +- List item c + +#### Test 3: + +- List item 1 +- List item 2 + - List item 1.1 + - List item 1.2 + - List item 1.3 +- List item 3 + +#### Test 4: + +- List item 1 + - List item 1.1 +- List item 2 + +#### Test 5: + +- List item 1 + - List item 1.1 + - List item 1.1.1 +- List item 3 + +#### Test 6: + +- List item 1 +- List item 2 + - List item 1.1 + - List item 1.2 + - List item 1.2.1 +- List item 3 \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt new file mode 100644 index 00000000..ad9a4d6c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt @@ -0,0 +1,464 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: list: group list + item-2 at level 2: list_item: Main page + item-3 at level 2: list_item: Contents + item-4 at level 2: list_item: Current events + item-5 at level 2: list_item: Random article + item-6 at level 2: list_item: About Wikipedia + item-7 at level 2: list_item: Contact us + item-8 at level 1: list: group list + item-9 at level 2: list_item: Help + item-10 at level 2: list_item: Learn to edit + item-11 at level 2: list_item: Community portal + item-12 at level 2: list_item: Recent changes + item-13 at level 2: list_item: Upload file + item-14 at level 1: picture + item-15 at level 1: picture + item-16 at level 1: picture + item-17 at level 1: list: group list + item-18 at level 1: list: group list + item-19 at level 2: list_item: Donate + item-20 at level 1: list: group list + item-21 at level 1: list: group list + item-22 at level 2: list_item: Create account + item-23 at level 2: list_item: Log in + item-24 at level 1: list: group list + item-25 at level 2: list_item: Create account + item-26 at level 2: list_item: Log in + item-27 at level 1: list: group list + item-28 at level 2: list_item: Contributions + item-29 at level 2: list_item: Talk + item-30 at level 1: section: group header-1 + item-31 at level 2: section_header: Contents + item-32 at level 3: list: group list + item-33 at level 4: list_item: (Top) + item-34 at level 4: list_item: 1 Etymology + item-35 at level 5: list: group list + item-36 at level 4: list_item: 2 Taxonomy + item-37 at level 5: list: group list + item-38 at level 4: list_item: 3 Morphology + item-39 at level 5: list: group list + item-40 at level 4: list_item: 4 Distribution and habitat + item-41 at level 5: list: group list + item-42 at level 4: list_item: 5 Behaviour Toggle Behaviour subsection + item-43 at level 5: list: group list + item-44 at level 6: list_item: 5.1 Feeding + item-45 at level 7: list: group list + item-46 at level 6: list_item: 5.2 Breeding + item-47 at level 7: list: group list + item-48 at level 6: list_item: 5.3 Communication + item-49 at level 7: list: group list + item-50 at level 6: list_item: 5.4 Predators + item-51 at level 7: list: group list + item-52 at level 4: list_item: 6 Relationship with humans Toggle Relationship with humans subsection + item-53 at level 5: list: group list + item-54 at level 6: list_item: 6.1 Hunting + item-55 at level 7: list: group list + item-56 at level 6: list_item: 6.2 Domestication + item-57 at level 7: list: group list + item-58 at level 6: list_item: 6.3 Heraldry + item-59 at level 7: list: group list + item-60 at level 6: list_item: 6.4 Cultural references + item-61 at level 7: list: group list + item-62 at level 4: list_item: 7 See also + item-63 at level 5: list: group list + item-64 at level 4: list_item: 8 Notes Toggle Notes subsection + item-65 at level 5: list: group list + item-66 at level 6: list_item: 8.1 Citations + item-67 at level 7: list: group list + item-68 at level 6: list_item: 8.2 Sources + item-69 at level 7: list: group list + item-70 at level 4: list_item: 9 External links + item-71 at level 5: list: group list + item-72 at level 1: title: Duck + item-73 at level 2: list: group list + item-74 at level 3: list_item: Acèh + item-75 at level 3: list_item: Afrikaans + item-76 at level 3: list_item: Alemannisch + item-77 at level 3: list_item: አማርኛ + item-78 at level 3: list_item: Ænglisc + item-79 at level 3: list_item: العربية + item-80 at level 3: list_item: Aragonés + item-81 at level 3: list_item: ܐܪܡܝܐ + item-82 at level 3: list_item: Armãneashti + item-83 at level 3: list_item: Asturianu + item-84 at level 3: list_item: Atikamekw + item-85 at level 3: list_item: Авар + item-86 at level 3: list_item: Aymar aru + item-87 at level 3: list_item: تۆرکجه + item-88 at level 3: list_item: Basa Bali + item-89 at level 3: list_item: বাংলা + item-90 at level 3: list_item: 閩南語 / Bân-lâm-gú + item-91 at level 3: list_item: Беларуская + item-92 at level 3: list_item: Беларуская (тарашкевіца) + item-93 at level 3: list_item: Bikol Central + item-94 at level 3: list_item: Български + item-95 at level 3: list_item: Brezhoneg + item-96 at level 3: list_item: Буряад + item-97 at level 3: list_item: Català + item-98 at level 3: list_item: Чӑвашла + item-99 at level 3: list_item: Čeština + item-100 at level 3: list_item: ChiShona + item-101 at level 3: list_item: Cymraeg + item-102 at level 3: list_item: Dagbanli + item-103 at level 3: list_item: Dansk + item-104 at level 3: list_item: Deitsch + item-105 at level 3: list_item: Deutsch + item-106 at level 3: list_item: डोटेली + item-107 at level 3: list_item: Ελληνικά + item-108 at level 3: list_item: Emiliàn e rumagnòl + item-109 at level 3: list_item: Español + item-110 at level 3: list_item: Esperanto + item-111 at level 3: list_item: Euskara + item-112 at level 3: list_item: فارسی + item-113 at level 3: list_item: Français + item-114 at level 3: list_item: Gaeilge + item-115 at level 3: list_item: Galego + item-116 at level 3: list_item: ГӀалгӀай + item-117 at level 3: list_item: 贛語 + item-118 at level 3: list_item: گیلکی + item-119 at level 3: list_item: 𐌲𐌿𐍄𐌹𐍃𐌺 + item-120 at level 3: list_item: गोंयची कोंकणी / Gõychi Konknni + item-121 at level 3: list_item: 客家語 / Hak-kâ-ngî + item-122 at level 3: list_item: 한국어 + item-123 at level 3: list_item: Hausa + item-124 at level 3: list_item: Հայերեն + item-125 at level 3: list_item: हिन्दी + item-126 at level 3: list_item: Hrvatski + item-127 at level 3: list_item: Ido + item-128 at level 3: list_item: Bahasa Indonesia + item-129 at level 3: list_item: Iñupiatun + item-130 at level 3: list_item: Íslenska + item-131 at level 3: list_item: Italiano + item-132 at level 3: list_item: עברית + item-133 at level 3: list_item: Jawa + item-134 at level 3: list_item: ಕನ್ನಡ + item-135 at level 3: list_item: Kapampangan + item-136 at level 3: list_item: ქართული + item-137 at level 3: list_item: कॉशुर / کٲشُر + item-138 at level 3: list_item: Қазақша + item-139 at level 3: list_item: Ikirundi + item-140 at level 3: list_item: Kongo + item-141 at level 3: list_item: Kreyòl ayisyen + item-142 at level 3: list_item: Кырык мары + item-143 at level 3: list_item: ລາວ + item-144 at level 3: list_item: Latina + item-145 at level 3: list_item: Latviešu + item-146 at level 3: list_item: Lietuvių + item-147 at level 3: list_item: Li Niha + item-148 at level 3: list_item: Ligure + item-149 at level 3: list_item: Limburgs + item-150 at level 3: list_item: Lingála + item-151 at level 3: list_item: Malagasy + item-152 at level 3: list_item: മലയാളം + item-153 at level 3: list_item: मराठी + item-154 at level 3: list_item: مازِرونی + item-155 at level 3: list_item: Bahasa Melayu + item-156 at level 3: list_item: ꯃꯤꯇꯩ ꯂꯣꯟ + item-157 at level 3: list_item: 閩東語 / Mìng-dĕ̤ng-ngṳ̄ + item-158 at level 3: list_item: Мокшень + item-159 at level 3: list_item: Монгол + item-160 at level 3: list_item: မြန်မာဘာသာ + item-161 at level 3: list_item: Nederlands + item-162 at level 3: list_item: Nedersaksies + item-163 at level 3: list_item: नेपाली + item-164 at level 3: list_item: नेपाल भाषा + item-165 at level 3: list_item: 日本語 + item-166 at level 3: list_item: Нохчийн + item-167 at level 3: list_item: Norsk nynorsk + item-168 at level 3: list_item: Occitan + item-169 at level 3: list_item: Oromoo + item-170 at level 3: list_item: ਪੰਜਾਬੀ + item-171 at level 3: list_item: Picard + item-172 at level 3: list_item: Plattdüütsch + item-173 at level 3: list_item: Polski + item-174 at level 3: list_item: Português + item-175 at level 3: list_item: Qırımtatarca + item-176 at level 3: list_item: Română + item-177 at level 3: list_item: Русский + item-178 at level 3: list_item: Саха тыла + item-179 at level 3: list_item: ᱥᱟᱱᱛᱟᱲᱤ + item-180 at level 3: list_item: Sardu + item-181 at level 3: list_item: Scots + item-182 at level 3: list_item: Seeltersk + item-183 at level 3: list_item: Shqip + item-184 at level 3: list_item: Sicilianu + item-185 at level 3: list_item: සිංහල + item-186 at level 3: list_item: Simple English + item-187 at level 3: list_item: سنڌي + item-188 at level 3: list_item: کوردی + item-189 at level 3: list_item: Српски / srpski + item-190 at level 3: list_item: Srpskohrvatski / српскохрватски + item-191 at level 3: list_item: Sunda + item-192 at level 3: list_item: Svenska + item-193 at level 3: list_item: Tagalog + item-194 at level 3: list_item: தமிழ் + item-195 at level 3: list_item: Taqbaylit + item-196 at level 3: list_item: Татарча / tatarça + item-197 at level 3: list_item: ไทย + item-198 at level 3: list_item: Türkçe + item-199 at level 3: list_item: Українська + item-200 at level 3: list_item: ئۇيغۇرچە / Uyghurche + item-201 at level 3: list_item: Vahcuengh + item-202 at level 3: list_item: Tiếng Việt + item-203 at level 3: list_item: Walon + item-204 at level 3: list_item: 文言 + item-205 at level 3: list_item: Winaray + item-206 at level 3: list_item: 吴语 + item-207 at level 3: list_item: 粵語 + item-208 at level 3: list_item: Žemaitėška + item-209 at level 3: list_item: 中文 + item-210 at level 2: list: group list + item-211 at level 3: list_item: Article + item-212 at level 3: list_item: Talk + item-213 at level 2: list: group list + item-214 at level 2: list: group list + item-215 at level 3: list_item: Read + item-216 at level 3: list_item: View source + item-217 at level 3: list_item: View history + item-218 at level 2: list: group list + item-219 at level 3: list_item: Read + item-220 at level 3: list_item: View source + item-221 at level 3: list_item: View history + item-222 at level 2: list: group list + item-223 at level 3: list_item: What links here + item-224 at level 3: list_item: Related changes + item-225 at level 3: list_item: Upload file + item-226 at level 3: list_item: Special pages + item-227 at level 3: list_item: Permanent link + item-228 at level 3: list_item: Page information + item-229 at level 3: list_item: Cite this page + item-230 at level 3: list_item: Get shortened URL + item-231 at level 3: list_item: Download QR code + item-232 at level 3: list_item: Wikidata item + item-233 at level 2: list: group list + item-234 at level 3: list_item: Download as PDF + item-235 at level 3: list_item: Printable version + item-236 at level 2: list: group list + item-237 at level 3: list_item: Wikimedia Commons + item-238 at level 3: list_item: Wikiquote + item-239 at level 2: picture + item-240 at level 2: table with [13x2] + item-241 at level 2: paragraph: Duck is the common name for nume ... und in both fresh water and sea water. + item-242 at level 2: paragraph: Ducks are sometimes confused wit ... divers, grebes, gallinules and coots. + item-243 at level 2: section_header: Etymology + item-244 at level 3: paragraph: The word duck comes from Old Eng ... h duiken and German tauchen 'to dive'. + item-245 at level 3: picture + item-245 at level 4: caption: Pacific black duck displaying the characteristic upending "duck" + item-246 at level 3: paragraph: This word replaced Old English e ... nskrit ātí 'water bird', among others. + item-247 at level 3: paragraph: A duckling is a young duck in do ... , is sometimes labelled as a duckling. + item-248 at level 3: paragraph: A male is called a drake and the ... a duck, or in ornithology a hen.[3][4] + item-249 at level 3: picture + item-249 at level 4: caption: Male mallard. + item-250 at level 3: picture + item-250 at level 4: caption: Wood ducks. + item-251 at level 2: section_header: Taxonomy + item-252 at level 3: paragraph: All ducks belong to the biologic ... ationships between various species.[9] + item-253 at level 3: picture + item-253 at level 4: caption: Mallard landing in approach + item-254 at level 3: paragraph: In most modern classifications, ... all size and stiff, upright tails.[14] + item-255 at level 3: paragraph: A number of other species called ... shelducks in the tribe Tadornini.[15] + item-256 at level 2: section_header: Morphology + item-257 at level 3: picture + item-257 at level 4: caption: Male Mandarin duck + item-258 at level 3: paragraph: The overall body plan of ducks i ... is moult typically precedes migration. + item-259 at level 3: paragraph: The drakes of northern species o ... rkscrew shaped vagina to prevent rape. + item-260 at level 2: section_header: Distribution and habitat + item-261 at level 3: picture + item-261 at level 4: caption: Flying steamer ducks in Ushuaia, Argentina + item-262 at level 3: paragraph: Ducks have a cosmopolitan distri ... endemic to such far-flung islands.[21] + item-263 at level 3: picture + item-263 at level 4: caption: Female mallard in Cornwall, England + item-264 at level 3: paragraph: Some duck species, mainly those ... t form after localised heavy rain.[23] + item-265 at level 2: section_header: Behaviour + item-266 at level 3: section_header: Feeding + item-267 at level 4: picture + item-267 at level 5: caption: Pecten along the bill + item-268 at level 4: picture + item-268 at level 5: caption: Mallard duckling preening + item-269 at level 4: paragraph: Ducks eat food sources such as g ... amphibians, worms, and small molluscs. + item-270 at level 4: paragraph: Dabbling ducks feed on the surfa ... thers and to hold slippery food items. + item-271 at level 4: paragraph: Diving ducks and sea ducks forag ... ave more difficulty taking off to fly. + item-272 at level 4: paragraph: A few specialized species such a ... apted to catch and swallow large fish. + item-273 at level 4: paragraph: The others have the characterist ... e nostrils come out through hard horn. + item-274 at level 4: paragraph: The Guardian published an articl ... the ducks and pollutes waterways.[25] + item-275 at level 3: section_header: Breeding + item-276 at level 4: picture + item-276 at level 5: caption: A Muscovy duckling + item-277 at level 4: paragraph: Ducks generally only have one pa ... st and led her ducklings to water.[28] + item-278 at level 3: section_header: Communication + item-279 at level 4: paragraph: Female mallard ducks (as well as ... laying calls or quieter contact calls. + item-280 at level 4: paragraph: A common urban legend claims tha ... annel television show MythBusters.[32] + item-281 at level 3: section_header: Predators + item-282 at level 4: picture + item-282 at level 5: caption: Ringed teal + item-283 at level 4: paragraph: Ducks have many predators. Duckl ... or large birds, such as hawks or owls. + item-284 at level 4: paragraph: Adult ducks are fast fliers, but ... its speed and strength to catch ducks. + item-285 at level 2: section_header: Relationship with humans + item-286 at level 3: section_header: Hunting + item-287 at level 4: paragraph: Humans have hunted ducks since p ... evidence of this is uncommon.[35][42] + item-288 at level 4: paragraph: In many areas, wild ducks (inclu ... inated by pollutants such as PCBs.[44] + item-289 at level 3: section_header: Domestication + item-290 at level 4: picture + item-290 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks + item-291 at level 4: paragraph: Ducks have many economic uses, b ... it weighs less than 1 kg (2.2 lb).[48] + item-292 at level 3: section_header: Heraldry + item-293 at level 4: picture + item-293 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49] + item-294 at level 4: paragraph: Ducks appear on several coats of ... the coat of arms of Föglö (Åland).[51] + item-295 at level 3: section_header: Cultural references + item-296 at level 4: paragraph: In 2002, psychologist Richard Wi ... 54] and was made into a movie in 1986. + item-297 at level 4: paragraph: The 1992 Disney film The Mighty ... Ducks minor league baseball team.[55] + item-298 at level 2: section_header: See also + item-299 at level 3: list: group list + item-300 at level 4: list_item: Birds portal + item-301 at level 3: list: group list + item-302 at level 4: list_item: Domestic duck + item-303 at level 4: list_item: Duck as food + item-304 at level 4: list_item: Duck test + item-305 at level 4: list_item: Duck breeds + item-306 at level 4: list_item: Fictional ducks + item-307 at level 4: list_item: Rubber duck + item-308 at level 2: section_header: Notes + item-309 at level 3: section_header: Citations + item-310 at level 4: ordered_list: group ordered list + item-311 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22. + item-312 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22. + item-313 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139. + item-314 at level 5: list_item: ^ Visca, Curt; Visca, Kelley (20 ... Publishing Group. ISBN 9780823961566. + item-315 at level 5: list_item: ^ a b c d Carboneras 1992, p. 536. + item-316 at level 5: list_item: ^ Livezey 1986, pp. 737–738. + item-317 at level 5: list_item: ^ Madsen, McHugh & de Kloet 1988, p. 452. + item-318 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. + item-319 at level 5: list_item: ^ a b c d e f Carboneras 1992, p. 540. + item-320 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 191. + item-321 at level 5: list_item: ^ Kear 2005, p. 448. + item-322 at level 5: list_item: ^ Kear 2005, p. 622–623. + item-323 at level 5: list_item: ^ Kear 2005, p. 686. + item-324 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 193. + item-325 at level 5: list_item: ^ a b c d e f g Carboneras 1992, p. 537. + item-326 at level 5: list_item: ^ American Ornithologists' Union 1998, p. xix. + item-327 at level 5: list_item: ^ American Ornithologists' Union 1998. + item-328 at level 5: list_item: ^ Carboneras 1992, p. 538. + item-329 at level 5: list_item: ^ Christidis & Boles 2008, p. 62. + item-330 at level 5: list_item: ^ Shirihai 2008, pp. 239, 245. + item-331 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. + item-332 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 52–3. + item-333 at level 5: list_item: ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. + item-334 at level 5: list_item: ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. + item-335 at level 5: list_item: ^ Karl Mathiesen (16 March 2015) ... Guardian. Retrieved 13 November 2016. + item-336 at level 5: list_item: ^ Rohwer, Frank C.; Anderson, Mi ... 4615-6787-5_4. ISBN 978-1-4615-6789-9. + item-337 at level 5: list_item: ^ Smith, Cyndi M.; Cooke, Fred; ... 093/condor/102.1.201. hdl:10315/13797. + item-338 at level 5: list_item: ^ "If You Find An Orphaned Duckl ... l on 2018-09-23. Retrieved 2018-12-22. + item-339 at level 5: list_item: ^ Carver, Heather (2011). The Du ...  9780557901562.[self-published source] + item-340 at level 5: list_item: ^ Titlow, Budd (2013-09-03). Bir ... man & Littlefield. ISBN 9780762797707. + item-341 at level 5: list_item: ^ Amos, Jonathan (2003-09-08). " ... kers". BBC News. Retrieved 2006-11-02. + item-342 at level 5: list_item: ^ "Mythbusters Episode 8". 12 December 2003. + item-343 at level 5: list_item: ^ Erlandson 1994, p. 171. + item-344 at level 5: list_item: ^ Jeffries 2008, pp. 168, 243. + item-345 at level 5: list_item: ^ a b Sued-Badillo 2003, p. 65. + item-346 at level 5: list_item: ^ Thorpe 1996, p. 68. + item-347 at level 5: list_item: ^ Maisels 1999, p. 42. + item-348 at level 5: list_item: ^ Rau 1876, p. 133. + item-349 at level 5: list_item: ^ Higman 2012, p. 23. + item-350 at level 5: list_item: ^ Hume 2012, p. 53. + item-351 at level 5: list_item: ^ Hume 2012, p. 52. + item-352 at level 5: list_item: ^ Fieldhouse 2002, p. 167. + item-353 at level 5: list_item: ^ Livingston, A. D. (1998-01-01) ... Editions, Limited. ISBN 9781853263774. + item-354 at level 5: list_item: ^ "Study plan for waterfowl inju ... on 2022-10-09. Retrieved 2 July 2019. + item-355 at level 5: list_item: ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25. + item-356 at level 5: list_item: ^ "Anas platyrhynchos, Domestic ... . Digimorph.org. Retrieved 2012-12-23. + item-357 at level 5: list_item: ^ Sy Montgomery. "Mallard; Encyc ... Britannica.com. Retrieved 2012-12-23. + item-358 at level 5: list_item: ^ Glenday, Craig (2014). Guinnes ... ited. pp. 135. ISBN 978-1-908843-15-9. + item-359 at level 5: list_item: ^ Suomen kunnallisvaakunat (in F ... tto. 1982. p. 147. ISBN 951-773-085-3. + item-360 at level 5: list_item: ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021. + item-361 at level 5: list_item: ^ "Föglö" (in Swedish). Retrieved September 9, 2021. + item-362 at level 5: list_item: ^ Young, Emma. "World's funniest ... w Scientist. Retrieved 7 January 2019. + item-363 at level 5: list_item: ^ "Howard the Duck (character)". Grand Comics Database. + item-364 at level 5: list_item: ^ Sanderson, Peter; Gilbert, Lau ... luding this bad-tempered talking duck. + item-365 at level 5: list_item: ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20. + item-366 at level 3: section_header: Sources + item-367 at level 4: list: group list + item-368 at level 5: list_item: American Ornithologists' Union ( ... (PDF) from the original on 2022-10-09. + item-369 at level 5: list_item: Carboneras, Carlos (1992). del H ... Lynx Edicions. ISBN 978-84-87334-10-8. + item-370 at level 5: list_item: Christidis, Les; Boles, Walter E ... ro Publishing. ISBN 978-0-643-06511-6. + item-371 at level 5: list_item: Donne-Goussé, Carole; Laudet, Vi ... /S1055-7903(02)00019-2. PMID 12099792. + item-372 at level 5: list_item: Elphick, Chris; Dunning, John B. ... istopher Helm. ISBN 978-0-7136-6250-4. + item-373 at level 5: list_item: Erlandson, Jon M. (1994). Early ... usiness Media. ISBN 978-1-4419-3231-0. + item-374 at level 5: list_item: Fieldhouse, Paul (2002). Food, F ... ara: ABC-CLIO. ISBN 978-1-61069-412-4. + item-375 at level 5: list_item: Fitter, Julian; Fitter, Daniel; ... versity Press. ISBN 978-0-691-10295-5. + item-376 at level 5: list_item: Higman, B. W. (2012). How Food M ... Wiley & Sons. ISBN 978-1-4051-8947-7. + item-377 at level 5: list_item: Hume, Julian H. (2012). Extinct ... istopher Helm. ISBN 978-1-4729-3744-5. + item-378 at level 5: list_item: Jeffries, Richard (2008). Holoce ... Alabama Press. ISBN 978-0-8173-1658-7. + item-379 at level 5: list_item: Kear, Janet, ed. (2005). Ducks, ... versity Press. ISBN 978-0-19-861009-0. + item-380 at level 5: list_item: Livezey, Bradley C. (October 198 ... (PDF) from the original on 2022-10-09. + item-381 at level 5: list_item: Madsen, Cort S.; McHugh, Kevin P ... (PDF) from the original on 2022-10-09. + item-382 at level 5: list_item: Maisels, Charles Keith (1999). E ... on: Routledge. ISBN 978-0-415-10975-8. + item-383 at level 5: list_item: Pratt, H. Douglas; Bruner, Phill ... University Press. ISBN 0-691-02399-9. + item-384 at level 5: list_item: Rau, Charles (1876). Early Man i ... ork: Harper & Brothers. LCCN 05040168. + item-385 at level 5: list_item: Shirihai, Hadoram (2008). A Comp ... versity Press. ISBN 978-0-691-13666-0. + item-386 at level 5: list_item: Sued-Badillo, Jalil (2003). Auto ... Paris: UNESCO. ISBN 978-92-3-103832-7. + item-387 at level 5: list_item: Thorpe, I. J. (1996). The Origin ... rk: Routledge. ISBN 978-0-415-08009-5. + item-388 at level 2: section_header: External links + item-389 at level 3: list: group list + item-390 at level 4: list_item: Definitions from Wiktionary + item-391 at level 4: list_item: Media from Commons + item-392 at level 4: list_item: Quotations from Wikiquote + item-393 at level 4: list_item: Recipes from Wikibooks + item-394 at level 4: list_item: Taxa from Wikispecies + item-395 at level 4: list_item: Data from Wikidata + item-396 at level 3: list: group list + item-397 at level 4: list_item: list of books (useful looking abstracts) + item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine + item-399 at level 4: list_item: + item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl + item-401 at level 3: table with [3x2] + item-402 at level 3: picture + item-403 at level 3: list: group list + item-404 at level 4: list_item: Ducks + item-405 at level 4: list_item: Game birds + item-406 at level 4: list_item: Bird common names + item-407 at level 3: list: group list + item-408 at level 4: list_item: All accuracy disputes + item-409 at level 4: list_item: Accuracy disputes from February 2020 + item-410 at level 4: list_item: CS1 Finnish-language sources (fi) + item-411 at level 4: list_item: CS1 Latvian-language sources (lv) + item-412 at level 4: list_item: CS1 Swedish-language sources (sv) + item-413 at level 4: list_item: Articles with short description + item-414 at level 4: list_item: Short description is different from Wikidata + item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages + item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages + item-417 at level 4: list_item: Articles with 'species' microformats + item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text + item-419 at level 4: list_item: Articles containing Dutch-language text + item-420 at level 4: list_item: Articles containing German-language text + item-421 at level 4: list_item: Articles containing Norwegian-language text + item-422 at level 4: list_item: Articles containing Lithuanian-language text + item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text + item-424 at level 4: list_item: All articles with self-published sources + item-425 at level 4: list_item: Articles with self-published sources from February 2020 + item-426 at level 4: list_item: All articles with unsourced statements + item-427 at level 4: list_item: Articles with unsourced statements from January 2022 + item-428 at level 4: list_item: CS1: long volume value + item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch + item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata + item-431 at level 4: list_item: Webarchive template wayback links + item-432 at level 4: list_item: Articles with Project Gutenberg links + item-433 at level 4: list_item: Articles containing video clips + item-434 at level 3: list: group list + item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC). + item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization. + item-437 at level 3: list: group list + item-438 at level 4: list_item: Privacy policy + item-439 at level 4: list_item: About Wikipedia + item-440 at level 4: list_item: Disclaimers + item-441 at level 4: list_item: Contact Wikipedia + item-442 at level 4: list_item: Code of Conduct + item-443 at level 4: list_item: Developers + item-444 at level 4: list_item: Statistics + item-445 at level 4: list_item: Cookie statement + item-446 at level 4: list_item: Mobile view + item-447 at level 3: list: group list + item-448 at level 4: list_item: + item-449 at level 4: list_item: + item-450 at level 3: list: group list \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json new file mode 100644 index 00000000..11168769 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -0,0 +1,7988 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "wiki_duck", + "origin": { + "mimetype": "text/html", + "binary_hash": 8165458525377019424, + "filename": "wiki_duck.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/pictures/0" + }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/pictures/2" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/3" + }, + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/groups/6" + }, + { + "$ref": "#/groups/7" + }, + { + "$ref": "#/groups/8" + }, + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/202" + }, + { + "$ref": "#/texts/206" + }, + { + "$ref": "#/texts/207" + }, + { + "$ref": "#/texts/210" + }, + { + "$ref": "#/texts/214" + }, + { + "$ref": "#/texts/218" + }, + { + "$ref": "#/texts/220" + }, + { + "$ref": "#/texts/224" + }, + { + "$ref": "#/texts/225" + }, + { + "$ref": "#/texts/233" + }, + { + "$ref": "#/texts/239" + }, + { + "$ref": "#/texts/247" + }, + { + "$ref": "#/texts/250" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/11" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/18" + } + ], + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/texts/18" + }, + "children": [ + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/38" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/texts/21" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/12", + "parent": { + "$ref": "#/texts/22" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/13", + "parent": { + "$ref": "#/texts/23" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/14", + "parent": { + "$ref": "#/texts/24" + }, + "children": [ + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/texts/28" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/15", + "parent": { + "$ref": "#/texts/25" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/16", + "parent": { + "$ref": "#/texts/26" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/17", + "parent": { + "$ref": "#/texts/27" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/18", + "parent": { + "$ref": "#/texts/28" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/19", + "parent": { + "$ref": "#/texts/29" + }, + "children": [ + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/20", + "parent": { + "$ref": "#/texts/30" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/21", + "parent": { + "$ref": "#/texts/31" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/22", + "parent": { + "$ref": "#/texts/32" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/23", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/24", + "parent": { + "$ref": "#/texts/34" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/25", + "parent": { + "$ref": "#/texts/35" + }, + "children": [ + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/26", + "parent": { + "$ref": "#/texts/36" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/27", + "parent": { + "$ref": "#/texts/37" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/28", + "parent": { + "$ref": "#/texts/38" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/29", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" + }, + { + "$ref": "#/texts/47" + }, + { + "$ref": "#/texts/48" + }, + { + "$ref": "#/texts/49" + }, + { + "$ref": "#/texts/50" + }, + { + "$ref": "#/texts/51" + }, + { + "$ref": "#/texts/52" + }, + { + "$ref": "#/texts/53" + }, + { + "$ref": "#/texts/54" + }, + { + "$ref": "#/texts/55" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/texts/57" + }, + { + "$ref": "#/texts/58" + }, + { + "$ref": "#/texts/59" + }, + { + "$ref": "#/texts/60" + }, + { + "$ref": "#/texts/61" + }, + { + "$ref": "#/texts/62" + }, + { + "$ref": "#/texts/63" + }, + { + "$ref": "#/texts/64" + }, + { + "$ref": "#/texts/65" + }, + { + "$ref": "#/texts/66" + }, + { + "$ref": "#/texts/67" + }, + { + "$ref": "#/texts/68" + }, + { + "$ref": "#/texts/69" + }, + { + "$ref": "#/texts/70" + }, + { + "$ref": "#/texts/71" + }, + { + "$ref": "#/texts/72" + }, + { + "$ref": "#/texts/73" + }, + { + "$ref": "#/texts/74" + }, + { + "$ref": "#/texts/75" + }, + { + "$ref": "#/texts/76" + }, + { + "$ref": "#/texts/77" + }, + { + "$ref": "#/texts/78" + }, + { + "$ref": "#/texts/79" + }, + { + "$ref": "#/texts/80" + }, + { + "$ref": "#/texts/81" + }, + { + "$ref": "#/texts/82" + }, + { + "$ref": "#/texts/83" + }, + { + "$ref": "#/texts/84" + }, + { + "$ref": "#/texts/85" + }, + { + "$ref": "#/texts/86" + }, + { + "$ref": "#/texts/87" + }, + { + "$ref": "#/texts/88" + }, + { + "$ref": "#/texts/89" + }, + { + "$ref": "#/texts/90" + }, + { + "$ref": "#/texts/91" + }, + { + "$ref": "#/texts/92" + }, + { + "$ref": "#/texts/93" + }, + { + "$ref": "#/texts/94" + }, + { + "$ref": "#/texts/95" + }, + { + "$ref": "#/texts/96" + }, + { + "$ref": "#/texts/97" + }, + { + "$ref": "#/texts/98" + }, + { + "$ref": "#/texts/99" + }, + { + "$ref": "#/texts/100" + }, + { + "$ref": "#/texts/101" + }, + { + "$ref": "#/texts/102" + }, + { + "$ref": "#/texts/103" + }, + { + "$ref": "#/texts/104" + }, + { + "$ref": "#/texts/105" + }, + { + "$ref": "#/texts/106" + }, + { + "$ref": "#/texts/107" + }, + { + "$ref": "#/texts/108" + }, + { + "$ref": "#/texts/109" + }, + { + "$ref": "#/texts/110" + }, + { + "$ref": "#/texts/111" + }, + { + "$ref": "#/texts/112" + }, + { + "$ref": "#/texts/113" + }, + { + "$ref": "#/texts/114" + }, + { + "$ref": "#/texts/115" + }, + { + "$ref": "#/texts/116" + }, + { + "$ref": "#/texts/117" + }, + { + "$ref": "#/texts/118" + }, + { + "$ref": "#/texts/119" + }, + { + "$ref": "#/texts/120" + }, + { + "$ref": "#/texts/121" + }, + { + "$ref": "#/texts/122" + }, + { + "$ref": "#/texts/123" + }, + { + "$ref": "#/texts/124" + }, + { + "$ref": "#/texts/125" + }, + { + "$ref": "#/texts/126" + }, + { + "$ref": "#/texts/127" + }, + { + "$ref": "#/texts/128" + }, + { + "$ref": "#/texts/129" + }, + { + "$ref": "#/texts/130" + }, + { + "$ref": "#/texts/131" + }, + { + "$ref": "#/texts/132" + }, + { + "$ref": "#/texts/133" + }, + { + "$ref": "#/texts/134" + }, + { + "$ref": "#/texts/135" + }, + { + "$ref": "#/texts/136" + }, + { + "$ref": "#/texts/137" + }, + { + "$ref": "#/texts/138" + }, + { + "$ref": "#/texts/139" + }, + { + "$ref": "#/texts/140" + }, + { + "$ref": "#/texts/141" + }, + { + "$ref": "#/texts/142" + }, + { + "$ref": "#/texts/143" + }, + { + "$ref": "#/texts/144" + }, + { + "$ref": "#/texts/145" + }, + { + "$ref": "#/texts/146" + }, + { + "$ref": "#/texts/147" + }, + { + "$ref": "#/texts/148" + }, + { + "$ref": "#/texts/149" + }, + { + "$ref": "#/texts/150" + }, + { + "$ref": "#/texts/151" + }, + { + "$ref": "#/texts/152" + }, + { + "$ref": "#/texts/153" + }, + { + "$ref": "#/texts/154" + }, + { + "$ref": "#/texts/155" + }, + { + "$ref": "#/texts/156" + }, + { + "$ref": "#/texts/157" + }, + { + "$ref": "#/texts/158" + }, + { + "$ref": "#/texts/159" + }, + { + "$ref": "#/texts/160" + }, + { + "$ref": "#/texts/161" + }, + { + "$ref": "#/texts/162" + }, + { + "$ref": "#/texts/163" + }, + { + "$ref": "#/texts/164" + }, + { + "$ref": "#/texts/165" + }, + { + "$ref": "#/texts/166" + }, + { + "$ref": "#/texts/167" + }, + { + "$ref": "#/texts/168" + }, + { + "$ref": "#/texts/169" + }, + { + "$ref": "#/texts/170" + }, + { + "$ref": "#/texts/171" + }, + { + "$ref": "#/texts/172" + }, + { + "$ref": "#/texts/173" + }, + { + "$ref": "#/texts/174" + }, + { + "$ref": "#/texts/175" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/30", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/176" + }, + { + "$ref": "#/texts/177" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/31", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/32", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/178" + }, + { + "$ref": "#/texts/179" + }, + { + "$ref": "#/texts/180" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/33", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/181" + }, + { + "$ref": "#/texts/182" + }, + { + "$ref": "#/texts/183" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/34", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/184" + }, + { + "$ref": "#/texts/185" + }, + { + "$ref": "#/texts/186" + }, + { + "$ref": "#/texts/187" + }, + { + "$ref": "#/texts/188" + }, + { + "$ref": "#/texts/189" + }, + { + "$ref": "#/texts/190" + }, + { + "$ref": "#/texts/191" + }, + { + "$ref": "#/texts/192" + }, + { + "$ref": "#/texts/193" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/35", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/194" + }, + { + "$ref": "#/texts/195" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/36", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/196" + }, + { + "$ref": "#/texts/197" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/37", + "parent": { + "$ref": "#/texts/255" + }, + "children": [ + { + "$ref": "#/texts/256" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/38", + "parent": { + "$ref": "#/texts/255" + }, + "children": [ + { + "$ref": "#/texts/257" + }, + { + "$ref": "#/texts/258" + }, + { + "$ref": "#/texts/259" + }, + { + "$ref": "#/texts/260" + }, + { + "$ref": "#/texts/261" + }, + { + "$ref": "#/texts/262" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/39", + "parent": { + "$ref": "#/texts/264" + }, + "children": [ + { + "$ref": "#/texts/265" + }, + { + "$ref": "#/texts/266" + }, + { + "$ref": "#/texts/267" + }, + { + "$ref": "#/texts/268" + }, + { + "$ref": "#/texts/269" + }, + { + "$ref": "#/texts/270" + }, + { + "$ref": "#/texts/271" + }, + { + "$ref": "#/texts/272" + }, + { + "$ref": "#/texts/273" + }, + { + "$ref": "#/texts/274" + }, + { + "$ref": "#/texts/275" + }, + { + "$ref": "#/texts/276" + }, + { + "$ref": "#/texts/277" + }, + { + "$ref": "#/texts/278" + }, + { + "$ref": "#/texts/279" + }, + { + "$ref": "#/texts/280" + }, + { + "$ref": "#/texts/281" + }, + { + "$ref": "#/texts/282" + }, + { + "$ref": "#/texts/283" + }, + { + "$ref": "#/texts/284" + }, + { + "$ref": "#/texts/285" + }, + { + "$ref": "#/texts/286" + }, + { + "$ref": "#/texts/287" + }, + { + "$ref": "#/texts/288" + }, + { + "$ref": "#/texts/289" + }, + { + "$ref": "#/texts/290" + }, + { + "$ref": "#/texts/291" + }, + { + "$ref": "#/texts/292" + }, + { + "$ref": "#/texts/293" + }, + { + "$ref": "#/texts/294" + }, + { + "$ref": "#/texts/295" + }, + { + "$ref": "#/texts/296" + }, + { + "$ref": "#/texts/297" + }, + { + "$ref": "#/texts/298" + }, + { + "$ref": "#/texts/299" + }, + { + "$ref": "#/texts/300" + }, + { + "$ref": "#/texts/301" + }, + { + "$ref": "#/texts/302" + }, + { + "$ref": "#/texts/303" + }, + { + "$ref": "#/texts/304" + }, + { + "$ref": "#/texts/305" + }, + { + "$ref": "#/texts/306" + }, + { + "$ref": "#/texts/307" + }, + { + "$ref": "#/texts/308" + }, + { + "$ref": "#/texts/309" + }, + { + "$ref": "#/texts/310" + }, + { + "$ref": "#/texts/311" + }, + { + "$ref": "#/texts/312" + }, + { + "$ref": "#/texts/313" + }, + { + "$ref": "#/texts/314" + }, + { + "$ref": "#/texts/315" + }, + { + "$ref": "#/texts/316" + }, + { + "$ref": "#/texts/317" + }, + { + "$ref": "#/texts/318" + }, + { + "$ref": "#/texts/319" + } + ], + "name": "ordered list", + "label": "ordered_list" + }, + { + "self_ref": "#/groups/40", + "parent": { + "$ref": "#/texts/320" + }, + "children": [ + { + "$ref": "#/texts/321" + }, + { + "$ref": "#/texts/322" + }, + { + "$ref": "#/texts/323" + }, + { + "$ref": "#/texts/324" + }, + { + "$ref": "#/texts/325" + }, + { + "$ref": "#/texts/326" + }, + { + "$ref": "#/texts/327" + }, + { + "$ref": "#/texts/328" + }, + { + "$ref": "#/texts/329" + }, + { + "$ref": "#/texts/330" + }, + { + "$ref": "#/texts/331" + }, + { + "$ref": "#/texts/332" + }, + { + "$ref": "#/texts/333" + }, + { + "$ref": "#/texts/334" + }, + { + "$ref": "#/texts/335" + }, + { + "$ref": "#/texts/336" + }, + { + "$ref": "#/texts/337" + }, + { + "$ref": "#/texts/338" + }, + { + "$ref": "#/texts/339" + }, + { + "$ref": "#/texts/340" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/41", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/342" + }, + { + "$ref": "#/texts/343" + }, + { + "$ref": "#/texts/344" + }, + { + "$ref": "#/texts/345" + }, + { + "$ref": "#/texts/346" + }, + { + "$ref": "#/texts/347" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/42", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/348" + }, + { + "$ref": "#/texts/349" + }, + { + "$ref": "#/texts/350" + }, + { + "$ref": "#/texts/351" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/43", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/352" + }, + { + "$ref": "#/texts/353" + }, + { + "$ref": "#/texts/354" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/44", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/355" + }, + { + "$ref": "#/texts/356" + }, + { + "$ref": "#/texts/357" + }, + { + "$ref": "#/texts/358" + }, + { + "$ref": "#/texts/359" + }, + { + "$ref": "#/texts/360" + }, + { + "$ref": "#/texts/361" + }, + { + "$ref": "#/texts/362" + }, + { + "$ref": "#/texts/363" + }, + { + "$ref": "#/texts/364" + }, + { + "$ref": "#/texts/365" + }, + { + "$ref": "#/texts/366" + }, + { + "$ref": "#/texts/367" + }, + { + "$ref": "#/texts/368" + }, + { + "$ref": "#/texts/369" + }, + { + "$ref": "#/texts/370" + }, + { + "$ref": "#/texts/371" + }, + { + "$ref": "#/texts/372" + }, + { + "$ref": "#/texts/373" + }, + { + "$ref": "#/texts/374" + }, + { + "$ref": "#/texts/375" + }, + { + "$ref": "#/texts/376" + }, + { + "$ref": "#/texts/377" + }, + { + "$ref": "#/texts/378" + }, + { + "$ref": "#/texts/379" + }, + { + "$ref": "#/texts/380" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/45", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/381" + }, + { + "$ref": "#/texts/382" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/46", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/383" + }, + { + "$ref": "#/texts/384" + }, + { + "$ref": "#/texts/385" + }, + { + "$ref": "#/texts/386" + }, + { + "$ref": "#/texts/387" + }, + { + "$ref": "#/texts/388" + }, + { + "$ref": "#/texts/389" + }, + { + "$ref": "#/texts/390" + }, + { + "$ref": "#/texts/391" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/47", + "parent": { + "$ref": "#/texts/341" + }, + "children": [ + { + "$ref": "#/texts/392" + }, + { + "$ref": "#/texts/393" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/48", + "parent": { + "$ref": "#/texts/341" + }, + "children": [], + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Main page", + "text": "Main page", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Contents", + "text": "Contents", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Current events", + "text": "Current events", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Random article", + "text": "Random article", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "About Wikipedia", + "text": "About Wikipedia", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Contact us", + "text": "Contact us", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Help", + "text": "Help", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Learn to edit", + "text": "Learn to edit", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Community portal", + "text": "Community portal", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Recent changes", + "text": "Recent changes", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Upload file", + "text": "Upload file", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Donate", + "text": "Donate", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Create account", + "text": "Create account", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Log in", + "text": "Log in", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Create account", + "text": "Create account", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Log in", + "text": "Log in", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Contributions", + "text": "Contributions", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/7" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Talk", + "text": "Talk", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/8" + }, + "children": [ + { + "$ref": "#/groups/9" + } + ], + "label": "section_header", + "prov": [], + "orig": "Contents", + "text": "Contents", + "level": 2 + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/9" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "(Top)", + "text": "(Top)", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/10" + } + ], + "label": "list_item", + "prov": [], + "orig": "1 Etymology", + "text": "1 Etymology", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/11" + } + ], + "label": "list_item", + "prov": [], + "orig": "2 Taxonomy", + "text": "2 Taxonomy", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/12" + } + ], + "label": "list_item", + "prov": [], + "orig": "3 Morphology", + "text": "3 Morphology", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/13" + } + ], + "label": "list_item", + "prov": [], + "orig": "4 Distribution and habitat", + "text": "4 Distribution and habitat", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/14" + } + ], + "label": "list_item", + "prov": [], + "orig": "5 Behaviour Toggle Behaviour subsection", + "text": "5 Behaviour Toggle Behaviour subsection", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/groups/14" + }, + "children": [ + { + "$ref": "#/groups/15" + } + ], + "label": "list_item", + "prov": [], + "orig": "5.1 Feeding", + "text": "5.1 Feeding", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/groups/14" + }, + "children": [ + { + "$ref": "#/groups/16" + } + ], + "label": "list_item", + "prov": [], + "orig": "5.2 Breeding", + "text": "5.2 Breeding", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/14" + }, + "children": [ + { + "$ref": "#/groups/17" + } + ], + "label": "list_item", + "prov": [], + "orig": "5.3 Communication", + "text": "5.3 Communication", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/14" + }, + "children": [ + { + "$ref": "#/groups/18" + } + ], + "label": "list_item", + "prov": [], + "orig": "5.4 Predators", + "text": "5.4 Predators", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/19" + } + ], + "label": "list_item", + "prov": [], + "orig": "6 Relationship with humans Toggle Relationship with humans subsection", + "text": "6 Relationship with humans Toggle Relationship with humans subsection", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/19" + }, + "children": [ + { + "$ref": "#/groups/20" + } + ], + "label": "list_item", + "prov": [], + "orig": "6.1 Hunting", + "text": "6.1 Hunting", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/groups/19" + }, + "children": [ + { + "$ref": "#/groups/21" + } + ], + "label": "list_item", + "prov": [], + "orig": "6.2 Domestication", + "text": "6.2 Domestication", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/groups/19" + }, + "children": [ + { + "$ref": "#/groups/22" + } + ], + "label": "list_item", + "prov": [], + "orig": "6.3 Heraldry", + "text": "6.3 Heraldry", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/19" + }, + "children": [ + { + "$ref": "#/groups/23" + } + ], + "label": "list_item", + "prov": [], + "orig": "6.4 Cultural references", + "text": "6.4 Cultural references", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/24" + } + ], + "label": "list_item", + "prov": [], + "orig": "7 See also", + "text": "7 See also", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/25" + } + ], + "label": "list_item", + "prov": [], + "orig": "8 Notes Toggle Notes subsection", + "text": "8 Notes Toggle Notes subsection", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/groups/25" + }, + "children": [ + { + "$ref": "#/groups/26" + } + ], + "label": "list_item", + "prov": [], + "orig": "8.1 Citations", + "text": "8.1 Citations", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/groups/25" + }, + "children": [ + { + "$ref": "#/groups/27" + } + ], + "label": "list_item", + "prov": [], + "orig": "8.2 Sources", + "text": "8.2 Sources", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/groups/9" + }, + "children": [ + { + "$ref": "#/groups/28" + } + ], + "label": "list_item", + "prov": [], + "orig": "9 External links", + "text": "9 External links", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/29" + }, + { + "$ref": "#/groups/30" + }, + { + "$ref": "#/groups/31" + }, + { + "$ref": "#/groups/32" + }, + { + "$ref": "#/groups/33" + }, + { + "$ref": "#/groups/34" + }, + { + "$ref": "#/groups/35" + }, + { + "$ref": "#/groups/36" + }, + { + "$ref": "#/pictures/3" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/198" + }, + { + "$ref": "#/texts/199" + }, + { + "$ref": "#/texts/200" + }, + { + "$ref": "#/texts/208" + }, + { + "$ref": "#/texts/213" + }, + { + "$ref": "#/texts/217" + }, + { + "$ref": "#/texts/222" + }, + { + "$ref": "#/texts/242" + }, + { + "$ref": "#/texts/255" + }, + { + "$ref": "#/texts/263" + }, + { + "$ref": "#/texts/341" + } + ], + "label": "title", + "prov": [], + "orig": "Duck", + "text": "Duck" + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ac\u00e8h", + "text": "Ac\u00e8h", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Afrikaans", + "text": "Afrikaans", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Alemannisch", + "text": "Alemannisch", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u12a0\u121b\u122d\u129b", + "text": "\u12a0\u121b\u122d\u129b", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u00c6nglisc", + "text": "\u00c6nglisc", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0627\u0644\u0639\u0631\u0628\u064a\u0629", + "text": "\u0627\u0644\u0639\u0631\u0628\u064a\u0629", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Aragon\u00e9s", + "text": "Aragon\u00e9s", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0710\u072a\u0721\u071d\u0710", + "text": "\u0710\u072a\u0721\u071d\u0710", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/48", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Arm\u00e3neashti", + "text": "Arm\u00e3neashti", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/49", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Asturianu", + "text": "Asturianu", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/50", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Atikamekw", + "text": "Atikamekw", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/51", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0410\u0432\u0430\u0440", + "text": "\u0410\u0432\u0430\u0440", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/52", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Aymar aru", + "text": "Aymar aru", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u062a\u06c6\u0631\u06a9\u062c\u0647", + "text": "\u062a\u06c6\u0631\u06a9\u062c\u0647", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Basa Bali", + "text": "Basa Bali", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u09ac\u09be\u0982\u09b2\u09be", + "text": "\u09ac\u09be\u0982\u09b2\u09be", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u95a9\u5357\u8a9e / B\u00e2n-l\u00e2m-g\u00fa", + "text": "\u95a9\u5357\u8a9e / B\u00e2n-l\u00e2m-g\u00fa", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0411\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f", + "text": "\u0411\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/58", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0411\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f (\u0442\u0430\u0440\u0430\u0448\u043a\u0435\u0432\u0456\u0446\u0430)", + "text": "\u0411\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f (\u0442\u0430\u0440\u0430\u0448\u043a\u0435\u0432\u0456\u0446\u0430)", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/59", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Bikol Central", + "text": "Bikol Central", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/60", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438", + "text": "\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/61", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Brezhoneg", + "text": "Brezhoneg", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/62", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0411\u0443\u0440\u044f\u0430\u0434", + "text": "\u0411\u0443\u0440\u044f\u0430\u0434", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Catal\u00e0", + "text": "Catal\u00e0", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0427\u04d1\u0432\u0430\u0448\u043b\u0430", + "text": "\u0427\u04d1\u0432\u0430\u0448\u043b\u0430", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u010ce\u0161tina", + "text": "\u010ce\u0161tina", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "ChiShona", + "text": "ChiShona", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/67", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Cymraeg", + "text": "Cymraeg", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/68", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Dagbanli", + "text": "Dagbanli", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/69", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Dansk", + "text": "Dansk", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/70", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Deitsch", + "text": "Deitsch", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/71", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Deutsch", + "text": "Deutsch", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/72", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0921\u094b\u091f\u0947\u0932\u0940", + "text": "\u0921\u094b\u091f\u0947\u0932\u0940", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/73", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac", + "text": "\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/74", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Emili\u00e0n e rumagn\u00f2l", + "text": "Emili\u00e0n e rumagn\u00f2l", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/75", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Espa\u00f1ol", + "text": "Espa\u00f1ol", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/76", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Esperanto", + "text": "Esperanto", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/77", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Euskara", + "text": "Euskara", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/78", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0641\u0627\u0631\u0633\u06cc", + "text": "\u0641\u0627\u0631\u0633\u06cc", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/79", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Fran\u00e7ais", + "text": "Fran\u00e7ais", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/80", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Gaeilge", + "text": "Gaeilge", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/81", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Galego", + "text": "Galego", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/82", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0413\u04c0\u0430\u043b\u0433\u04c0\u0430\u0439", + "text": "\u0413\u04c0\u0430\u043b\u0433\u04c0\u0430\u0439", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/83", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u8d1b\u8a9e", + "text": "\u8d1b\u8a9e", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/84", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u06af\u06cc\u0644\u06a9\u06cc", + "text": "\u06af\u06cc\u0644\u06a9\u06cc", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/85", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\ud800\udf32\ud800\udf3f\ud800\udf44\ud800\udf39\ud800\udf43\ud800\udf3a", + "text": "\ud800\udf32\ud800\udf3f\ud800\udf44\ud800\udf39\ud800\udf43\ud800\udf3a", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/86", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0917\u094b\u0902\u092f\u091a\u0940 \u0915\u094b\u0902\u0915\u0923\u0940 / G\u00f5ychi Konknni", + "text": "\u0917\u094b\u0902\u092f\u091a\u0940 \u0915\u094b\u0902\u0915\u0923\u0940 / G\u00f5ychi Konknni", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/87", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u5ba2\u5bb6\u8a9e / Hak-k\u00e2-ng\u00ee", + "text": "\u5ba2\u5bb6\u8a9e / Hak-k\u00e2-ng\u00ee", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/88", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\ud55c\uad6d\uc5b4", + "text": "\ud55c\uad6d\uc5b4", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/89", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Hausa", + "text": "Hausa", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/90", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0540\u0561\u0575\u0565\u0580\u0565\u0576", + "text": "\u0540\u0561\u0575\u0565\u0580\u0565\u0576", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/91", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0939\u093f\u0928\u094d\u0926\u0940", + "text": "\u0939\u093f\u0928\u094d\u0926\u0940", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/92", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Hrvatski", + "text": "Hrvatski", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/93", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ido", + "text": "Ido", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/94", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Bahasa Indonesia", + "text": "Bahasa Indonesia", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/95", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "I\u00f1upiatun", + "text": "I\u00f1upiatun", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/96", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u00cdslenska", + "text": "\u00cdslenska", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/97", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Italiano", + "text": "Italiano", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/98", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u05e2\u05d1\u05e8\u05d9\u05ea", + "text": "\u05e2\u05d1\u05e8\u05d9\u05ea", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/99", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Jawa", + "text": "Jawa", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/100", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0c95\u0ca8\u0ccd\u0ca8\u0ca1", + "text": "\u0c95\u0ca8\u0ccd\u0ca8\u0ca1", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/101", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Kapampangan", + "text": "Kapampangan", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/102", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u10e5\u10d0\u10e0\u10d7\u10e3\u10da\u10d8", + "text": "\u10e5\u10d0\u10e0\u10d7\u10e3\u10da\u10d8", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/103", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0915\u0949\u0936\u0941\u0930 / \u06a9\u0672\u0634\u064f\u0631", + "text": "\u0915\u0949\u0936\u0941\u0930 / \u06a9\u0672\u0634\u064f\u0631", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/104", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u049a\u0430\u0437\u0430\u049b\u0448\u0430", + "text": "\u049a\u0430\u0437\u0430\u049b\u0448\u0430", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/105", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ikirundi", + "text": "Ikirundi", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/106", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Kongo", + "text": "Kongo", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/107", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Krey\u00f2l ayisyen", + "text": "Krey\u00f2l ayisyen", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/108", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u041a\u044b\u0440\u044b\u043a \u043c\u0430\u0440\u044b", + "text": "\u041a\u044b\u0440\u044b\u043a \u043c\u0430\u0440\u044b", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/109", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0ea5\u0eb2\u0ea7", + "text": "\u0ea5\u0eb2\u0ea7", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/110", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Latina", + "text": "Latina", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/111", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Latvie\u0161u", + "text": "Latvie\u0161u", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/112", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Lietuvi\u0173", + "text": "Lietuvi\u0173", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/113", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Li Niha", + "text": "Li Niha", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/114", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ligure", + "text": "Ligure", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/115", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Limburgs", + "text": "Limburgs", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/116", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ling\u00e1la", + "text": "Ling\u00e1la", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/117", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Malagasy", + "text": "Malagasy", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/118", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0d2e\u0d32\u0d2f\u0d3e\u0d33\u0d02", + "text": "\u0d2e\u0d32\u0d2f\u0d3e\u0d33\u0d02", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/119", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u092e\u0930\u093e\u0920\u0940", + "text": "\u092e\u0930\u093e\u0920\u0940", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/120", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0645\u0627\u0632\u0650\u0631\u0648\u0646\u06cc", + "text": "\u0645\u0627\u0632\u0650\u0631\u0648\u0646\u06cc", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/121", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Bahasa Melayu", + "text": "Bahasa Melayu", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/122", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\uabc3\uabe4\uabc7\uabe9 \uabc2\uabe3\uabdf", + "text": "\uabc3\uabe4\uabc7\uabe9 \uabc2\uabe3\uabdf", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/123", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u95a9\u6771\u8a9e / M\u00ecng-d\u0115\u0324ng-ng\u1e73\u0304", + "text": "\u95a9\u6771\u8a9e / M\u00ecng-d\u0115\u0324ng-ng\u1e73\u0304", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/124", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u041c\u043e\u043a\u0448\u0435\u043d\u044c", + "text": "\u041c\u043e\u043a\u0448\u0435\u043d\u044c", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/125", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u041c\u043e\u043d\u0433\u043e\u043b", + "text": "\u041c\u043e\u043d\u0433\u043e\u043b", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/126", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u1019\u103c\u1014\u103a\u1019\u102c\u1018\u102c\u101e\u102c", + "text": "\u1019\u103c\u1014\u103a\u1019\u102c\u1018\u102c\u101e\u102c", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/127", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Nederlands", + "text": "Nederlands", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/128", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Nedersaksies", + "text": "Nedersaksies", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/129", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0928\u0947\u092a\u093e\u0932\u0940", + "text": "\u0928\u0947\u092a\u093e\u0932\u0940", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/130", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0928\u0947\u092a\u093e\u0932 \u092d\u093e\u0937\u093e", + "text": "\u0928\u0947\u092a\u093e\u0932 \u092d\u093e\u0937\u093e", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/131", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u65e5\u672c\u8a9e", + "text": "\u65e5\u672c\u8a9e", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/132", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u041d\u043e\u0445\u0447\u0438\u0439\u043d", + "text": "\u041d\u043e\u0445\u0447\u0438\u0439\u043d", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/133", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Norsk nynorsk", + "text": "Norsk nynorsk", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/134", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Occitan", + "text": "Occitan", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/135", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Oromoo", + "text": "Oromoo", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/136", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0a2a\u0a70\u0a1c\u0a3e\u0a2c\u0a40", + "text": "\u0a2a\u0a70\u0a1c\u0a3e\u0a2c\u0a40", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/137", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Picard", + "text": "Picard", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/138", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Plattd\u00fc\u00fctsch", + "text": "Plattd\u00fc\u00fctsch", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/139", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Polski", + "text": "Polski", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/140", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Portugu\u00eas", + "text": "Portugu\u00eas", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/141", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Q\u0131r\u0131mtatarca", + "text": "Q\u0131r\u0131mtatarca", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/142", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Rom\u00e2n\u0103", + "text": "Rom\u00e2n\u0103", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/143", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0420\u0443\u0441\u0441\u043a\u0438\u0439", + "text": "\u0420\u0443\u0441\u0441\u043a\u0438\u0439", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/144", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0421\u0430\u0445\u0430 \u0442\u044b\u043b\u0430", + "text": "\u0421\u0430\u0445\u0430 \u0442\u044b\u043b\u0430", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/145", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u1c65\u1c5f\u1c71\u1c5b\u1c5f\u1c72\u1c64", + "text": "\u1c65\u1c5f\u1c71\u1c5b\u1c5f\u1c72\u1c64", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/146", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Sardu", + "text": "Sardu", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/147", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Scots", + "text": "Scots", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/148", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Seeltersk", + "text": "Seeltersk", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/149", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Shqip", + "text": "Shqip", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/150", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Sicilianu", + "text": "Sicilianu", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/151", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0dc3\u0dd2\u0d82\u0dc4\u0dbd", + "text": "\u0dc3\u0dd2\u0d82\u0dc4\u0dbd", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/152", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Simple English", + "text": "Simple English", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/153", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0633\u0646\u068c\u064a", + "text": "\u0633\u0646\u068c\u064a", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/154", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u06a9\u0648\u0631\u062f\u06cc", + "text": "\u06a9\u0648\u0631\u062f\u06cc", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/155", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0421\u0440\u043f\u0441\u043a\u0438 / srpski", + "text": "\u0421\u0440\u043f\u0441\u043a\u0438 / srpski", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/156", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Srpskohrvatski / \u0441\u0440\u043f\u0441\u043a\u043e\u0445\u0440\u0432\u0430\u0442\u0441\u043a\u0438", + "text": "Srpskohrvatski / \u0441\u0440\u043f\u0441\u043a\u043e\u0445\u0440\u0432\u0430\u0442\u0441\u043a\u0438", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/157", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Sunda", + "text": "Sunda", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/158", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Svenska", + "text": "Svenska", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/159", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Tagalog", + "text": "Tagalog", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/160", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd", + "text": "\u0ba4\u0bae\u0bbf\u0bb4\u0bcd", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/161", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Taqbaylit", + "text": "Taqbaylit", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/162", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0422\u0430\u0442\u0430\u0440\u0447\u0430 / tatar\u00e7a", + "text": "\u0422\u0430\u0442\u0430\u0440\u0447\u0430 / tatar\u00e7a", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/163", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0e44\u0e17\u0e22", + "text": "\u0e44\u0e17\u0e22", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/164", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "T\u00fcrk\u00e7e", + "text": "T\u00fcrk\u00e7e", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/165", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430", + "text": "\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/166", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u0626\u06c7\u064a\u063a\u06c7\u0631\u0686\u06d5 / Uyghurche", + "text": "\u0626\u06c7\u064a\u063a\u06c7\u0631\u0686\u06d5 / Uyghurche", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/167", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Vahcuengh", + "text": "Vahcuengh", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/168", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ti\u1ebfng Vi\u1ec7t", + "text": "Ti\u1ebfng Vi\u1ec7t", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/169", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Walon", + "text": "Walon", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/170", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u6587\u8a00", + "text": "\u6587\u8a00", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/171", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Winaray", + "text": "Winaray", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/172", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u5434\u8bed", + "text": "\u5434\u8bed", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/173", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u7cb5\u8a9e", + "text": "\u7cb5\u8a9e", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/174", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u017demait\u0117\u0161ka", + "text": "\u017demait\u0117\u0161ka", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/175", + "parent": { + "$ref": "#/groups/29" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "\u4e2d\u6587", + "text": "\u4e2d\u6587", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/176", + "parent": { + "$ref": "#/groups/30" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Article", + "text": "Article", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/177", + "parent": { + "$ref": "#/groups/30" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Talk", + "text": "Talk", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/178", + "parent": { + "$ref": "#/groups/32" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Read", + "text": "Read", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/179", + "parent": { + "$ref": "#/groups/32" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "View source", + "text": "View source", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/180", + "parent": { + "$ref": "#/groups/32" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "View history", + "text": "View history", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/181", + "parent": { + "$ref": "#/groups/33" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Read", + "text": "Read", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/182", + "parent": { + "$ref": "#/groups/33" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "View source", + "text": "View source", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/183", + "parent": { + "$ref": "#/groups/33" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "View history", + "text": "View history", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/184", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "What links here", + "text": "What links here", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/185", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Related changes", + "text": "Related changes", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/186", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Upload file", + "text": "Upload file", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/187", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Special pages", + "text": "Special pages", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/188", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Permanent link", + "text": "Permanent link", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/189", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Page information", + "text": "Page information", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/190", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Cite this page", + "text": "Cite this page", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/191", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Get shortened URL", + "text": "Get shortened URL", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/192", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Download QR code", + "text": "Download QR code", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/193", + "parent": { + "$ref": "#/groups/34" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Wikidata item", + "text": "Wikidata item", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/194", + "parent": { + "$ref": "#/groups/35" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Download as PDF", + "text": "Download as PDF", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/195", + "parent": { + "$ref": "#/groups/35" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Printable version", + "text": "Printable version", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/196", + "parent": { + "$ref": "#/groups/36" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Wikimedia Commons", + "text": "Wikimedia Commons", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/197", + "parent": { + "$ref": "#/groups/36" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Wikiquote", + "text": "Wikiquote", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/198", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Duck is the common name for numerous species of waterfowl in the family Anatidae. Ducks are generally smaller and shorter-necked than swans and geese, which are members of the same family. Divided among several subfamilies, they are a form taxon; they do not represent a monophyletic group (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly aquatic birds, and may be found in both fresh water and sea water.", + "text": "Duck is the common name for numerous species of waterfowl in the family Anatidae. Ducks are generally smaller and shorter-necked than swans and geese, which are members of the same family. Divided among several subfamilies, they are a form taxon; they do not represent a monophyletic group (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly aquatic birds, and may be found in both fresh water and sea water." + }, + { + "self_ref": "#/texts/199", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as loons or divers, grebes, gallinules and coots.", + "text": "Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as loons or divers, grebes, gallinules and coots." + }, + { + "self_ref": "#/texts/200", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/201" + }, + { + "$ref": "#/pictures/4" + }, + { + "$ref": "#/texts/203" + }, + { + "$ref": "#/texts/204" + }, + { + "$ref": "#/texts/205" + }, + { + "$ref": "#/pictures/5" + }, + { + "$ref": "#/pictures/6" + } + ], + "label": "section_header", + "prov": [], + "orig": "Etymology", + "text": "Etymology", + "level": 2 + }, + { + "self_ref": "#/texts/201", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "The word duck comes from Old English d\u016bce 'diver', a derivative of the verb *d\u016bcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the dabbling duck group feed by upending; compare with Dutch duiken and German tauchen 'to dive'.", + "text": "The word duck comes from Old English d\u016bce 'diver', a derivative of the verb *d\u016bcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the dabbling duck group feed by upending; compare with Dutch duiken and German tauchen 'to dive'." + }, + { + "self_ref": "#/texts/202", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Pacific black duck displaying the characteristic upending \"duck\"", + "text": "Pacific black duck displaying the characteristic upending \"duck\"" + }, + { + "self_ref": "#/texts/203", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This word replaced Old English ened /\u00e6nid 'duck', possibly to avoid confusion with other words, such as ende 'end' with similar forms. Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and Norwegian and. The word ened /\u00e6nid was inherited from Proto-Indo-European; cf. Latin anas \"duck\", Lithuanian \u00e1ntis 'duck', Ancient Greek \u03bd\u1fc6\u03c3\u03c3\u03b1 /\u03bd\u1fc6\u03c4\u03c4\u03b1 (n\u0113ssa /n\u0113tta) 'duck', and Sanskrit \u0101t\u00ed 'water bird', among others.", + "text": "This word replaced Old English ened /\u00e6nid 'duck', possibly to avoid confusion with other words, such as ende 'end' with similar forms. Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and Norwegian and. The word ened /\u00e6nid was inherited from Proto-Indo-European; cf. Latin anas \"duck\", Lithuanian \u00e1ntis 'duck', Ancient Greek \u03bd\u1fc6\u03c3\u03c3\u03b1 /\u03bd\u1fc6\u03c4\u03c4\u03b1 (n\u0113ssa /n\u0113tta) 'duck', and Sanskrit \u0101t\u00ed 'water bird', among others." + }, + { + "self_ref": "#/texts/204", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A duckling is a young duck in downy plumage[1] or baby duck,[2] but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling.", + "text": "A duckling is a young duck in downy plumage[1] or baby duck,[2] but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling." + }, + { + "self_ref": "#/texts/205", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A male is called a drake and the female is called a duck, or in ornithology a hen.[3][4]", + "text": "A male is called a drake and the female is called a duck, or in ornithology a hen.[3][4]" + }, + { + "self_ref": "#/texts/206", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Male mallard.", + "text": "Male mallard." + }, + { + "self_ref": "#/texts/207", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Wood ducks.", + "text": "Wood ducks." + }, + { + "self_ref": "#/texts/208", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/209" + }, + { + "$ref": "#/pictures/7" + }, + { + "$ref": "#/texts/211" + }, + { + "$ref": "#/texts/212" + } + ], + "label": "section_header", + "prov": [], + "orig": "Taxonomy", + "text": "Taxonomy", + "level": 2 + }, + { + "self_ref": "#/texts/209", + "parent": { + "$ref": "#/texts/208" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "All ducks belong to the biological order Anseriformes, a group that contains the ducks, geese and swans, as well as the screamers, and the magpie goose.[5] All except the screamers belong to the biological family Anatidae.[5] Within the family, ducks are split into a variety of subfamilies and 'tribes'. The number and composition of these subfamilies and tribes is the cause of considerable disagreement among taxonomists.[5] Some base their decisions on morphological characteristics, others on shared behaviours or genetic studies.[6][7] The number of suggested subfamilies containing ducks ranges from two to five.[8][9] The significant level of hybridisation that occurs among wild ducks complicates efforts to tease apart the relationships between various species.[9]", + "text": "All ducks belong to the biological order Anseriformes, a group that contains the ducks, geese and swans, as well as the screamers, and the magpie goose.[5] All except the screamers belong to the biological family Anatidae.[5] Within the family, ducks are split into a variety of subfamilies and 'tribes'. The number and composition of these subfamilies and tribes is the cause of considerable disagreement among taxonomists.[5] Some base their decisions on morphological characteristics, others on shared behaviours or genetic studies.[6][7] The number of suggested subfamilies containing ducks ranges from two to five.[8][9] The significant level of hybridisation that occurs among wild ducks complicates efforts to tease apart the relationships between various species.[9]" + }, + { + "self_ref": "#/texts/210", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Mallard landing in approach", + "text": "Mallard landing in approach" + }, + { + "self_ref": "#/texts/211", + "parent": { + "$ref": "#/texts/208" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks \u2013 named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14]", + "text": "In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks \u2013 named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14]" + }, + { + "self_ref": "#/texts/212", + "parent": { + "$ref": "#/texts/208" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other subfamilies or tribes. The whistling ducks are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae,[15] or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae).[9][16] The freckled duck of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae,[15] or in its own family, the Stictonettinae.[9] The shelducks make up the tribe Tadornini in the family Anserinae in some classifications,[15] and their own subfamily, Tadorninae, in others,[17] while the steamer ducks are either placed in the family Anserinae in the tribe Tachyerini[15] or lumped with the shelducks in the tribe Tadorini.[9] The perching ducks make up in the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini.[9] The torrent duck is generally included in the subfamily Anserinae in the monotypic tribe Merganettini,[15] but is sometimes included in the tribe Tadornini.[18] The pink-eared duck is sometimes included as a true duck either in the tribe Anatini[15] or the tribe Malacorhynchini,[19] and other times is included with the shelducks in the tribe Tadornini.[15]", + "text": "A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other subfamilies or tribes. The whistling ducks are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae,[15] or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae).[9][16] The freckled duck of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae,[15] or in its own family, the Stictonettinae.[9] The shelducks make up the tribe Tadornini in the family Anserinae in some classifications,[15] and their own subfamily, Tadorninae, in others,[17] while the steamer ducks are either placed in the family Anserinae in the tribe Tachyerini[15] or lumped with the shelducks in the tribe Tadorini.[9] The perching ducks make up in the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini.[9] The torrent duck is generally included in the subfamily Anserinae in the monotypic tribe Merganettini,[15] but is sometimes included in the tribe Tadornini.[18] The pink-eared duck is sometimes included as a true duck either in the tribe Anatini[15] or the tribe Malacorhynchini,[19] and other times is included with the shelducks in the tribe Tadornini.[15]" + }, + { + "self_ref": "#/texts/213", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/pictures/8" + }, + { + "$ref": "#/texts/215" + }, + { + "$ref": "#/texts/216" + } + ], + "label": "section_header", + "prov": [], + "orig": "Morphology", + "text": "Morphology", + "level": 2 + }, + { + "self_ref": "#/texts/214", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Male Mandarin duck", + "text": "Male Mandarin duck" + }, + { + "self_ref": "#/texts/215", + "parent": { + "$ref": "#/texts/213" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "The overall body plan of ducks is elongated and broad, and they are also relatively long-necked, albeit not as long-necked as the geese and swans. The body shape of diving ducks varies somewhat from this in being more rounded. The bill is usually broad and contains serrated pectens, which are particularly well defined in the filter-feeding species. In the case of some fishing species the bill is long and strongly serrated. The scaled legs are strong and well developed, and generally set far back on the body, more so in the highly aquatic species. The wings are very strong and are generally short and pointed, and the flight of ducks requires fast continuous strokes, requiring in turn strong wing muscles. Three species of steamer duck are almost flightless, however. Many species of duck are temporarily flightless while moulting; they seek out protected habitat with good food supplies during this period. This moult typically precedes migration.", + "text": "The overall body plan of ducks is elongated and broad, and they are also relatively long-necked, albeit not as long-necked as the geese and swans. The body shape of diving ducks varies somewhat from this in being more rounded. The bill is usually broad and contains serrated pectens, which are particularly well defined in the filter-feeding species. In the case of some fishing species the bill is long and strongly serrated. The scaled legs are strong and well developed, and generally set far back on the body, more so in the highly aquatic species. The wings are very strong and are generally short and pointed, and the flight of ducks requires fast continuous strokes, requiring in turn strong wing muscles. Three species of steamer duck are almost flightless, however. Many species of duck are temporarily flightless while moulting; they seek out protected habitat with good food supplies during this period. This moult typically precedes migration." + }, + { + "self_ref": "#/texts/216", + "parent": { + "$ref": "#/texts/213" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "The drakes of northern species often have extravagant plumage, but that is moulted in summer to give a more female-like appearance, the \"eclipse\" plumage. Southern resident species typically show less sexual dimorphism, although there are exceptions such as the paradise shelduck of New Zealand, which is both strikingly sexually dimorphic and in which the female's plumage is brighter than that of the male. The plumage of juvenile birds generally resembles that of the female. Female ducks have evolved to have a corkscrew shaped vagina to prevent rape.", + "text": "The drakes of northern species often have extravagant plumage, but that is moulted in summer to give a more female-like appearance, the \"eclipse\" plumage. Southern resident species typically show less sexual dimorphism, although there are exceptions such as the paradise shelduck of New Zealand, which is both strikingly sexually dimorphic and in which the female's plumage is brighter than that of the male. The plumage of juvenile birds generally resembles that of the female. Female ducks have evolved to have a corkscrew shaped vagina to prevent rape." + }, + { + "self_ref": "#/texts/217", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/pictures/9" + }, + { + "$ref": "#/texts/219" + }, + { + "$ref": "#/pictures/10" + }, + { + "$ref": "#/texts/221" + } + ], + "label": "section_header", + "prov": [], + "orig": "Distribution and habitat", + "text": "Distribution and habitat", + "level": 2 + }, + { + "self_ref": "#/texts/218", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Flying steamer ducks in Ushuaia, Argentina", + "text": "Flying steamer ducks in Ushuaia, Argentina" + }, + { + "self_ref": "#/texts/219", + "parent": { + "$ref": "#/texts/217" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks have a cosmopolitan distribution, and are found on every continent except Antarctica.[5] Several species manage to live on subantarctic islands, including South Georgia and the Auckland Islands.[20] Ducks have reached a number of isolated oceanic islands, including the Hawaiian Islands, Micronesia and the Gal\u00e1pagos Islands, where they are often vagrants and less often residents.[21][22] A handful are endemic to such far-flung islands.[21]", + "text": "Ducks have a cosmopolitan distribution, and are found on every continent except Antarctica.[5] Several species manage to live on subantarctic islands, including South Georgia and the Auckland Islands.[20] Ducks have reached a number of isolated oceanic islands, including the Hawaiian Islands, Micronesia and the Gal\u00e1pagos Islands, where they are often vagrants and less often residents.[21][22] A handful are endemic to such far-flung islands.[21]" + }, + { + "self_ref": "#/texts/220", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Female mallard in Cornwall, England", + "text": "Female mallard in Cornwall, England" + }, + { + "self_ref": "#/texts/221", + "parent": { + "$ref": "#/texts/217" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Some duck species, mainly those breeding in the temperate and Arctic Northern Hemisphere, are migratory; those in the tropics are generally not. Some ducks, particularly in Australia where rainfall is erratic, are nomadic, seeking out the temporary lakes and pools that form after localised heavy rain.[23]", + "text": "Some duck species, mainly those breeding in the temperate and Arctic Northern Hemisphere, are migratory; those in the tropics are generally not. Some ducks, particularly in Australia where rainfall is erratic, are nomadic, seeking out the temporary lakes and pools that form after localised heavy rain.[23]" + }, + { + "self_ref": "#/texts/222", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/223" + }, + { + "$ref": "#/texts/232" + }, + { + "$ref": "#/texts/235" + }, + { + "$ref": "#/texts/238" + } + ], + "label": "section_header", + "prov": [], + "orig": "Behaviour", + "text": "Behaviour", + "level": 2 + }, + { + "self_ref": "#/texts/223", + "parent": { + "$ref": "#/texts/222" + }, + "children": [ + { + "$ref": "#/pictures/11" + }, + { + "$ref": "#/pictures/12" + }, + { + "$ref": "#/texts/226" + }, + { + "$ref": "#/texts/227" + }, + { + "$ref": "#/texts/228" + }, + { + "$ref": "#/texts/229" + }, + { + "$ref": "#/texts/230" + }, + { + "$ref": "#/texts/231" + } + ], + "label": "section_header", + "prov": [], + "orig": "Feeding", + "text": "Feeding", + "level": 3 + }, + { + "self_ref": "#/texts/224", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Pecten along the bill", + "text": "Pecten along the bill" + }, + { + "self_ref": "#/texts/225", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Mallard duckling preening", + "text": "Mallard duckling preening" + }, + { + "self_ref": "#/texts/226", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks eat food sources such as grasses, aquatic plants, fish, insects, small amphibians, worms, and small molluscs.", + "text": "Ducks eat food sources such as grasses, aquatic plants, fish, insects, small amphibians, worms, and small molluscs." + }, + { + "self_ref": "#/texts/227", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Dabbling ducks feed on the surface of water or on land, or as deep as they can reach by up-ending without completely submerging.[24] Along the edge of the bill, there is a comb-like structure called a pecten. This strains the water squirting from the side of the bill and traps any food. The pecten is also used to preen feathers and to hold slippery food items.", + "text": "Dabbling ducks feed on the surface of water or on land, or as deep as they can reach by up-ending without completely submerging.[24] Along the edge of the bill, there is a comb-like structure called a pecten. This strains the water squirting from the side of the bill and traps any food. The pecten is also used to preen feathers and to hold slippery food items." + }, + { + "self_ref": "#/texts/228", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Diving ducks and sea ducks forage deep underwater. To be able to submerge more easily, the diving ducks are heavier than dabbling ducks, and therefore have more difficulty taking off to fly.", + "text": "Diving ducks and sea ducks forage deep underwater. To be able to submerge more easily, the diving ducks are heavier than dabbling ducks, and therefore have more difficulty taking off to fly." + }, + { + "self_ref": "#/texts/229", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A few specialized species such as the mergansers are adapted to catch and swallow large fish.", + "text": "A few specialized species such as the mergansers are adapted to catch and swallow large fish." + }, + { + "self_ref": "#/texts/230", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "The others have the characteristic wide flat bill adapted to dredging-type jobs such as pulling up waterweed, pulling worms and small molluscs out of mud, searching for insect larvae, and bulk jobs such as dredging out, holding, turning head first, and swallowing a squirming frog. To avoid injury when digging into sediment it has no cere, but the nostrils come out through hard horn.", + "text": "The others have the characteristic wide flat bill adapted to dredging-type jobs such as pulling up waterweed, pulling worms and small molluscs out of mud, searching for insect larvae, and bulk jobs such as dredging out, holding, turning head first, and swallowing a squirming frog. To avoid injury when digging into sediment it has no cere, but the nostrils come out through hard horn." + }, + { + "self_ref": "#/texts/231", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "The Guardian published an article advising that ducks should not be fed with bread because it damages the health of the ducks and pollutes waterways.[25]", + "text": "The Guardian published an article advising that ducks should not be fed with bread because it damages the health of the ducks and pollutes waterways.[25]" + }, + { + "self_ref": "#/texts/232", + "parent": { + "$ref": "#/texts/222" + }, + "children": [ + { + "$ref": "#/pictures/13" + }, + { + "$ref": "#/texts/234" + } + ], + "label": "section_header", + "prov": [], + "orig": "Breeding", + "text": "Breeding", + "level": 3 + }, + { + "self_ref": "#/texts/233", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "A Muscovy duckling", + "text": "A Muscovy duckling" + }, + { + "self_ref": "#/texts/234", + "parent": { + "$ref": "#/texts/232" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks generally only have one partner at a time, although the partnership usually only lasts one year.[26] Larger species and the more sedentary species (like fast-river specialists) tend to have pair-bonds that last numerous years.[27] Most duck species breed once a year, choosing to do so in favourable conditions (spring/summer or wet seasons). Ducks also tend to make a nest before breeding, and, after hatching, lead their ducklings to water. Mother ducks are very caring and protective of their young, but may abandon some of their ducklings if they are physically stuck in an area they cannot get out of (such as nesting in an enclosed courtyard) or are not prospering due to genetic defects or sickness brought about by hypothermia, starvation, or disease. Ducklings can also be orphaned by inconsistent late hatching where a few eggs hatch after the mother has abandoned the nest and led her ducklings to water.[28]", + "text": "Ducks generally only have one partner at a time, although the partnership usually only lasts one year.[26] Larger species and the more sedentary species (like fast-river specialists) tend to have pair-bonds that last numerous years.[27] Most duck species breed once a year, choosing to do so in favourable conditions (spring/summer or wet seasons). Ducks also tend to make a nest before breeding, and, after hatching, lead their ducklings to water. Mother ducks are very caring and protective of their young, but may abandon some of their ducklings if they are physically stuck in an area they cannot get out of (such as nesting in an enclosed courtyard) or are not prospering due to genetic defects or sickness brought about by hypothermia, starvation, or disease. Ducklings can also be orphaned by inconsistent late hatching where a few eggs hatch after the mother has abandoned the nest and led her ducklings to water.[28]" + }, + { + "self_ref": "#/texts/235", + "parent": { + "$ref": "#/texts/222" + }, + "children": [ + { + "$ref": "#/texts/236" + }, + { + "$ref": "#/texts/237" + } + ], + "label": "section_header", + "prov": [], + "orig": "Communication", + "text": "Communication", + "level": 3 + }, + { + "self_ref": "#/texts/236", + "parent": { + "$ref": "#/texts/235" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic \"quack\" sound while males make a similar but raspier sound that is sometimes written as \"breeeeze\",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not \"quack\".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup \u2013 which are diving ducks \u2013 make a noise like \"scaup\" (hence their name). Calls may be loud displaying calls or quieter contact calls.", + "text": "Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic \"quack\" sound while males make a similar but raspier sound that is sometimes written as \"breeeeze\",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not \"quack\".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup \u2013 which are diving ducks \u2013 make a noise like \"scaup\" (hence their name). Calls may be loud displaying calls or quieter contact calls." + }, + { + "self_ref": "#/texts/237", + "parent": { + "$ref": "#/texts/235" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A common urban legend claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the University of Salford in 2003 as part of the British Association's Festival of Science.[31] It was also debunked in one of the earlier episodes of the popular Discovery Channel television show MythBusters.[32]", + "text": "A common urban legend claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the University of Salford in 2003 as part of the British Association's Festival of Science.[31] It was also debunked in one of the earlier episodes of the popular Discovery Channel television show MythBusters.[32]" + }, + { + "self_ref": "#/texts/238", + "parent": { + "$ref": "#/texts/222" + }, + "children": [ + { + "$ref": "#/pictures/14" + }, + { + "$ref": "#/texts/240" + }, + { + "$ref": "#/texts/241" + } + ], + "label": "section_header", + "prov": [], + "orig": "Predators", + "text": "Predators", + "level": 3 + }, + { + "self_ref": "#/texts/239", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Ringed teal", + "text": "Ringed teal" + }, + { + "self_ref": "#/texts/240", + "parent": { + "$ref": "#/texts/238" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks have many predators. Ducklings are particularly vulnerable, since their inability to fly makes them easy prey not only for predatory birds but also for large fish like pike, crocodilians, predatory testudines such as the alligator snapping turtle, and other aquatic hunters, including fish-eating birds such as herons. Ducks' nests are raided by land-based predators, and brooding females may be caught unaware on the nest by mammals, such as foxes, or large birds, such as hawks or owls.", + "text": "Ducks have many predators. Ducklings are particularly vulnerable, since their inability to fly makes them easy prey not only for predatory birds but also for large fish like pike, crocodilians, predatory testudines such as the alligator snapping turtle, and other aquatic hunters, including fish-eating birds such as herons. Ducks' nests are raided by land-based predators, and brooding females may be caught unaware on the nest by mammals, such as foxes, or large birds, such as hawks or owls." + }, + { + "self_ref": "#/texts/241", + "parent": { + "$ref": "#/texts/238" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Adult ducks are fast fliers, but may be caught on the water by large aquatic predators including big fish such as the North American muskie and the European pike. In flight, ducks are safe from all but a few predators such as humans and the peregrine falcon, which uses its speed and strength to catch ducks.", + "text": "Adult ducks are fast fliers, but may be caught on the water by large aquatic predators including big fish such as the North American muskie and the European pike. In flight, ducks are safe from all but a few predators such as humans and the peregrine falcon, which uses its speed and strength to catch ducks." + }, + { + "self_ref": "#/texts/242", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/243" + }, + { + "$ref": "#/texts/246" + }, + { + "$ref": "#/texts/249" + }, + { + "$ref": "#/texts/252" + } + ], + "label": "section_header", + "prov": [], + "orig": "Relationship with humans", + "text": "Relationship with humans", + "level": 2 + }, + { + "self_ref": "#/texts/243", + "parent": { + "$ref": "#/texts/242" + }, + "children": [ + { + "$ref": "#/texts/244" + }, + { + "$ref": "#/texts/245" + } + ], + "label": "section_header", + "prov": [], + "orig": "Hunting", + "text": "Hunting", + "level": 3 + }, + { + "self_ref": "#/texts/244", + "parent": { + "$ref": "#/texts/243" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Humans have hunted ducks since prehistoric times. Excavations of middens in California dating to 7800 \u2013 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in \"significant numbers\" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that M\u0101ori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42]", + "text": "Humans have hunted ducks since prehistoric times. Excavations of middens in California dating to 7800 \u2013 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in \"significant numbers\" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that M\u0101ori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42]" + }, + { + "self_ref": "#/texts/245", + "parent": { + "$ref": "#/texts/243" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport,[43] by shooting, or by being trapped using duck decoys. Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, \"a sitting duck\" has come to mean \"an easy target\". These ducks may be contaminated by pollutants such as PCBs.[44]", + "text": "In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport,[43] by shooting, or by being trapped using duck decoys. Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, \"a sitting duck\" has come to mean \"an easy target\". These ducks may be contaminated by pollutants such as PCBs.[44]" + }, + { + "self_ref": "#/texts/246", + "parent": { + "$ref": "#/texts/242" + }, + "children": [ + { + "$ref": "#/pictures/15" + }, + { + "$ref": "#/texts/248" + } + ], + "label": "section_header", + "prov": [], + "orig": "Domestication", + "text": "Domestication", + "level": 3 + }, + { + "self_ref": "#/texts/247", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Indian Runner ducks, a common breed of domestic ducks", + "text": "Indian Runner ducks, a common breed of domestic ducks" + }, + { + "self_ref": "#/texts/248", + "parent": { + "$ref": "#/texts/246" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1\u00a0kg (2.2\u00a0lb).[48]", + "text": "Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1\u00a0kg (2.2\u00a0lb).[48]" + }, + { + "self_ref": "#/texts/249", + "parent": { + "$ref": "#/texts/242" + }, + "children": [ + { + "$ref": "#/pictures/16" + }, + { + "$ref": "#/texts/251" + } + ], + "label": "section_header", + "prov": [], + "orig": "Heraldry", + "text": "Heraldry", + "level": 3 + }, + { + "self_ref": "#/texts/250", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "caption", + "prov": [], + "orig": "Three black-colored ducks in the coat of arms of Maaninka[49]", + "text": "Three black-colored ducks in the coat of arms of Maaninka[49]" + }, + { + "self_ref": "#/texts/251", + "parent": { + "$ref": "#/texts/249" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Ducks appear on several coats of arms, including the coat of arms of Lub\u0101na (Latvia)[50] and the coat of arms of F\u00f6gl\u00f6 (\u00c5land).[51]", + "text": "Ducks appear on several coats of arms, including the coat of arms of Lub\u0101na (Latvia)[50] and the coat of arms of F\u00f6gl\u00f6 (\u00c5land).[51]" + }, + { + "self_ref": "#/texts/252", + "parent": { + "$ref": "#/texts/242" + }, + "children": [ + { + "$ref": "#/texts/253" + }, + { + "$ref": "#/texts/254" + } + ], + "label": "section_header", + "prov": [], + "orig": "Cultural references", + "text": "Cultural references", + "level": 3 + }, + { + "self_ref": "#/texts/253", + "parent": { + "$ref": "#/texts/252" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "In 2002, psychologist Richard Wiseman and colleagues at the University of Hertfordshire, UK, finished a year-long LaughLab experiment, concluding that of all animals, ducks attract the most humor and silliness; he said, \"If you're going to tell a joke involving an animal, make it a duck.\"[52] The word \"duck\" may have become an inherently funny word in many languages, possibly because ducks are seen as silly in their looks or behavior. Of the many ducks in fiction, many are cartoon characters, such as Walt Disney's Donald Duck, and Warner Bros.' Daffy Duck. Howard the Duck started as a comic book character in 1973[53][54] and was made into a movie in 1986.", + "text": "In 2002, psychologist Richard Wiseman and colleagues at the University of Hertfordshire, UK, finished a year-long LaughLab experiment, concluding that of all animals, ducks attract the most humor and silliness; he said, \"If you're going to tell a joke involving an animal, make it a duck.\"[52] The word \"duck\" may have become an inherently funny word in many languages, possibly because ducks are seen as silly in their looks or behavior. Of the many ducks in fiction, many are cartoon characters, such as Walt Disney's Donald Duck, and Warner Bros.' Daffy Duck. Howard the Duck started as a comic book character in 1973[53][54] and was made into a movie in 1986." + }, + { + "self_ref": "#/texts/254", + "parent": { + "$ref": "#/texts/252" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck as the mascot for the fictional youth hockey team who are protagonists of the movie, based on the duck being described as a fierce fighter. This led to the duck becoming the nickname and mascot for the eventual National Hockey League professional team of the Anaheim Ducks, who were founded with the name the Mighty Ducks of Anaheim.[citation needed] The duck is also the nickname of the University of Oregon sports teams as well as the Long Island Ducks minor league baseball team.[55]", + "text": "The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck as the mascot for the fictional youth hockey team who are protagonists of the movie, based on the duck being described as a fierce fighter. This led to the duck becoming the nickname and mascot for the eventual National Hockey League professional team of the Anaheim Ducks, who were founded with the name the Mighty Ducks of Anaheim.[citation needed] The duck is also the nickname of the University of Oregon sports teams as well as the Long Island Ducks minor league baseball team.[55]" + }, + { + "self_ref": "#/texts/255", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/groups/37" + }, + { + "$ref": "#/groups/38" + } + ], + "label": "section_header", + "prov": [], + "orig": "See also", + "text": "See also", + "level": 2 + }, + { + "self_ref": "#/texts/256", + "parent": { + "$ref": "#/groups/37" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Birds portal", + "text": "Birds portal", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/257", + "parent": { + "$ref": "#/groups/38" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Domestic duck", + "text": "Domestic duck", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/258", + "parent": { + "$ref": "#/groups/38" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Duck as food", + "text": "Duck as food", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/259", + "parent": { + "$ref": "#/groups/38" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Duck test", + "text": "Duck test", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/260", + "parent": { + "$ref": "#/groups/38" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Duck breeds", + "text": "Duck breeds", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/261", + "parent": { + "$ref": "#/groups/38" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Fictional ducks", + "text": "Fictional ducks", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/262", + "parent": { + "$ref": "#/groups/38" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Rubber duck", + "text": "Rubber duck", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/263", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/texts/264" + }, + { + "$ref": "#/texts/320" + } + ], + "label": "section_header", + "prov": [], + "orig": "Notes", + "text": "Notes", + "level": 2 + }, + { + "self_ref": "#/texts/264", + "parent": { + "$ref": "#/texts/263" + }, + "children": [ + { + "$ref": "#/groups/39" + } + ], + "label": "section_header", + "prov": [], + "orig": "Citations", + "text": "Citations", + "level": 3 + }, + { + "self_ref": "#/texts/265", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Duckling\". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22.", + "text": "^ \"Duckling\". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22.", + "enumerated": true, + "marker": "1." + }, + { + "self_ref": "#/texts/266", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000\u20132006. Retrieved 2015-05-22.", + "text": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000\u20132006. Retrieved 2015-05-22.", + "enumerated": true, + "marker": "2." + }, + { + "self_ref": "#/texts/267", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN\u00a0978-0300138139.", + "text": "^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN\u00a0978-0300138139.", + "enumerated": true, + "marker": "3." + }, + { + "self_ref": "#/texts/268", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN\u00a09780823961566.", + "text": "^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN\u00a09780823961566.", + "enumerated": true, + "marker": "4." + }, + { + "self_ref": "#/texts/269", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ a b c d Carboneras 1992, p.\u00a0536.", + "text": "^ a b c d Carboneras 1992, p.\u00a0536.", + "enumerated": true, + "marker": "5." + }, + { + "self_ref": "#/texts/270", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Livezey 1986, pp.\u00a0737\u2013738.", + "text": "^ Livezey 1986, pp.\u00a0737\u2013738.", + "enumerated": true, + "marker": "6." + }, + { + "self_ref": "#/texts/271", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Madsen, McHugh & de Kloet 1988, p.\u00a0452.", + "text": "^ Madsen, McHugh & de Kloet 1988, p.\u00a0452.", + "enumerated": true, + "marker": "7." + }, + { + "self_ref": "#/texts/272", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Donne-Gouss\u00e9, Laudet & H\u00e4nni 2002, pp.\u00a0353\u2013354.", + "text": "^ Donne-Gouss\u00e9, Laudet & H\u00e4nni 2002, pp.\u00a0353\u2013354.", + "enumerated": true, + "marker": "8." + }, + { + "self_ref": "#/texts/273", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ a b c d e f Carboneras 1992, p.\u00a0540.", + "text": "^ a b c d e f Carboneras 1992, p.\u00a0540.", + "enumerated": true, + "marker": "9." + }, + { + "self_ref": "#/texts/274", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Elphick, Dunning & Sibley 2001, p.\u00a0191.", + "text": "^ Elphick, Dunning & Sibley 2001, p.\u00a0191.", + "enumerated": true, + "marker": "10." + }, + { + "self_ref": "#/texts/275", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Kear 2005, p.\u00a0448.", + "text": "^ Kear 2005, p.\u00a0448.", + "enumerated": true, + "marker": "11." + }, + { + "self_ref": "#/texts/276", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Kear 2005, p.\u00a0622\u2013623.", + "text": "^ Kear 2005, p.\u00a0622\u2013623.", + "enumerated": true, + "marker": "12." + }, + { + "self_ref": "#/texts/277", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Kear 2005, p.\u00a0686.", + "text": "^ Kear 2005, p.\u00a0686.", + "enumerated": true, + "marker": "13." + }, + { + "self_ref": "#/texts/278", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Elphick, Dunning & Sibley 2001, p.\u00a0193.", + "text": "^ Elphick, Dunning & Sibley 2001, p.\u00a0193.", + "enumerated": true, + "marker": "14." + }, + { + "self_ref": "#/texts/279", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ a b c d e f g Carboneras 1992, p.\u00a0537.", + "text": "^ a b c d e f g Carboneras 1992, p.\u00a0537.", + "enumerated": true, + "marker": "15." + }, + { + "self_ref": "#/texts/280", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ American Ornithologists' Union 1998, p.\u00a0xix.", + "text": "^ American Ornithologists' Union 1998, p.\u00a0xix.", + "enumerated": true, + "marker": "16." + }, + { + "self_ref": "#/texts/281", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ American Ornithologists' Union 1998.", + "text": "^ American Ornithologists' Union 1998.", + "enumerated": true, + "marker": "17." + }, + { + "self_ref": "#/texts/282", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Carboneras 1992, p.\u00a0538.", + "text": "^ Carboneras 1992, p.\u00a0538.", + "enumerated": true, + "marker": "18." + }, + { + "self_ref": "#/texts/283", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Christidis & Boles 2008, p.\u00a062.", + "text": "^ Christidis & Boles 2008, p.\u00a062.", + "enumerated": true, + "marker": "19." + }, + { + "self_ref": "#/texts/284", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Shirihai 2008, pp.\u00a0239, 245.", + "text": "^ Shirihai 2008, pp.\u00a0239, 245.", + "enumerated": true, + "marker": "20." + }, + { + "self_ref": "#/texts/285", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ a b Pratt, Bruner & Berrett 1987, pp.\u00a098\u2013107.", + "text": "^ a b Pratt, Bruner & Berrett 1987, pp.\u00a098\u2013107.", + "enumerated": true, + "marker": "21." + }, + { + "self_ref": "#/texts/286", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Fitter, Fitter & Hosking 2000, pp.\u00a052\u20133.", + "text": "^ Fitter, Fitter & Hosking 2000, pp.\u00a052\u20133.", + "enumerated": true, + "marker": "22." + }, + { + "self_ref": "#/texts/287", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Pacific Black Duck\". www.wiresnr.org. Retrieved 2018-04-27.", + "text": "^ \"Pacific Black Duck\". www.wiresnr.org. Retrieved 2018-04-27.", + "enumerated": true, + "marker": "23." + }, + { + "self_ref": "#/texts/288", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Ogden, Evans. \"Dabbling Ducks\". CWE. Retrieved 2006-11-02.", + "text": "^ Ogden, Evans. \"Dabbling Ducks\". CWE. Retrieved 2006-11-02.", + "enumerated": true, + "marker": "24." + }, + { + "self_ref": "#/texts/289", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Karl Mathiesen (16 March 2015). \"Don't feed the ducks bread, say conservationists\". The Guardian. Retrieved 13 November 2016.", + "text": "^ Karl Mathiesen (16 March 2015). \"Don't feed the ducks bread, say conservationists\". The Guardian. Retrieved 13 November 2016.", + "enumerated": true, + "marker": "25." + }, + { + "self_ref": "#/texts/290", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp.\u00a0187\u2013221. doi:10.1007/978-1-4615-6787-5_4. ISBN\u00a0978-1-4615-6789-9.", + "text": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp.\u00a0187\u2013221. doi:10.1007/978-1-4615-6787-5_4. ISBN\u00a0978-1-4615-6789-9.", + "enumerated": true, + "marker": "26." + }, + { + "self_ref": "#/texts/291", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201\u2013205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", + "text": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201\u2013205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", + "enumerated": true, + "marker": "27." + }, + { + "self_ref": "#/texts/292", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"If You Find An Orphaned Duckling - Wildlife Rehabber\". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22.", + "text": "^ \"If You Find An Orphaned Duckling - Wildlife Rehabber\". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22.", + "enumerated": true, + "marker": "28." + }, + { + "self_ref": "#/texts/293", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN\u00a09780557901562.[self-published source]", + "text": "^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN\u00a09780557901562.[self-published source]", + "enumerated": true, + "marker": "29." + }, + { + "self_ref": "#/texts/294", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN\u00a09780762797707.", + "text": "^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN\u00a09780762797707.", + "enumerated": true, + "marker": "30." + }, + { + "self_ref": "#/texts/295", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Amos, Jonathan (2003-09-08). \"Sound science is quackers\". BBC News. Retrieved 2006-11-02.", + "text": "^ Amos, Jonathan (2003-09-08). \"Sound science is quackers\". BBC News. Retrieved 2006-11-02.", + "enumerated": true, + "marker": "31." + }, + { + "self_ref": "#/texts/296", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Mythbusters Episode 8\". 12 December 2003.", + "text": "^ \"Mythbusters Episode 8\". 12 December 2003.", + "enumerated": true, + "marker": "32." + }, + { + "self_ref": "#/texts/297", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Erlandson 1994, p.\u00a0171.", + "text": "^ Erlandson 1994, p.\u00a0171.", + "enumerated": true, + "marker": "33." + }, + { + "self_ref": "#/texts/298", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Jeffries 2008, pp.\u00a0168, 243.", + "text": "^ Jeffries 2008, pp.\u00a0168, 243.", + "enumerated": true, + "marker": "34." + }, + { + "self_ref": "#/texts/299", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ a b Sued-Badillo 2003, p.\u00a065.", + "text": "^ a b Sued-Badillo 2003, p.\u00a065.", + "enumerated": true, + "marker": "35." + }, + { + "self_ref": "#/texts/300", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Thorpe 1996, p.\u00a068.", + "text": "^ Thorpe 1996, p.\u00a068.", + "enumerated": true, + "marker": "36." + }, + { + "self_ref": "#/texts/301", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Maisels 1999, p.\u00a042.", + "text": "^ Maisels 1999, p.\u00a042.", + "enumerated": true, + "marker": "37." + }, + { + "self_ref": "#/texts/302", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Rau 1876, p.\u00a0133.", + "text": "^ Rau 1876, p.\u00a0133.", + "enumerated": true, + "marker": "38." + }, + { + "self_ref": "#/texts/303", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Higman 2012, p.\u00a023.", + "text": "^ Higman 2012, p.\u00a023.", + "enumerated": true, + "marker": "39." + }, + { + "self_ref": "#/texts/304", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Hume 2012, p.\u00a053.", + "text": "^ Hume 2012, p.\u00a053.", + "enumerated": true, + "marker": "40." + }, + { + "self_ref": "#/texts/305", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Hume 2012, p.\u00a052.", + "text": "^ Hume 2012, p.\u00a052.", + "enumerated": true, + "marker": "41." + }, + { + "self_ref": "#/texts/306", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Fieldhouse 2002, p.\u00a0167.", + "text": "^ Fieldhouse 2002, p.\u00a0167.", + "enumerated": true, + "marker": "42." + }, + { + "self_ref": "#/texts/307", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN\u00a09781853263774.", + "text": "^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN\u00a09781853263774.", + "enumerated": true, + "marker": "43." + }, + { + "self_ref": "#/texts/308", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl\" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p.\u00a03. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019.", + "text": "^ \"Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl\" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p.\u00a03. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019.", + "enumerated": true, + "marker": "44." + }, + { + "self_ref": "#/texts/309", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"FAOSTAT\". www.fao.org. Retrieved 2019-10-25.", + "text": "^ \"FAOSTAT\". www.fao.org. Retrieved 2019-10-25.", + "enumerated": true, + "marker": "45." + }, + { + "self_ref": "#/texts/310", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin\". Digimorph.org. Retrieved 2012-12-23.", + "text": "^ \"Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin\". Digimorph.org. Retrieved 2012-12-23.", + "enumerated": true, + "marker": "46." + }, + { + "self_ref": "#/texts/311", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Sy Montgomery. \"Mallard; Encyclop\u00e6dia Britannica\". Britannica.com. Retrieved 2012-12-23.", + "text": "^ Sy Montgomery. \"Mallard; Encyclop\u00e6dia Britannica\". Britannica.com. Retrieved 2012-12-23.", + "enumerated": true, + "marker": "47." + }, + { + "self_ref": "#/texts/312", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp.\u00a0135. ISBN\u00a0978-1-908843-15-9.", + "text": "^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp.\u00a0135. ISBN\u00a0978-1-908843-15-9.", + "enumerated": true, + "marker": "48." + }, + { + "self_ref": "#/texts/313", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p.\u00a0147. ISBN\u00a0951-773-085-3.", + "text": "^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p.\u00a0147. ISBN\u00a0951-773-085-3.", + "enumerated": true, + "marker": "49." + }, + { + "self_ref": "#/texts/314", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Lub\u0101nas simbolika\" (in Latvian). Retrieved September 9, 2021.", + "text": "^ \"Lub\u0101nas simbolika\" (in Latvian). Retrieved September 9, 2021.", + "enumerated": true, + "marker": "50." + }, + { + "self_ref": "#/texts/315", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"F\u00f6gl\u00f6\" (in Swedish). Retrieved September 9, 2021.", + "text": "^ \"F\u00f6gl\u00f6\" (in Swedish). Retrieved September 9, 2021.", + "enumerated": true, + "marker": "51." + }, + { + "self_ref": "#/texts/316", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Young, Emma. \"World's funniest joke revealed\". New Scientist. Retrieved 7 January 2019.", + "text": "^ Young, Emma. \"World's funniest joke revealed\". New Scientist. Retrieved 7 January 2019.", + "enumerated": true, + "marker": "52." + }, + { + "self_ref": "#/texts/317", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"Howard the Duck (character)\". Grand Comics Database.", + "text": "^ \"Howard the Duck (character)\". Grand Comics Database.", + "enumerated": true, + "marker": "53." + }, + { + "self_ref": "#/texts/318", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ Sanderson, Peter; Gilbert, Laura (2008). \"1970s\". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p.\u00a0161. ISBN\u00a0978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck.", + "text": "^ Sanderson, Peter; Gilbert, Laura (2008). \"1970s\". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p.\u00a0161. ISBN\u00a0978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck.", + "enumerated": true, + "marker": "54." + }, + { + "self_ref": "#/texts/319", + "parent": { + "$ref": "#/groups/39" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "^ \"The Duck\". University of Oregon Athletics. Retrieved 2022-01-20.", + "text": "^ \"The Duck\". University of Oregon Athletics. Retrieved 2022-01-20.", + "enumerated": true, + "marker": "55." + }, + { + "self_ref": "#/texts/320", + "parent": { + "$ref": "#/texts/263" + }, + "children": [ + { + "$ref": "#/groups/40" + } + ], + "label": "section_header", + "prov": [], + "orig": "Sources", + "text": "Sources", + "level": 3 + }, + { + "self_ref": "#/texts/321", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "American Ornithologists' Union (1998). Checklist of North American Birds (PDF). Washington, DC: American Ornithologists' Union. ISBN\u00a0978-1-891276-00-2. Archived (PDF) from the original on 2022-10-09.", + "text": "American Ornithologists' Union (1998). Checklist of North American Birds (PDF). Washington, DC: American Ornithologists' Union. ISBN\u00a0978-1-891276-00-2. Archived (PDF) from the original on 2022-10-09.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/322", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Carboneras, Carlos (1992). del Hoyo, Josep; Elliott, Andrew; Sargatal, Jordi (eds.). Handbook of the Birds of the World. Vol.\u00a01: Ostrich to Ducks. Barcelona: Lynx Edicions. ISBN\u00a0978-84-87334-10-8.", + "text": "Carboneras, Carlos (1992). del Hoyo, Josep; Elliott, Andrew; Sargatal, Jordi (eds.). Handbook of the Birds of the World. Vol.\u00a01: Ostrich to Ducks. Barcelona: Lynx Edicions. ISBN\u00a0978-84-87334-10-8.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/323", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Christidis, Les; Boles, Walter E., eds. (2008). Systematics and Taxonomy of Australian Birds. Collingwood, VIC: Csiro Publishing. ISBN\u00a0978-0-643-06511-6.", + "text": "Christidis, Les; Boles, Walter E., eds. (2008). Systematics and Taxonomy of Australian Birds. Collingwood, VIC: Csiro Publishing. ISBN\u00a0978-0-643-06511-6.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/324", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Donne-Gouss\u00e9, Carole; Laudet, Vincent; H\u00e4nni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339\u2013356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID\u00a012099792.", + "text": "Donne-Gouss\u00e9, Carole; Laudet, Vincent; H\u00e4nni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339\u2013356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID\u00a012099792.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/325", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Elphick, Chris; Dunning, John B. Jr.; Sibley, David, eds. (2001). The Sibley Guide to Bird Life and Behaviour. London: Christopher Helm. ISBN\u00a0978-0-7136-6250-4.", + "text": "Elphick, Chris; Dunning, John B. Jr.; Sibley, David, eds. (2001). The Sibley Guide to Bird Life and Behaviour. London: Christopher Helm. ISBN\u00a0978-0-7136-6250-4.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/326", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Erlandson, Jon M. (1994). Early Hunter-Gatherers of the California Coast. New York, NY: Springer Science & Business Media. ISBN\u00a0978-1-4419-3231-0.", + "text": "Erlandson, Jon M. (1994). Early Hunter-Gatherers of the California Coast. New York, NY: Springer Science & Business Media. ISBN\u00a0978-1-4419-3231-0.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/327", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol.\u00a0I: A\u2013K. Santa Barbara: ABC-CLIO. ISBN\u00a0978-1-61069-412-4.", + "text": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol.\u00a0I: A\u2013K. Santa Barbara: ABC-CLIO. ISBN\u00a0978-1-61069-412-4.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/328", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Fitter, Julian; Fitter, Daniel; Hosking, David (2000). Wildlife of the Gal\u00e1pagos. Princeton, NJ: Princeton University Press. ISBN\u00a0978-0-691-10295-5.", + "text": "Fitter, Julian; Fitter, Daniel; Hosking, David (2000). Wildlife of the Gal\u00e1pagos. Princeton, NJ: Princeton University Press. ISBN\u00a0978-0-691-10295-5.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/329", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Higman, B. W. (2012). How Food Made History. Chichester, UK: John Wiley & Sons. ISBN\u00a0978-1-4051-8947-7.", + "text": "Higman, B. W. (2012). How Food Made History. Chichester, UK: John Wiley & Sons. ISBN\u00a0978-1-4051-8947-7.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/330", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Hume, Julian H. (2012). Extinct Birds. London: Christopher Helm. ISBN\u00a0978-1-4729-3744-5.", + "text": "Hume, Julian H. (2012). Extinct Birds. London: Christopher Helm. ISBN\u00a0978-1-4729-3744-5.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/331", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Jeffries, Richard (2008). Holocene Hunter-Gatherers of the Lower Ohio River Valley. Tuscaloosa: University of Alabama Press. ISBN\u00a0978-0-8173-1658-7.", + "text": "Jeffries, Richard (2008). Holocene Hunter-Gatherers of the Lower Ohio River Valley. Tuscaloosa: University of Alabama Press. ISBN\u00a0978-0-8173-1658-7.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/332", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Kear, Janet, ed. (2005). Ducks, Geese and Swans: Species Accounts (Cairina to Mergus). Bird Families of the World. Oxford: Oxford University Press. ISBN\u00a0978-0-19-861009-0.", + "text": "Kear, Janet, ed. (2005). Ducks, Geese and Swans: Species Accounts (Cairina to Mergus). Bird Families of the World. Oxford: Oxford University Press. ISBN\u00a0978-0-19-861009-0.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/333", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737\u2013754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", + "text": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737\u2013754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/334", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). \"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452\u2013459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", + "text": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). \"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452\u2013459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/335", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Maisels, Charles Keith (1999). Early Civilizations of the Old World. London: Routledge. ISBN\u00a0978-0-415-10975-8.", + "text": "Maisels, Charles Keith (1999). Early Civilizations of the Old World. London: Routledge. ISBN\u00a0978-0-415-10975-8.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/336", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Pratt, H. Douglas; Bruner, Phillip L.; Berrett, Delwyn G. (1987). A Field Guide to the Birds of Hawaii and the Tropical Pacific. Princeton, NJ: Princeton University Press. ISBN\u00a00-691-02399-9.", + "text": "Pratt, H. Douglas; Bruner, Phillip L.; Berrett, Delwyn G. (1987). A Field Guide to the Birds of Hawaii and the Tropical Pacific. Princeton, NJ: Princeton University Press. ISBN\u00a00-691-02399-9.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/337", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Rau, Charles (1876). Early Man in Europe. New York: Harper & Brothers. LCCN\u00a005040168.", + "text": "Rau, Charles (1876). Early Man in Europe. New York: Harper & Brothers. LCCN\u00a005040168.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/338", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Shirihai, Hadoram (2008). A Complete Guide to Antarctic Wildlife. Princeton, NJ, US: Princeton University Press. ISBN\u00a0978-0-691-13666-0.", + "text": "Shirihai, Hadoram (2008). A Complete Guide to Antarctic Wildlife. Princeton, NJ, US: Princeton University Press. ISBN\u00a0978-0-691-13666-0.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/339", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Sued-Badillo, Jalil (2003). Autochthonous Societies. General History of the Caribbean. Paris: UNESCO. ISBN\u00a0978-92-3-103832-7.", + "text": "Sued-Badillo, Jalil (2003). Autochthonous Societies. General History of the Caribbean. Paris: UNESCO. ISBN\u00a0978-92-3-103832-7.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/340", + "parent": { + "$ref": "#/groups/40" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Thorpe, I. J. (1996). The Origins of Agriculture in Europe. New York: Routledge. ISBN\u00a0978-0-415-08009-5.", + "text": "Thorpe, I. J. (1996). The Origins of Agriculture in Europe. New York: Routledge. ISBN\u00a0978-0-415-08009-5.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/341", + "parent": { + "$ref": "#/texts/39" + }, + "children": [ + { + "$ref": "#/groups/41" + }, + { + "$ref": "#/groups/42" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/pictures/17" + }, + { + "$ref": "#/groups/43" + }, + { + "$ref": "#/groups/44" + }, + { + "$ref": "#/groups/45" + }, + { + "$ref": "#/groups/46" + }, + { + "$ref": "#/groups/47" + }, + { + "$ref": "#/groups/48" + } + ], + "label": "section_header", + "prov": [], + "orig": "External links", + "text": "External links", + "level": 2 + }, + { + "self_ref": "#/texts/342", + "parent": { + "$ref": "#/groups/41" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Definitions from Wiktionary", + "text": "Definitions from Wiktionary", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/343", + "parent": { + "$ref": "#/groups/41" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Media from Commons", + "text": "Media from Commons", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/344", + "parent": { + "$ref": "#/groups/41" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Quotations from Wikiquote", + "text": "Quotations from Wikiquote", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/345", + "parent": { + "$ref": "#/groups/41" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Recipes from Wikibooks", + "text": "Recipes from Wikibooks", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/346", + "parent": { + "$ref": "#/groups/41" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Taxa from Wikispecies", + "text": "Taxa from Wikispecies", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/347", + "parent": { + "$ref": "#/groups/41" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Data from Wikidata", + "text": "Data from Wikidata", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/348", + "parent": { + "$ref": "#/groups/42" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "list of books (useful looking abstracts)", + "text": "list of books (useful looking abstracts)", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/349", + "parent": { + "$ref": "#/groups/42" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine", + "text": "Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/350", + "parent": { + "$ref": "#/groups/42" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/351", + "parent": { + "$ref": "#/groups/42" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl", + "text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/352", + "parent": { + "$ref": "#/groups/43" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Ducks", + "text": "Ducks", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/353", + "parent": { + "$ref": "#/groups/43" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Game birds", + "text": "Game birds", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/354", + "parent": { + "$ref": "#/groups/43" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Bird common names", + "text": "Bird common names", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/355", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "All accuracy disputes", + "text": "All accuracy disputes", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/356", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Accuracy disputes from February 2020", + "text": "Accuracy disputes from February 2020", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/357", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "CS1 Finnish-language sources (fi)", + "text": "CS1 Finnish-language sources (fi)", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/358", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "CS1 Latvian-language sources (lv)", + "text": "CS1 Latvian-language sources (lv)", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/359", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "CS1 Swedish-language sources (sv)", + "text": "CS1 Swedish-language sources (sv)", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/360", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles with short description", + "text": "Articles with short description", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/361", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Short description is different from Wikidata", + "text": "Short description is different from Wikidata", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/362", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Wikipedia indefinitely move-protected pages", + "text": "Wikipedia indefinitely move-protected pages", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/363", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Wikipedia indefinitely semi-protected pages", + "text": "Wikipedia indefinitely semi-protected pages", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/364", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles with 'species' microformats", + "text": "Articles with 'species' microformats", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/365", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing Old English (ca. 450-1100)-language text", + "text": "Articles containing Old English (ca. 450-1100)-language text", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/366", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing Dutch-language text", + "text": "Articles containing Dutch-language text", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/367", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing German-language text", + "text": "Articles containing German-language text", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/368", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing Norwegian-language text", + "text": "Articles containing Norwegian-language text", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/369", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing Lithuanian-language text", + "text": "Articles containing Lithuanian-language text", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/370", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing Ancient Greek (to 1453)-language text", + "text": "Articles containing Ancient Greek (to 1453)-language text", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/371", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "All articles with self-published sources", + "text": "All articles with self-published sources", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/372", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles with self-published sources from February 2020", + "text": "Articles with self-published sources from February 2020", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/373", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "All articles with unsourced statements", + "text": "All articles with unsourced statements", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/374", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles with unsourced statements from January 2022", + "text": "Articles with unsourced statements from January 2022", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/375", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "CS1: long volume value", + "text": "CS1: long volume value", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/376", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Pages using Sister project links with wikidata mismatch", + "text": "Pages using Sister project links with wikidata mismatch", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/377", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Pages using Sister project links with hidden wikidata", + "text": "Pages using Sister project links with hidden wikidata", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/378", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Webarchive template wayback links", + "text": "Webarchive template wayback links", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/379", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles with Project Gutenberg links", + "text": "Articles with Project Gutenberg links", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/380", + "parent": { + "$ref": "#/groups/44" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Articles containing video clips", + "text": "Articles containing video clips", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/381", + "parent": { + "$ref": "#/groups/45" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "This page was last edited on 21 September 2024, at 12:11\u00a0(UTC).", + "text": "This page was last edited on 21 September 2024, at 12:11\u00a0(UTC).", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/382", + "parent": { + "$ref": "#/groups/45" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Text is available under the Creative Commons Attribution-ShareAlike License 4.0;\nadditional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia\u00ae is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.", + "text": "Text is available under the Creative Commons Attribution-ShareAlike License 4.0;\nadditional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia\u00ae is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/383", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Privacy policy", + "text": "Privacy policy", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/384", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "About Wikipedia", + "text": "About Wikipedia", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/385", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Disclaimers", + "text": "Disclaimers", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/386", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Contact Wikipedia", + "text": "Contact Wikipedia", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/387", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Code of Conduct", + "text": "Code of Conduct", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/388", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Developers", + "text": "Developers", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/389", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Statistics", + "text": "Statistics", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/390", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Cookie statement", + "text": "Cookie statement", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/391", + "parent": { + "$ref": "#/groups/46" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Mobile view", + "text": "Mobile view", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/392", + "parent": { + "$ref": "#/groups/47" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/393", + "parent": { + "$ref": "#/groups/47" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "", + "text": "", + "enumerated": false, + "marker": "-" + } + ], + "pictures": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/3", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/4", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/202" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/5", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/206" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/6", + "parent": { + "$ref": "#/texts/200" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/207" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/7", + "parent": { + "$ref": "#/texts/208" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/210" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/8", + "parent": { + "$ref": "#/texts/213" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/214" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/9", + "parent": { + "$ref": "#/texts/217" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/218" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/10", + "parent": { + "$ref": "#/texts/217" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/220" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/11", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/224" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/12", + "parent": { + "$ref": "#/texts/223" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/225" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/13", + "parent": { + "$ref": "#/texts/232" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/233" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/14", + "parent": { + "$ref": "#/texts/238" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/239" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/15", + "parent": { + "$ref": "#/texts/246" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/247" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/16", + "parent": { + "$ref": "#/texts/249" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [ + { + "$ref": "#/texts/250" + } + ], + "references": [], + "footnotes": [], + "annotations": [] + }, + { + "self_ref": "#/pictures/17", + "parent": { + "$ref": "#/texts/341" + }, + "children": [], + "label": "picture", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "annotations": [] + } + ], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Duck\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Bufflehead\n(Bucephala albeola)\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Scientific classification \n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Domain:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Eukaryota\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Kingdom:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Animalia\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Phylum:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Chordata\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Class:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Aves\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Order:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Anseriformes\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Superfamily:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Anatoidea\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Family:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Anatidae\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Subfamilies\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "\nSee text\n\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 13, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Duck\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Duck\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Bufflehead\n(Bucephala albeola)\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Bufflehead\n(Bucephala albeola)\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Scientific classification \n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Scientific classification \n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Domain:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Eukaryota\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Kingdom:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Animalia\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Phylum:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Chordata\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Class:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Aves\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Order:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Anseriformes\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Superfamily:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Anatoidea\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Family:\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Anatidae\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Subfamilies\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Subfamilies\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "\nSee text\n\n", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "\nSee text\n\n", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/341" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Authority control databases ", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "National", + "column_header": false, + "row_header": true, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "United StatesFranceBnF dataJapanLatviaIsrael", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Other", + "column_header": false, + "row_header": true, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "IdRef", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Authority control databases ", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 2, + "text": "Authority control databases ", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "National", + "column_header": false, + "row_header": true, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "United StatesFranceBnF dataJapanLatviaIsrael", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Other", + "column_header": false, + "row_header": true, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "IdRef", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md new file mode 100644 index 00000000..856e97a7 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md @@ -0,0 +1,532 @@ +- Main page +- Contents +- Current events +- Random article +- About Wikipedia +- Contact us + +- Help +- Learn to edit +- Community portal +- Recent changes +- Upload file + + + + + + + + - Donate + - Create account + - Log in + - Create account + - Log in + - Contributions + - Talk + +## Contents + + - (Top) + - 1 Etymology + - 2 Taxonomy + - 3 Morphology + - 4 Distribution and habitat + - 5 Behaviour Toggle Behaviour subsection + - 5.1 Feeding + - 5.2 Breeding + - 5.3 Communication + - 5.4 Predators + - 6 Relationship with humans Toggle Relationship with humans subsection + - 6.1 Hunting + - 6.2 Domestication + - 6.3 Heraldry + - 6.4 Cultural references +- 7 See also +- 8 Notes Toggle Notes subsection + - 8.1 Citations + - 8.2 Sources +- 9 External links + +# Duck + +- Acèh +- Afrikaans +- Alemannisch +- አማርኛ +- Ænglisc +- العربية +- Aragonés +- ܐܪܡܝܐ +- Armãneashti +- Asturianu +- Atikamekw +- Авар +- Aymar aru +- تۆرکجه +- Basa Bali +- বাংলা +- 閩南語 / Bân-lâm-gú +- Беларуская +- Беларуская (тарашкевіца) +- Bikol Central +- Български +- Brezhoneg +- Буряад +- Català +- Чӑвашла +- Čeština +- ChiShona +- Cymraeg +- Dagbanli +- Dansk +- Deitsch +- Deutsch +- डोटेली +- Ελληνικά +- Emiliàn e rumagnòl +- Español +- Esperanto +- Euskara +- فارسی +- Français +- Gaeilge +- Galego +- ГӀалгӀай +- 贛語 +- گیلکی +- 𐌲𐌿𐍄𐌹𐍃𐌺 +- गोंयची कोंकणी / Gõychi Konknni +- 客家語 / Hak-kâ-ngî +- 한국어 +- Hausa +- Հայերեն +- हिन्दी +- Hrvatski +- Ido +- Bahasa Indonesia +- Iñupiatun +- Íslenska +- Italiano +- עברית +- Jawa +- ಕನ್ನಡ +- Kapampangan +- ქართული +- कॉशुर / کٲشُر +- Қазақша +- Ikirundi +- Kongo +- Kreyòl ayisyen +- Кырык мары +- ລາວ +- Latina +- Latviešu +- Lietuvių +- Li Niha +- Ligure +- Limburgs +- Lingála +- Malagasy +- മലയാളം +- मराठी +- مازِرونی +- Bahasa Melayu +- ꯃꯤꯇꯩ ꯂꯣꯟ +- 閩東語 / Mìng-dĕ̤ng-ngṳ̄ +- Мокшень +- Монгол +- မြန်မာဘာသာ +- Nederlands +- Nedersaksies +- नेपाली +- नेपाल भाषा +- 日本語 +- Нохчийн +- Norsk nynorsk +- Occitan +- Oromoo +- ਪੰਜਾਬੀ +- Picard +- Plattdüütsch +- Polski +- Português +- Qırımtatarca +- Română +- Русский +- Саха тыла +- ᱥᱟᱱᱛᱟᱲᱤ +- Sardu +- Scots +- Seeltersk +- Shqip +- Sicilianu +- සිංහල +- Simple English +- سنڌي +- کوردی +- Српски / srpski +- Srpskohrvatski / српскохрватски +- Sunda +- Svenska +- Tagalog +- தமிழ் +- Taqbaylit +- Татарча / tatarça +- ไทย +- Türkçe +- Українська +- ئۇيغۇرچە / Uyghurche +- Vahcuengh +- Tiếng Việt +- Walon +- 文言 +- Winaray +- 吴语 +- 粵語 +- Žemaitėška +- 中文 + +- Article +- Talk + + - Read + - View source + - View history + - Read + - View source + - View history + - What links here + - Related changes + - Upload file + - Special pages + - Permanent link + - Page information + - Cite this page + - Get shortened URL + - Download QR code + - Wikidata item + - Download as PDF + - Printable version + - Wikimedia Commons + - Wikiquote + + + +| Duck | Duck | +|--------------------------------|--------------------------------| +| | | +| Bufflehead (Bucephala albeola) | Bufflehead (Bucephala albeola) | +| Scientific classification | Scientific classification | +| Domain: | Eukaryota | +| Kingdom: | Animalia | +| Phylum: | Chordata | +| Class: | Aves | +| Order: | Anseriformes | +| Superfamily: | Anatoidea | +| Family: | Anatidae | +| Subfamilies | Subfamilies | +| See text | See text | + +Duck is the common name for numerous species of waterfowl in the family Anatidae. Ducks are generally smaller and shorter-necked than swans and geese, which are members of the same family. Divided among several subfamilies, they are a form taxon; they do not represent a monophyletic group (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly aquatic birds, and may be found in both fresh water and sea water. + +Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as loons or divers, grebes, gallinules and coots. + +## Etymology + +The word duck comes from Old English dūce 'diver', a derivative of the verb *dūcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the dabbling duck group feed by upending; compare with Dutch duiken and German tauchen 'to dive'. + +Pacific black duck displaying the characteristic upending "duck" + + + +This word replaced Old English ened /ænid 'duck', possibly to avoid confusion with other words, such as ende 'end' with similar forms. Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and Norwegian and. The word ened /ænid was inherited from Proto-Indo-European; cf. Latin anas "duck", Lithuanian ántis 'duck', Ancient Greek νῆσσα /νῆττα (nēssa /nētta) 'duck', and Sanskrit ātí 'water bird', among others. + +A duckling is a young duck in downy plumage[1] or baby duck,[2] but in the food trade a young domestic duck which has just reached adult size and bulk and its meat is still fully tender, is sometimes labelled as a duckling. + +A male is called a drake and the female is called a duck, or in ornithology a hen.[3][4] + +Male mallard. + + + +Wood ducks. + + + +## Taxonomy + +All ducks belong to the biological order Anseriformes, a group that contains the ducks, geese and swans, as well as the screamers, and the magpie goose.[5] All except the screamers belong to the biological family Anatidae.[5] Within the family, ducks are split into a variety of subfamilies and 'tribes'. The number and composition of these subfamilies and tribes is the cause of considerable disagreement among taxonomists.[5] Some base their decisions on morphological characteristics, others on shared behaviours or genetic studies.[6][7] The number of suggested subfamilies containing ducks ranges from two to five.[8][9] The significant level of hybridisation that occurs among wild ducks complicates efforts to tease apart the relationships between various species.[9] + +Mallard landing in approach + + + +In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14] + +A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other subfamilies or tribes. The whistling ducks are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae,[15] or to their own subfamily (Dendrocygninae) or family (Dendrocyganidae).[9][16] The freckled duck of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae,[15] or in its own family, the Stictonettinae.[9] The shelducks make up the tribe Tadornini in the family Anserinae in some classifications,[15] and their own subfamily, Tadorninae, in others,[17] while the steamer ducks are either placed in the family Anserinae in the tribe Tachyerini[15] or lumped with the shelducks in the tribe Tadorini.[9] The perching ducks make up in the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini.[9] The torrent duck is generally included in the subfamily Anserinae in the monotypic tribe Merganettini,[15] but is sometimes included in the tribe Tadornini.[18] The pink-eared duck is sometimes included as a true duck either in the tribe Anatini[15] or the tribe Malacorhynchini,[19] and other times is included with the shelducks in the tribe Tadornini.[15] + +## Morphology + +Male Mandarin duck + + + +The overall body plan of ducks is elongated and broad, and they are also relatively long-necked, albeit not as long-necked as the geese and swans. The body shape of diving ducks varies somewhat from this in being more rounded. The bill is usually broad and contains serrated pectens, which are particularly well defined in the filter-feeding species. In the case of some fishing species the bill is long and strongly serrated. The scaled legs are strong and well developed, and generally set far back on the body, more so in the highly aquatic species. The wings are very strong and are generally short and pointed, and the flight of ducks requires fast continuous strokes, requiring in turn strong wing muscles. Three species of steamer duck are almost flightless, however. Many species of duck are temporarily flightless while moulting; they seek out protected habitat with good food supplies during this period. This moult typically precedes migration. + +The drakes of northern species often have extravagant plumage, but that is moulted in summer to give a more female-like appearance, the "eclipse" plumage. Southern resident species typically show less sexual dimorphism, although there are exceptions such as the paradise shelduck of New Zealand, which is both strikingly sexually dimorphic and in which the female's plumage is brighter than that of the male. The plumage of juvenile birds generally resembles that of the female. Female ducks have evolved to have a corkscrew shaped vagina to prevent rape. + +## Distribution and habitat + +Flying steamer ducks in Ushuaia, Argentina + + + +Ducks have a cosmopolitan distribution, and are found on every continent except Antarctica.[5] Several species manage to live on subantarctic islands, including South Georgia and the Auckland Islands.[20] Ducks have reached a number of isolated oceanic islands, including the Hawaiian Islands, Micronesia and the Galápagos Islands, where they are often vagrants and less often residents.[21][22] A handful are endemic to such far-flung islands.[21] + +Female mallard in Cornwall, England + + + +Some duck species, mainly those breeding in the temperate and Arctic Northern Hemisphere, are migratory; those in the tropics are generally not. Some ducks, particularly in Australia where rainfall is erratic, are nomadic, seeking out the temporary lakes and pools that form after localised heavy rain.[23] + +## Behaviour + +### Feeding + +Pecten along the bill + + + +Mallard duckling preening + + + +Ducks eat food sources such as grasses, aquatic plants, fish, insects, small amphibians, worms, and small molluscs. + +Dabbling ducks feed on the surface of water or on land, or as deep as they can reach by up-ending without completely submerging.[24] Along the edge of the bill, there is a comb-like structure called a pecten. This strains the water squirting from the side of the bill and traps any food. The pecten is also used to preen feathers and to hold slippery food items. + +Diving ducks and sea ducks forage deep underwater. To be able to submerge more easily, the diving ducks are heavier than dabbling ducks, and therefore have more difficulty taking off to fly. + +A few specialized species such as the mergansers are adapted to catch and swallow large fish. + +The others have the characteristic wide flat bill adapted to dredging-type jobs such as pulling up waterweed, pulling worms and small molluscs out of mud, searching for insect larvae, and bulk jobs such as dredging out, holding, turning head first, and swallowing a squirming frog. To avoid injury when digging into sediment it has no cere, but the nostrils come out through hard horn. + +The Guardian published an article advising that ducks should not be fed with bread because it damages the health of the ducks and pollutes waterways.[25] + +### Breeding + +A Muscovy duckling + + + +Ducks generally only have one partner at a time, although the partnership usually only lasts one year.[26] Larger species and the more sedentary species (like fast-river specialists) tend to have pair-bonds that last numerous years.[27] Most duck species breed once a year, choosing to do so in favourable conditions (spring/summer or wet seasons). Ducks also tend to make a nest before breeding, and, after hatching, lead their ducklings to water. Mother ducks are very caring and protective of their young, but may abandon some of their ducklings if they are physically stuck in an area they cannot get out of (such as nesting in an enclosed courtyard) or are not prospering due to genetic defects or sickness brought about by hypothermia, starvation, or disease. Ducklings can also be orphaned by inconsistent late hatching where a few eggs hatch after the mother has abandoned the nest and led her ducklings to water.[28] + +### Communication + +Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic "quack" sound while males make a similar but raspier sound that is sometimes written as "breeeeze",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not "quack".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup – which are diving ducks – make a noise like "scaup" (hence their name). Calls may be loud displaying calls or quieter contact calls. + +A common urban legend claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the University of Salford in 2003 as part of the British Association's Festival of Science.[31] It was also debunked in one of the earlier episodes of the popular Discovery Channel television show MythBusters.[32] + +### Predators + +Ringed teal + + + +Ducks have many predators. Ducklings are particularly vulnerable, since their inability to fly makes them easy prey not only for predatory birds but also for large fish like pike, crocodilians, predatory testudines such as the alligator snapping turtle, and other aquatic hunters, including fish-eating birds such as herons. Ducks' nests are raided by land-based predators, and brooding females may be caught unaware on the nest by mammals, such as foxes, or large birds, such as hawks or owls. + +Adult ducks are fast fliers, but may be caught on the water by large aquatic predators including big fish such as the North American muskie and the European pike. In flight, ducks are safe from all but a few predators such as humans and the peregrine falcon, which uses its speed and strength to catch ducks. + +## Relationship with humans + +### Hunting + +Humans have hunted ducks since prehistoric times. Excavations of middens in California dating to 7800 – 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in "significant numbers" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gathers as well, though hard evidence of this is uncommon.[35][42] + +In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport,[43] by shooting, or by being trapped using duck decoys. Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, "a sitting duck" has come to mean "an easy target". These ducks may be contaminated by pollutants such as PCBs.[44] + +### Domestication + +Indian Runner ducks, a common breed of domestic ducks + + + +Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48] + +### Heraldry + +Three black-colored ducks in the coat of arms of Maaninka[49] + + + +Ducks appear on several coats of arms, including the coat of arms of Lubāna (Latvia)[50] and the coat of arms of Föglö (Åland).[51] + +### Cultural references + +In 2002, psychologist Richard Wiseman and colleagues at the University of Hertfordshire, UK, finished a year-long LaughLab experiment, concluding that of all animals, ducks attract the most humor and silliness; he said, "If you're going to tell a joke involving an animal, make it a duck."[52] The word "duck" may have become an inherently funny word in many languages, possibly because ducks are seen as silly in their looks or behavior. Of the many ducks in fiction, many are cartoon characters, such as Walt Disney's Donald Duck, and Warner Bros.' Daffy Duck. Howard the Duck started as a comic book character in 1973[53][54] and was made into a movie in 1986. + +The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck as the mascot for the fictional youth hockey team who are protagonists of the movie, based on the duck being described as a fierce fighter. This led to the duck becoming the nickname and mascot for the eventual National Hockey League professional team of the Anaheim Ducks, who were founded with the name the Mighty Ducks of Anaheim.[citation needed] The duck is also the nickname of the University of Oregon sports teams as well as the Long Island Ducks minor league baseball team.[55] + +## See also + +- Birds portal + +- Domestic duck +- Duck as food +- Duck test +- Duck breeds +- Fictional ducks +- Rubber duck + +## Notes + +### Citations + +1. ^ "Duckling". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22. +2. ^ "Duckling". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22. +3. ^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139. +4. ^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566. +5. ^ a b c d Carboneras 1992, p. 536. +6. ^ Livezey 1986, pp. 737–738. +7. ^ Madsen, McHugh & de Kloet 1988, p. 452. +8. ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. +9. ^ a b c d e f Carboneras 1992, p. 540. +10. ^ Elphick, Dunning & Sibley 2001, p. 191. +11. ^ Kear 2005, p. 448. +12. ^ Kear 2005, p. 622–623. +13. ^ Kear 2005, p. 686. +14. ^ Elphick, Dunning & Sibley 2001, p. 193. +15. ^ a b c d e f g Carboneras 1992, p. 537. +16. ^ American Ornithologists' Union 1998, p. xix. +17. ^ American Ornithologists' Union 1998. +18. ^ Carboneras 1992, p. 538. +19. ^ Christidis & Boles 2008, p. 62. +20. ^ Shirihai 2008, pp. 239, 245. +21. ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. +22. ^ Fitter, Fitter & Hosking 2000, pp. 52–3. +23. ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. +24. ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. +25. ^ Karl Mathiesen (16 March 2015). "Don't feed the ducks bread, say conservationists". The Guardian. Retrieved 13 November 2016. +26. ^ Rohwer, Frank C.; Anderson, Michael G. (1988). "Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5\_4. ISBN 978-1-4615-6789-9. +27. ^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). "Long-Term Pair Bonds in Harlequin Ducks". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797. +28. ^ "If You Find An Orphaned Duckling - Wildlife Rehabber". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22. +29. ^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source] +30. ^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707. +31. ^ Amos, Jonathan (2003-09-08). "Sound science is quackers". BBC News. Retrieved 2006-11-02. +32. ^ "Mythbusters Episode 8". 12 December 2003. +33. ^ Erlandson 1994, p. 171. +34. ^ Jeffries 2008, pp. 168, 243. +35. ^ a b Sued-Badillo 2003, p. 65. +36. ^ Thorpe 1996, p. 68. +37. ^ Maisels 1999, p. 42. +38. ^ Rau 1876, p. 133. +39. ^ Higman 2012, p. 23. +40. ^ Hume 2012, p. 53. +41. ^ Hume 2012, p. 52. +42. ^ Fieldhouse 2002, p. 167. +43. ^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774. +44. ^ "Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019. +45. ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25. +46. ^ "Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin". Digimorph.org. Retrieved 2012-12-23. +47. ^ Sy Montgomery. "Mallard; Encyclopædia Britannica". Britannica.com. Retrieved 2012-12-23. +48. ^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9. +49. ^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3. +50. ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021. +51. ^ "Föglö" (in Swedish). Retrieved September 9, 2021. +52. ^ Young, Emma. "World's funniest joke revealed". New Scientist. Retrieved 7 January 2019. +53. ^ "Howard the Duck (character)". Grand Comics Database. +54. ^ Sanderson, Peter; Gilbert, Laura (2008). "1970s". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck. +55. ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20. + +### Sources + +- American Ornithologists' Union (1998). Checklist of North American Birds (PDF). Washington, DC: American Ornithologists' Union. ISBN 978-1-891276-00-2. Archived (PDF) from the original on 2022-10-09. +- Carboneras, Carlos (1992). del Hoyo, Josep; Elliott, Andrew; Sargatal, Jordi (eds.). Handbook of the Birds of the World. Vol. 1: Ostrich to Ducks. Barcelona: Lynx Edicions. ISBN 978-84-87334-10-8. +- Christidis, Les; Boles, Walter E., eds. (2008). Systematics and Taxonomy of Australian Birds. Collingwood, VIC: Csiro Publishing. ISBN 978-0-643-06511-6. +- Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). "A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis". Molecular Phylogenetics and Evolution. 23 (3): 339–356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792. +- Elphick, Chris; Dunning, John B. Jr.; Sibley, David, eds. (2001). The Sibley Guide to Bird Life and Behaviour. London: Christopher Helm. ISBN 978-0-7136-6250-4. +- Erlandson, Jon M. (1994). Early Hunter-Gatherers of the California Coast. New York, NY: Springer Science & Business Media. ISBN 978-1-4419-3231-0. +- Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A–K. Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4. +- Fitter, Julian; Fitter, Daniel; Hosking, David (2000). Wildlife of the Galápagos. Princeton, NJ: Princeton University Press. ISBN 978-0-691-10295-5. +- Higman, B. W. (2012). How Food Made History. Chichester, UK: John Wiley & Sons. ISBN 978-1-4051-8947-7. +- Hume, Julian H. (2012). Extinct Birds. London: Christopher Helm. ISBN 978-1-4729-3744-5. +- Jeffries, Richard (2008). Holocene Hunter-Gatherers of the Lower Ohio River Valley. Tuscaloosa: University of Alabama Press. ISBN 978-0-8173-1658-7. +- Kear, Janet, ed. (2005). Ducks, Geese and Swans: Species Accounts (Cairina to Mergus). Bird Families of the World. Oxford: Oxford University Press. ISBN 978-0-19-861009-0. +- Livezey, Bradley C. (October 1986). "A phylogenetic analysis of recent Anseriform genera using morphological characters" (PDF). The Auk. 103 (4): 737–754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09. +- Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). "A partial classification of waterfowl (Anatidae) based on single-copy DNA" (PDF). The Auk. 105 (3): 452–459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09. +- Maisels, Charles Keith (1999). Early Civilizations of the Old World. London: Routledge. ISBN 978-0-415-10975-8. +- Pratt, H. Douglas; Bruner, Phillip L.; Berrett, Delwyn G. (1987). A Field Guide to the Birds of Hawaii and the Tropical Pacific. Princeton, NJ: Princeton University Press. ISBN 0-691-02399-9. +- Rau, Charles (1876). Early Man in Europe. New York: Harper & Brothers. LCCN 05040168. +- Shirihai, Hadoram (2008). A Complete Guide to Antarctic Wildlife. Princeton, NJ, US: Princeton University Press. ISBN 978-0-691-13666-0. +- Sued-Badillo, Jalil (2003). Autochthonous Societies. General History of the Caribbean. Paris: UNESCO. ISBN 978-92-3-103832-7. +- Thorpe, I. J. (1996). The Origins of Agriculture in Europe. New York: Routledge. ISBN 978-0-415-08009-5. + +## External links + +- Definitions from Wiktionary +- Media from Commons +- Quotations from Wikiquote +- Recipes from Wikibooks +- Taxa from Wikispecies +- Data from Wikidata + +- list of books (useful looking abstracts) +- Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine +- +- Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl + +| Authority control databases | Authority control databases | +|--------------------------------|----------------------------------------------| +| National | United StatesFranceBnF dataJapanLatviaIsrael | +| Other | IdRef | + + + +- Ducks +- Game birds +- Bird common names + +- All accuracy disputes +- Accuracy disputes from February 2020 +- CS1 Finnish-language sources (fi) +- CS1 Latvian-language sources (lv) +- CS1 Swedish-language sources (sv) +- Articles with short description +- Short description is different from Wikidata +- Wikipedia indefinitely move-protected pages +- Wikipedia indefinitely semi-protected pages +- Articles with 'species' microformats +- Articles containing Old English (ca. 450-1100)-language text +- Articles containing Dutch-language text +- Articles containing German-language text +- Articles containing Norwegian-language text +- Articles containing Lithuanian-language text +- Articles containing Ancient Greek (to 1453)-language text +- All articles with self-published sources +- Articles with self-published sources from February 2020 +- All articles with unsourced statements +- Articles with unsourced statements from January 2022 +- CS1: long volume value +- Pages using Sister project links with wikidata mismatch +- Pages using Sister project links with hidden wikidata +- Webarchive template wayback links +- Articles with Project Gutenberg links +- Articles containing video clips + +- This page was last edited on 21 September 2024, at 12:11 (UTC). +- Text is available under the Creative Commons Attribution-ShareAlike License 4.0; +additional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization. + +- Privacy policy +- About Wikipedia +- Disclaimers +- Contact Wikipedia +- Code of Conduct +- Developers +- Statistics +- Cookie statement +- Mobile view + +- +- \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt new file mode 100644 index 00000000..dbce1f7a --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt @@ -0,0 +1,29 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: Summer activities + item-2 at level 1: title: Swimming in the lake + item-3 at level 2: paragraph: Duck + item-4 at level 2: paragraph: + item-5 at level 2: paragraph: Figure 1: This is a cute duckling + item-6 at level 2: section_header: Let’s swim! + item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: + item-8 at level 3: list: group list + item-9 at level 4: list_item: You can relax and look around + item-10 at level 4: list_item: Paddle about + item-11 at level 4: list_item: Enjoy summer warmth + item-12 at level 3: paragraph: Also, don’t forget: + item-13 at level 3: list: group list + item-14 at level 4: list_item: Wear sunglasses + item-15 at level 4: list_item: Don’t forget to drink water + item-16 at level 4: list_item: Use sun cream + item-17 at level 3: paragraph: Hmm, what else… + item-18 at level 3: section_header: Let’s eat + item-19 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice + item-20 at level 4: paragraph: I like to eat leaves + item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat: + item-22 at level 4: table with [4x3] + item-23 at level 4: paragraph: + item-24 at level 4: paragraph: And let’s add another list in the end: + item-25 at level 4: list: group list + item-26 at level 5: list_item: Leaves + item-27 at level 5: list_item: Berries + item-28 at level 5: list_item: Grain \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json new file mode 100644 index 00000000..a1ccaa13 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -0,0 +1,749 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "word_sample", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 5964280909995938039, + "filename": "word_sample.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/texts/5" + }, + "children": [ + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/5" + }, + "children": [ + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + } + ], + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/15" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + } + ], + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Summer activities", + "text": "Summer activities" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "label": "title", + "prov": [], + "orig": "Swimming in the lake", + "text": "Swimming in the lake" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Duck", + "text": "Duck" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Figure 1: This is a cute duckling", + "text": "Figure 1: This is a cute duckling" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + } + ], + "label": "section_header", + "prov": [], + "orig": "Let\u2019s swim!", + "text": "Let\u2019s swim!", + "level": 1 + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/5" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "To get started with swimming, first lay down in a water and try not to drown:", + "text": "To get started with swimming, first lay down in a water and try not to drown:" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "You can relax and look around", + "text": "You can relax and look around", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Paddle about", + "text": "Paddle about", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Enjoy summer warmth", + "text": "Enjoy summer warmth", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/5" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Also, don\u2019t forget:", + "text": "Also, don\u2019t forget:" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Wear sunglasses", + "text": "Wear sunglasses", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Don\u2019t forget to drink water", + "text": "Don\u2019t forget to drink water", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Use sun cream", + "text": "Use sun cream", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/texts/5" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Hmm, what else\u2026", + "text": "Hmm, what else\u2026" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/texts/5" + }, + "children": [ + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/groups/2" + } + ], + "label": "section_header", + "prov": [], + "orig": "Let\u2019s eat", + "text": "Let\u2019s eat", + "level": 2 + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/texts/15" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice", + "text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/texts/15" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "I like to eat leaves", + "text": "I like to eat leaves" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/texts/15" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Here are some interesting things a respectful duck could eat:", + "text": "Here are some interesting things a respectful duck could eat:" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/texts/15" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/texts/15" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "And let\u2019s add another list in the end:", + "text": "And let\u2019s add another list in the end:" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Leaves", + "text": "Leaves", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Berries", + "text": "Berries", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "label": "list_item", + "prov": [], + "orig": "Grain", + "text": "Grain", + "enumerated": false, + "marker": "-" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/15" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Food", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Calories per portion", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Leaves", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Ash, Elm, Maple", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "50", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Berries", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Blueberry, Strawberry, Cranberry", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Grain", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Corn, Buckwheat, Barley", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "200", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 4, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Food", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Calories per portion", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Leaves", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Ash, Elm, Maple", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "50", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Berries", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Blueberry, Strawberry, Cranberry", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Grain", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Corn, Buckwheat, Barley", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "200", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.md b/tests/data/groundtruth/docling_v2/word_sample.docx.md new file mode 100644 index 00000000..639c8780 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.md @@ -0,0 +1,43 @@ +Summer activities + +# Swimming in the lake + +Duck + +Figure 1: This is a cute duckling + +## Let’s swim! + +To get started with swimming, first lay down in a water and try not to drown: + +- You can relax and look around +- Paddle about +- Enjoy summer warmth + +Also, don’t forget: + +- Wear sunglasses +- Don’t forget to drink water +- Use sun cream + +Hmm, what else… + +### Let’s eat + +After we had a good day of swimming in the lake, it’s important to eat something nice + +I like to eat leaves + +Here are some interesting things a respectful duck could eat: + +| | Food | Calories per portion | +|---------|----------------------------------|------------------------| +| Leaves | Ash, Elm, Maple | 50 | +| Berries | Blueberry, Strawberry, Cranberry | 150 | +| Grain | Corn, Buckwheat, Barley | 200 | + +And let’s add another list in the end: + +- Leaves +- Berries +- Grain \ No newline at end of file diff --git a/tests/data/html/example_01.html b/tests/data/html/example_01.html new file mode 100644 index 00000000..792dc6c2 --- /dev/null +++ b/tests/data/html/example_01.html @@ -0,0 +1,17 @@ + + +

Introduction

+

This is the first paragraph of the introduction.

+

Background

+

Some background information here.

+ Example image +
    +
  • First item in unordered list
  • +
  • Second item in unordered list
  • +
+
    +
  1. First item in ordered list
  2. +
  3. Second item in ordered list
  4. +
+ + diff --git a/tests/data/html/example_02.html b/tests/data/html/example_02.html new file mode 100644 index 00000000..5f7c4d63 --- /dev/null +++ b/tests/data/html/example_02.html @@ -0,0 +1,16 @@ + + +

Introduction

+

This is the first paragraph of the introduction.

+

Background

+

Some background information here.

+
    +
  • First item in unordered list
  • +
  • Second item in unordered list
  • +
+
    +
  1. First item in ordered list
  2. +
  3. Second item in ordered list
  4. +
+ + diff --git a/tests/data/html/example_03.html b/tests/data/html/example_03.html new file mode 100644 index 00000000..c3252a73 --- /dev/null +++ b/tests/data/html/example_03.html @@ -0,0 +1,66 @@ + + + + + +

Example Document

+

Introduction

+

This is the first paragraph of the introduction.

+

Background

+

Some background information here.

+
    +
  • First item in unordered list +
      +
    • Nested item 1
    • +
    • Nested item 2
    • +
    +
  • +
  • Second item in unordered list
  • +
+
    +
  1. First item in ordered list +
      +
    1. Nested ordered item 1
    2. +
    3. Nested ordered item 2
    4. +
    +
  2. +
  3. Second item in ordered list
  4. +
+

Data Table

+ + + + + + + + + + + + + + + + + + + + + +
Header 1Header 2Header 3
Row 1, Col 1Row 1, Col 2Row 1, Col 3
Row 2, Col 1Row 2, Col 2Row 2, Col 3
Row 3, Col 1Row 3, Col 2Row 3, Col 3
+ + diff --git a/tests/data/html/example_04.html b/tests/data/html/example_04.html new file mode 100644 index 00000000..0f62771b --- /dev/null +++ b/tests/data/html/example_04.html @@ -0,0 +1,24 @@ + + +

Data Table with Rowspan and Colspan

+ + + + + + + + + + + + + + + + + + +
Header 1Header 2 & 3 (colspan)
Row 1 & 2, Col 1 (rowspan)Row 1, Col 2Row 1, Col 3
Row 2, Col 2 & 3 (colspan)
Row 3, Col 1Row 3, Col 2Row 3, Col 3
+ + diff --git a/tests/data/html/unit_test_01.html b/tests/data/html/unit_test_01.html new file mode 100644 index 00000000..82a696c4 --- /dev/null +++ b/tests/data/html/unit_test_01.html @@ -0,0 +1,11 @@ + + +

Title

+

section-1

+

section-1.1

+

section-2

+

section-2.0.1

+

section-2.2

+

section-2.3

+ + diff --git a/tests/data/wiki_duck.html b/tests/data/html/wiki_duck.html similarity index 100% rename from tests/data/wiki_duck.html rename to tests/data/html/wiki_duck.html diff --git a/tests/data/powerpoint_sample.pptx b/tests/data/pptx/powerpoint_sample.pptx similarity index 100% rename from tests/data/powerpoint_sample.pptx rename to tests/data/pptx/powerpoint_sample.pptx diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index f5ec0351..3bd27242 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,12 +1,21 @@ +import json +import os from pathlib import Path from docling.backend.html_backend import HTMLDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument, SectionHeaderItem +from docling.datamodel.document import ( + ConversionResult, + InputDocument, + SectionHeaderItem, +) +from docling.document_converter import DocumentConverter + +GENERATE = False def test_heading_levels(): - in_path = Path("tests/data/wiki_duck.html") + in_path = Path("tests/data/html/wiki_duck.html") in_doc = InputDocument( path_or_stream=in_path, format=InputFormat.HTML, @@ -28,3 +37,66 @@ def test_heading_levels(): found_lvl_3 = True assert item.level == 3 assert found_lvl_2 and found_lvl_3 + + +def get_html_paths(): + + # Define the directory you want to search + directory = Path("./tests/data/html/") + + # List all PDF files in the directory and its subdirectories + html_files = sorted(directory.rglob("*.html")) + return html_files + + +def get_converter(): + + converter = DocumentConverter(allowed_formats=[InputFormat.HTML]) + + return converter + + +def verify_export(pred_text: str, gtfile: str): + + if not os.path.exists(gtfile) or GENERATE: + with open(gtfile, "w") as fw: + fw.write(pred_text) + + return True + + else: + with open(gtfile, "r") as fr: + true_text = fr.read() + + assert pred_text == true_text, f"pred_text!=true_text for {gtfile}" + return pred_text == true_text + + +def test_e2e_html_conversions(): + + html_paths = get_html_paths() + converter = get_converter() + + for html_path in html_paths: + # print(f"converting {html_path}") + + gt_path = ( + html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name + ) + + conv_result: ConversionResult = converter.convert(html_path) + + doc: DoclingDocument = conv_result.document + + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export( + pred_itxt, str(gt_path) + ".itxt" + ), "export to indented-text" + + pred_json: str = json.dumps(doc.export_to_dict(), indent=2) + assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 4544e717..24db6775 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -1,12 +1,21 @@ +import json +import os from pathlib import Path from docling.backend.msword_backend import MsWordDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument, SectionHeaderItem +from docling.datamodel.document import ( + ConversionResult, + InputDocument, + SectionHeaderItem, +) +from docling.document_converter import DocumentConverter + +GENERATE = False def test_heading_levels(): - in_path = Path("tests/data/word_sample.docx") + in_path = Path("tests/data/docx/word_sample.docx") in_doc = InputDocument( path_or_stream=in_path, format=InputFormat.DOCX, @@ -28,3 +37,66 @@ def test_heading_levels(): found_lvl_2 = True assert item.level == 2 assert found_lvl_1 and found_lvl_2 + + +def get_docx_paths(): + + # Define the directory you want to search + directory = Path("./tests/data/docx/") + + # List all PDF files in the directory and its subdirectories + pdf_files = sorted(directory.rglob("*.docx")) + return pdf_files + + +def get_converter(): + + converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) + + return converter + + +def verify_export(pred_text: str, gtfile: str): + + if not os.path.exists(gtfile) or GENERATE: + with open(gtfile, "w") as fw: + fw.write(pred_text) + + return True + + else: + with open(gtfile, "r") as fr: + true_text = fr.read() + + assert pred_text == true_text, "pred_itxt==true_itxt" + return pred_text == true_text + + +def test_e2e_docx_conversions(): + + docx_paths = get_docx_paths() + converter = get_converter() + + for docx_path in docx_paths: + # print(f"converting {docx_path}") + + gt_path = ( + docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name + ) + + conv_result: ConversionResult = converter.convert(docx_path) + + doc: DoclingDocument = conv_result.document + + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export( + pred_itxt, str(gt_path) + ".itxt" + ), "export to indented-text" + + pred_json: str = json.dumps(doc.export_to_dict(), indent=2) + assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 1e166116..4911c248 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -43,7 +43,7 @@ def get_converter(): return converter -def test_e2e_conversions(): +def test_e2e_pdfs_conversions(): pdf_paths = get_pdf_paths() converter = get_converter() diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index 4f764542..28800edd 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -11,10 +11,10 @@ @pytest.fixture def test_doc_paths(): return [ - Path("tests/data/wiki_duck.html"), - Path("tests/data/word_sample.docx"), - Path("tests/data/lorem_ipsum.docx"), - Path("tests/data/powerpoint_sample.pptx"), + Path("tests/data/html/wiki_duck.html"), + Path("tests/data/docx/word_sample.docx"), + Path("tests/data/docx/lorem_ipsum.docx"), + Path("tests/data/pptx/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/2206.01062.pdf"), ]