diff --git a/README.md b/README.md index 09aa6d1..82d4c99 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,13 @@ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![License MIT](https://img.shields.io/github/license/DS4SD/quackling)](https://opensource.org/licenses/MIT) -Quackling enables document-native generative AI applications, such as RAG, based on [Docling](https://github.com/DS4SD/docling). +Easily build document-native generative AI applications, such as RAG, leveraging [Docling](https://github.com/DS4SD/docling)'s efficient PDF extraction and rich data model — while still using your favorite framework, [🦙 LlamaIndex](https://docs.llamaindex.ai/en/stable/) or [🦜🔗 LangChain](https://python.langchain.com/). ## Features - 🧠 Enables rich gen AI applications by providing capabilities on native document level — not just plain text / Markdown! - ⚡️ Leverages Docling's conversion quality and speed. -- ⚙️ Integrates with standard LLM application frameworks, such as LlamaIndex, for building powerful applications like RAG. +- ⚙️ Plug-and-play integration with LlamaIndex and LangChain for building powerful applications like RAG.

@@ -40,12 +40,11 @@ pip install quackling ## Usage -Quackling offers core capabilities (`quackling.core`), as well as framework integration components -e.g. for LlamaIndex (`quackling.llama_index`). Below you find examples of both. +Quackling offers core capabilities (`quackling.core`), as well as framework integration components (`quackling.llama_index` and `quackling.langchain`). Below you can find examples of both. ### Basic RAG -Below you find a basic RAG pipeline using LlamaIndex. +Here is a basic RAG pipeline using LlamaIndex: > [!NOTE] > To use as is, first `pip install llama-index-embeddings-huggingface llama-index-llms-huggingface-api` @@ -67,7 +66,7 @@ QUESTION = "How many pages were human annotated?" EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5") LLM = HuggingFaceInferenceAPI( token=os.getenv("HF_TOKEN"), - model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", + model_name="mistralai/Mistral-7B-Instruct-v0.3", ) index = VectorStoreIndex.from_documents( @@ -105,13 +104,18 @@ chunks = list(HierarchicalChunker().chunk(doc)) ``` ## More examples -Check out the [examples](examples) — showcasing different variants of RAG incl. vector ingestion & retrieval: -- [[LlamaIndex] Milvus basic RAG (dense embeddings)](examples/basic_pipeline.ipynb) -- [[LlamaIndex] Milvus hybrid RAG (dense & sparse embeddings combined e.g. via RRF) & reranker model usage](examples/hybrid_pipeline.ipynb) -- [[LlamaIndex] Milvus RAG also fetching native document metadata for search results](examples/native_nodes.ipynb) -- [[LlamaIndex] Local node transformations (e.g. embeddings)](examples/node_transformations.ipynb) + +### LlamaIndex + +- [Milvus basic RAG (dense embeddings)](examples/llama_index/basic_pipeline.ipynb) +- [Milvus hybrid RAG (dense & sparse embeddings combined e.g. via RRF) & reranker model usage](examples/llama_index/hybrid_pipeline.ipynb) +- [Milvus RAG also fetching native document metadata for search results](examples/llama_index/native_nodes.ipynb) +- [Local node transformations (e.g. embeddings)](examples/llama_index/node_transformations.ipynb) - ... +### LangChain + +- [Milvus basic RAG (dense embeddings)](examples/langchain/basic_pipeline.ipynb) + ## Contributing Please read [Contributing to Quackling](./CONTRIBUTING.md) for details.
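For reviewers, here is a minimal sketch of the new LangChain path the README now advertises, using only the `DoclingPDFLoader` and `HierarchicalJSONSplitter` APIs introduced in this PR (the input file is illustrative):

```python
# Minimal sketch of the new LangChain path (APIs as introduced in this PR;
# the input file is an illustrative example, the same one the notebook uses):
from quackling.langchain.loaders import DoclingPDFLoader
from quackling.langchain.splitters import HierarchicalJSONSplitter

FILE_PATH = "https://arxiv.org/pdf/2206.01062"  # DocLayNet paper

# Parse to Docling JSON so the splitter can exploit the document structure:
loader = DoclingPDFLoader(
    file_path=FILE_PATH,
    parse_type=DoclingPDFLoader.ParseType.JSON,
)
docs = loader.load()  # one LangChain Document per input file
splits = HierarchicalJSONSplitter().split_documents(docs)  # one per chunk
```

Setting the parse type to JSON keeps Docling's document structure available to the splitter; this is the design choice the new example notebook below walks through.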
diff --git a/examples/langchain/basic_pipeline.ipynb b/examples/langchain/basic_pipeline.ipynb new file mode 100644 index 0000000..9416f4b --- /dev/null +++ b/examples/langchain/basic_pipeline.ipynb @@ -0,0 +1,350 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# requirements for this example:\n", + "%pip install -qq \\\n", + " quackling \\\n", + " python-dotenv \\\n", + " langchain-text-splitters \\\n", + " langchain-huggingface \\\n", + " langchain-milvus" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", + "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loader and splitter" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from quackling.langchain.loaders import DoclingPDFLoader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we set up:\n", + "- a `Loader`, which will be used to create LangChain documents, and\n", + "- a splitter, which will be used to split these documents." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Using JSON" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To leverage Docling's rich document structure format, we set the parse type to JSON and use a `HierarchicalJSONSplitter` accordingly:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "37a35b6883bd444293bae3a589be56e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 7 files: 0%| | 0/7 [00:00=19.2.0" +[[package]] +name = "jsonpatch" +version = "1.33" +description = "Apply JSON-Patches (RFC 6902)" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" +files = [ + {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"}, + {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"}, +] + +[package.dependencies] +jsonpointer = ">=1.9" + [[package]] name = "jsonpath-ng" version = "1.6.1" @@ -2245,6 +2259,17 @@ files = [ [package.dependencies] ply = "*" +[[package]] +name = "jsonpointer" +version = "3.0.0" +description = "Identify specific nodes in a JSON document (RFC 6901)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, + {file = "jsonpointer-3.0.0.tar.gz", hash = 
"sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, +] + [[package]] name = "jsonref" version = "1.1.0" @@ -2480,6 +2505,100 @@ files = [ {file = "kiwisolver-1.4.5.tar.gz", hash = "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec"}, ] +[[package]] +name = "langchain-core" +version = "0.2.38" +description = "Building applications with LLMs through composability" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_core-0.2.38-py3-none-any.whl", hash = "sha256:8a5729bc7e68b4af089af20eff44fe4e7ca21d0e0c87ec21cef7621981fd1a4a"}, + {file = "langchain_core-0.2.38.tar.gz", hash = "sha256:eb69dbedd344f2ee1f15bcea6c71a05884b867588fadc42d04632e727c1238f3"}, +] + +[package.dependencies] +jsonpatch = ">=1.33,<2.0" +langsmith = ">=0.1.75,<0.2.0" +packaging = ">=23.2,<25" +pydantic = [ + {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, +] +PyYAML = ">=5.3" +tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" +typing-extensions = ">=4.7" + +[[package]] +name = "langchain-huggingface" +version = "0.0.3" +description = "An integration package connecting Hugging Face and LangChain" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_huggingface-0.0.3-py3-none-any.whl", hash = "sha256:d6827adf3c7c8fcc0bca8c43c7e900c3bf68af9a1532a83d4b8ace137e02887e"}, + {file = "langchain_huggingface-0.0.3.tar.gz", hash = "sha256:0637acf484c47323cf3dcc46745a93467f6955989af9b7c01e2382fe1b630aaf"}, +] + +[package.dependencies] +huggingface-hub = ">=0.23.0" +langchain-core = ">=0.1.52,<0.3" +sentence-transformers = ">=2.6.0" +tokenizers = ">=0.19.1" +transformers = ">=4.39.0" + +[[package]] +name = "langchain-milvus" +version = "0.1.4" +description = "An integration package connecting Milvus and LangChain" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_milvus-0.1.4-py3-none-any.whl", hash = "sha256:f5c1f2d023c6853d1acc22dc8d0b61ca4d99015c1b095b0cf84ec84a9ba2936e"}, + {file = "langchain_milvus-0.1.4.tar.gz", hash = "sha256:1cd67f127d60c73ffb07cd789705766479137630d43f8ff547c69eee4775dae8"}, +] + +[package.dependencies] +langchain-core = ">=0.2.20,<0.3.0" +pymilvus = ">=2.4.3,<3.0.0" +scipy = [ + {version = ">=1.7,<2.0", markers = "python_version < \"3.12\""}, + {version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""}, +] + +[[package]] +name = "langchain-text-splitters" +version = "0.2.4" +description = "LangChain text splitting utilities" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_text_splitters-0.2.4-py3-none-any.whl", hash = "sha256:2702dee5b7cbdd595ccbe43b8d38d01a34aa8583f4d6a5a68ad2305ae3e7b645"}, + {file = "langchain_text_splitters-0.2.4.tar.gz", hash = "sha256:f7daa7a3b0aa8309ce248e2e2b6fc8115be01118d336c7f7f7dfacda0e89bf29"}, +] + +[package.dependencies] +langchain-core = ">=0.2.38,<0.3.0" + +[[package]] +name = "langsmith" +version = "0.1.116" +description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
+optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langsmith-0.1.116-py3-none-any.whl", hash = "sha256:4b5ea64c81ba5ca309695c85dc3fb4617429a985129ed5d9eca00d1c9d6483f4"}, + {file = "langsmith-0.1.116.tar.gz", hash = "sha256:5ccd7f5c1840f7c507ab3ee56334a1391de28c8bf72669782e2d82cafeefffa7"}, +] + +[package.dependencies] +httpx = ">=0.23.0,<1" +orjson = ">=3.9.14,<4.0.0" +pydantic = [ + {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, +] +requests = ">=2,<3" + [[package]] name = "lazy-loader" version = "0.4" @@ -3706,6 +3825,72 @@ numpy = [ {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] +[[package]] +name = "orjson" +version = "3.10.7" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.8" +files = [ + {file = "orjson-3.10.7-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:74f4544f5a6405b90da8ea724d15ac9c36da4d72a738c64685003337401f5c12"}, + {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34a566f22c28222b08875b18b0dfbf8a947e69df21a9ed5c51a6bf91cfb944ac"}, + {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bf6ba8ebc8ef5792e2337fb0419f8009729335bb400ece005606336b7fd7bab7"}, + {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac7cf6222b29fbda9e3a472b41e6a5538b48f2c8f99261eecd60aafbdb60690c"}, + {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de817e2f5fc75a9e7dd350c4b0f54617b280e26d1631811a43e7e968fa71e3e9"}, + {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:348bdd16b32556cf8d7257b17cf2bdb7ab7976af4af41ebe79f9796c218f7e91"}, + {file = "orjson-3.10.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:479fd0844ddc3ca77e0fd99644c7fe2de8e8be1efcd57705b5c92e5186e8a250"}, + {file = "orjson-3.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fdf5197a21dd660cf19dfd2a3ce79574588f8f5e2dbf21bda9ee2d2b46924d84"}, + {file = "orjson-3.10.7-cp310-none-win32.whl", hash = "sha256:d374d36726746c81a49f3ff8daa2898dccab6596864ebe43d50733275c629175"}, + {file = "orjson-3.10.7-cp310-none-win_amd64.whl", hash = "sha256:cb61938aec8b0ffb6eef484d480188a1777e67b05d58e41b435c74b9d84e0b9c"}, + {file = "orjson-3.10.7-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7db8539039698ddfb9a524b4dd19508256107568cdad24f3682d5773e60504a2"}, + {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:480f455222cb7a1dea35c57a67578848537d2602b46c464472c995297117fa09"}, + {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a9c9b168b3a19e37fe2778c0003359f07822c90fdff8f98d9d2a91b3144d8e0"}, + {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8de062de550f63185e4c1c54151bdddfc5625e37daf0aa1e75d2a1293e3b7d9a"}, + {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6b0dd04483499d1de9c8f6203f8975caf17a6000b9c0c54630cef02e44ee624e"}, + {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b58d3795dafa334fc8fd46f7c5dc013e6ad06fd5b9a4cc98cb1456e7d3558bd6"}, + {file = "orjson-3.10.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:33cfb96c24034a878d83d1a9415799a73dc77480e6c40417e5dda0710d559ee6"}, + {file = "orjson-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e724cebe1fadc2b23c6f7415bad5ee6239e00a69f30ee423f319c6af70e2a5c0"}, + {file = "orjson-3.10.7-cp311-none-win32.whl", hash = "sha256:82763b46053727a7168d29c772ed5c870fdae2f61aa8a25994c7984a19b1021f"}, + {file = "orjson-3.10.7-cp311-none-win_amd64.whl", hash = "sha256:eb8d384a24778abf29afb8e41d68fdd9a156cf6e5390c04cc07bbc24b89e98b5"}, + {file = "orjson-3.10.7-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:44a96f2d4c3af51bfac6bc4ef7b182aa33f2f054fd7f34cc0ee9a320d051d41f"}, + {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76ac14cd57df0572453543f8f2575e2d01ae9e790c21f57627803f5e79b0d3c3"}, + {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bdbb61dcc365dd9be94e8f7df91975edc9364d6a78c8f7adb69c1cdff318ec93"}, + {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b48b3db6bb6e0a08fa8c83b47bc169623f801e5cc4f24442ab2b6617da3b5313"}, + {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23820a1563a1d386414fef15c249040042b8e5d07b40ab3fe3efbfbbcbcb8864"}, + {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0c6a008e91d10a2564edbb6ee5069a9e66df3fbe11c9a005cb411f441fd2c09"}, + {file = "orjson-3.10.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d352ee8ac1926d6193f602cbe36b1643bbd1bbcb25e3c1a657a4390f3000c9a5"}, + {file = "orjson-3.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d2d9f990623f15c0ae7ac608103c33dfe1486d2ed974ac3f40b693bad1a22a7b"}, + {file = "orjson-3.10.7-cp312-none-win32.whl", hash = "sha256:7c4c17f8157bd520cdb7195f75ddbd31671997cbe10aee559c2d613592e7d7eb"}, + {file = "orjson-3.10.7-cp312-none-win_amd64.whl", hash = "sha256:1d9c0e733e02ada3ed6098a10a8ee0052dd55774de3d9110d29868d24b17faa1"}, + {file = "orjson-3.10.7-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:77d325ed866876c0fa6492598ec01fe30e803272a6e8b10e992288b009cbe149"}, + {file = "orjson-3.10.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ea2c232deedcb605e853ae1db2cc94f7390ac776743b699b50b071b02bea6fe"}, + {file = "orjson-3.10.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3dcfbede6737fdbef3ce9c37af3fb6142e8e1ebc10336daa05872bfb1d87839c"}, + {file = "orjson-3.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11748c135f281203f4ee695b7f80bb1358a82a63905f9f0b794769483ea854ad"}, + {file = "orjson-3.10.7-cp313-none-win32.whl", hash = "sha256:a7e19150d215c7a13f39eb787d84db274298d3f83d85463e61d277bbd7f401d2"}, + {file = "orjson-3.10.7-cp313-none-win_amd64.whl", hash = "sha256:eef44224729e9525d5261cc8d28d6b11cafc90e6bd0be2157bde69a52ec83024"}, + {file = "orjson-3.10.7-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6ea2b2258eff652c82652d5e0f02bd5e0463a6a52abb78e49ac288827aaa1469"}, + {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:430ee4d85841e1483d487e7b81401785a5dfd69db5de01314538f31f8fbf7ee1"}, + {file = 
"orjson-3.10.7-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b6146e439af4c2472c56f8540d799a67a81226e11992008cb47e1267a9b3225"}, + {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:084e537806b458911137f76097e53ce7bf5806dda33ddf6aaa66a028f8d43a23"}, + {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4829cf2195838e3f93b70fd3b4292156fc5e097aac3739859ac0dcc722b27ac0"}, + {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1193b2416cbad1a769f868b1749535d5da47626ac29445803dae7cc64b3f5c98"}, + {file = "orjson-3.10.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:4e6c3da13e5a57e4b3dca2de059f243ebec705857522f188f0180ae88badd354"}, + {file = "orjson-3.10.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c31008598424dfbe52ce8c5b47e0752dca918a4fdc4a2a32004efd9fab41d866"}, + {file = "orjson-3.10.7-cp38-none-win32.whl", hash = "sha256:7122a99831f9e7fe977dc45784d3b2edc821c172d545e6420c375e5a935f5a1c"}, + {file = "orjson-3.10.7-cp38-none-win_amd64.whl", hash = "sha256:a763bc0e58504cc803739e7df040685816145a6f3c8a589787084b54ebc9f16e"}, + {file = "orjson-3.10.7-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e76be12658a6fa376fcd331b1ea4e58f5a06fd0220653450f0d415b8fd0fbe20"}, + {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed350d6978d28b92939bfeb1a0570c523f6170efc3f0a0ef1f1df287cd4f4960"}, + {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:144888c76f8520e39bfa121b31fd637e18d4cc2f115727865fdf9fa325b10412"}, + {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09b2d92fd95ad2402188cf51573acde57eb269eddabaa60f69ea0d733e789fe9"}, + {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b24a579123fa884f3a3caadaed7b75eb5715ee2b17ab5c66ac97d29b18fe57f"}, + {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591bcfe7512353bd609875ab38050efe3d55e18934e2f18950c108334b4ff"}, + {file = "orjson-3.10.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f4db56635b58cd1a200b0a23744ff44206ee6aa428185e2b6c4a65b3197abdcd"}, + {file = "orjson-3.10.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0fa5886854673222618638c6df7718ea7fe2f3f2384c452c9ccedc70b4a510a5"}, + {file = "orjson-3.10.7-cp39-none-win32.whl", hash = "sha256:8272527d08450ab16eb405f47e0f4ef0e5ff5981c3d82afe0efd25dcbef2bcd2"}, + {file = "orjson-3.10.7-cp39-none-win_amd64.whl", hash = "sha256:974683d4618c0c7dbf4f69c95a979734bf183d0658611760017f6e70a145af58"}, + {file = "orjson-3.10.7.tar.gz", hash = "sha256:75ef0640403f945f3a1f9f6400686560dbfb0fb5b16589ad62cd477043c4eee3"}, +] + [[package]] name = "packaging" version = "24.1" @@ -7480,9 +7665,9 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] -examples = ["flagembedding", "jsonpath-ng", "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-postprocessor-flag-embedding-reranker", "llama-index-vector-stores-milvus", "peft", "python-dotenv"] +examples = ["flagembedding", "jsonpath-ng", "langchain-huggingface", "langchain-milvus", "langchain-text-splitters", "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", 
"llama-index-postprocessor-flag-embedding-reranker", "llama-index-vector-stores-milvus", "peft", "python-dotenv"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9341adde375afd06a61daae54b8b87383fb98868f8e5b2c7f32c9d3cb3bcf897" +content-hash = "417e7168718425cb9f2c3d487329d75c7ffbd47733ce88861209f0ac5f61d509" diff --git a/pyproject.toml b/pyproject.toml index 1eaf8c8..ca1576a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ python = "^3.10" docling-core = "^1.1.2" llama-index-core = "^0.11.1" +langchain-core = "^0.2.38" docling = "^1.8.2" ######### @@ -49,6 +50,9 @@ llama-index-postprocessor-flag-embedding-reranker = {version = "^0.2.0", option flagembedding = { version = "^1.2.10", optional = true } peft = { version = "^0.12.0", optional = true } # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297 jsonpath-ng = { version = "^1.6.1", optional = true } +langchain-huggingface = { version = "^0.0.3", optional = true} +langchain-milvus = { version = "^0.1.4", optional = true } +langchain-text-splitters = { version = "^0.2.4", optional = true } ############## # constraints: @@ -65,6 +69,7 @@ torchvision = [ [tool.poetry.extras] examples = [ "python-dotenv", + # LlamaIndex examples: "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-vector-stores-milvus", @@ -72,6 +77,10 @@ examples = [ "flagembedding", "peft", # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297 "jsonpath-ng", + # LangChain examples: + "langchain-huggingface", + "langchain-milvus", + "langchain-text-splitters", ] [tool.poetry.group.dev.dependencies] diff --git a/quackling/langchain/__init__.py b/quackling/langchain/__init__.py new file mode 100644 index 0000000..bacc58a --- /dev/null +++ b/quackling/langchain/__init__.py @@ -0,0 +1,4 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# diff --git a/quackling/langchain/loaders/__init__.py b/quackling/langchain/loaders/__init__.py new file mode 100644 index 0000000..31e2d18 --- /dev/null +++ b/quackling/langchain/loaders/__init__.py @@ -0,0 +1,6 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# + +from quackling.langchain.loaders.docling_pdf_loader import DoclingPDFLoader # noqa diff --git a/quackling/langchain/loaders/base.py b/quackling/langchain/loaders/base.py new file mode 100644 index 0000000..aa756bd --- /dev/null +++ b/quackling/langchain/loaders/base.py @@ -0,0 +1,43 @@ +# +# Copyright IBM Corp. 
2024 - 2024 +# SPDX-License-Identifier: MIT +# + +from enum import Enum + +from docling.document_converter import DocumentConverter +from docling_core.types import Document as DLDocument +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document as LCDocument +from pydantic import BaseModel + + +class DocumentMetadata(BaseModel): + dl_doc_hash: str + # source: str + + +class BaseDoclingLoader(BaseLoader): + class ParseType(str, Enum): + MARKDOWN = "markdown" + JSON = "json" + + def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None: + self._file_paths = file_path if isinstance(file_path, list) else [file_path] + self._parse_type = parse_type + self._converter = DocumentConverter() + + def _create_lc_doc_from_dl_doc(self, dl_doc: DLDocument) -> LCDocument: + if self._parse_type == self.ParseType.MARKDOWN: + text = dl_doc.export_to_markdown() + elif self._parse_type == self.ParseType.JSON: + text = dl_doc.model_dump_json() + else: + raise RuntimeError(f"Unexpected parse type encountered: {self._parse_type}") + lc_doc = LCDocument( + page_content=text, + metadata=DocumentMetadata( + dl_doc_hash=dl_doc.file_info.document_hash, + ).model_dump(), + ) + return lc_doc diff --git a/quackling/langchain/loaders/docling_pdf_loader.py b/quackling/langchain/loaders/docling_pdf_loader.py new file mode 100644 index 0000000..70a4eec --- /dev/null +++ b/quackling/langchain/loaders/docling_pdf_loader.py @@ -0,0 +1,19 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# + +from typing import Iterator + +from langchain_core.documents import Document as LCDocument + +from quackling.langchain.loaders.base import BaseDoclingLoader + + +class DoclingPDFLoader(BaseDoclingLoader): + + def lazy_load(self) -> Iterator[LCDocument]: + for source in self._file_paths: + dl_doc = self._converter.convert_single(source).output + lc_doc = self._create_lc_doc_from_dl_doc(dl_doc=dl_doc) + yield lc_doc diff --git a/quackling/langchain/splitters/__init__.py b/quackling/langchain/splitters/__init__.py new file mode 100644 index 0000000..2ea44be --- /dev/null +++ b/quackling/langchain/splitters/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# + +from quackling.langchain.splitters.hier_json_splitter import ( # noqa + HierarchicalJSONSplitter, +) diff --git a/quackling/langchain/splitters/hier_json_splitter.py b/quackling/langchain/splitters/hier_json_splitter.py new file mode 100644 index 0000000..d4ff65e --- /dev/null +++ b/quackling/langchain/splitters/hier_json_splitter.py @@ -0,0 +1,48 @@ +# +# Copyright IBM Corp. 
2024 - 2024 +# SPDX-License-Identifier: MIT +# + +from typing import Iterable, List + +from docling_core.types import Document as DLDocument +from langchain_core.documents import Document as LCDocument +from pydantic import BaseModel + +from quackling.core.chunkers.base import BaseChunker +from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker + + +class ChunkDocMetadata(BaseModel): + dl_doc_id: str + path: str + + +class HierarchicalJSONSplitter: + + def __init__( + self, + chunker: BaseChunker | None = None, + ) -> None: + self.chunker: BaseChunker = chunker or HierarchicalChunker() + + def split_documents(self, documents: Iterable[LCDocument]) -> List[LCDocument]: + + all_chunk_docs: list[LCDocument] = [] + for doc in documents: + lc_doc: LCDocument = LCDocument.parse_obj(doc) + dl_doc: DLDocument = DLDocument.model_validate_json(lc_doc.page_content) + chunk_iter = self.chunker.chunk(dl_doc=dl_doc) + chunk_docs = [ + LCDocument( + page_content=chunk.text, + metadata=ChunkDocMetadata( + dl_doc_id=dl_doc.file_info.document_hash, + path=chunk.path, + ).model_dump(), + ) + for chunk in chunk_iter + ] + all_chunk_docs.extend(chunk_docs) + + return all_chunk_docs
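As a complementary usage sketch: the `MARKDOWN` parse type pairs the same loader with a standard LangChain splitter instead (`langchain-text-splitters` is added as an optional dependency in this PR). The chunk sizes below are illustrative assumptions, not values taken from this PR:

```python
# Alternative Markdown path: export plain Markdown and chunk it with a
# standard LangChain splitter. Chunk size and overlap are illustrative
# assumptions for this sketch.
from langchain_text_splitters import RecursiveCharacterTextSplitter

from quackling.langchain.loaders import DoclingPDFLoader

loader = DoclingPDFLoader(
    file_path="https://arxiv.org/pdf/2206.01062",  # DocLayNet paper
    parse_type=DoclingPDFLoader.ParseType.MARKDOWN,
)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = splitter.split_documents(loader.load())
```

This trades the structure-aware chunking of `HierarchicalJSONSplitter` for plain character-based splitting, which is why the JSON path is the one the new example notebook leads with.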