diff --git a/README.md b/README.md
index 09aa6d1..82d4c99 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,13 @@
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![License MIT](https://img.shields.io/github/license/DS4SD/quackling)](https://opensource.org/licenses/MIT)
-Quackling enables document-native generative AI applications, such as RAG, based on [Docling](https://github.com/DS4SD/docling).
+Easily build document-native generative AI applications, such as RAG, leveraging [Docling](https://github.com/DS4SD/docling)'s efficient PDF extraction and rich data model — while still using your favorite framework, [🦙 LlamaIndex](https://docs.llamaindex.ai/en/stable/) or [🦜🔗 LangChain](https://python.langchain.com/).
## Features
- 🧠 Enables rich gen AI applications by providing capabilities at the native document level — not just plain text / Markdown!
- ⚡️ Leverages Docling's conversion quality and speed.
-- ⚙️ Integrates with standard LLM application frameworks, such as LlamaIndex, for building powerful applications like RAG.
+- ⚙️ Plug-and-play integration with LlamaIndex and LangChain for building powerful applications like RAG.
@@ -40,12 +40,11 @@ pip install quackling
## Usage
-Quackling offers core capabilities (`quackling.core`), as well as framework integration components
-e.g. for LlamaIndex (`quackling.llama_index`). Below you find examples of both.
+Quackling offers core capabilities (`quackling.core`), as well as framework integration components (`quackling.llama_index` and `quackling.langchain`). Below you can find examples of both.
### Basic RAG
-Below you find a basic RAG pipeline using LlamaIndex.
+Here is a basic RAG pipeline using LlamaIndex:
> [!NOTE]
> To use as is, first `pip install llama-index-embeddings-huggingface llama-index-llms-huggingface-api`
@@ -67,7 +66,7 @@ QUESTION = "How many pages were human annotated?"
EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
LLM = HuggingFaceInferenceAPI(
token=os.getenv("HF_TOKEN"),
- model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+ model_name="mistralai/Mistral-7B-Instruct-v0.3",
)
index = VectorStoreIndex.from_documents(
@@ -105,13 +104,18 @@ chunks = list(HierarchicalChunker().chunk(doc))
```
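+
+### Basic pipeline with LangChain
+
+For LangChain, loading and chunking follow the same pattern. Here is a minimal sketch using the new `quackling.langchain` components (see the [LangChain example](examples/langchain/basic_pipeline.ipynb) for a complete RAG pipeline):
+
+```python
+from quackling.langchain.loaders import DoclingPDFLoader
+from quackling.langchain.splitters import HierarchicalJSONSplitter
+
+loader = DoclingPDFLoader(
+    file_path="https://arxiv.org/pdf/2206.01062",  # DocLayNet paper
+    parse_type=DoclingPDFLoader.ParseType.JSON,
+)
+docs = loader.load()
+splits = HierarchicalJSONSplitter().split_documents(docs)
+```
+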
## More examples
-Check out the [examples](examples) — showcasing different variants of RAG incl. vector ingestion & retrieval:
-- [[LlamaIndex] Milvus basic RAG (dense embeddings)](examples/basic_pipeline.ipynb)
-- [[LlamaIndex] Milvus hybrid RAG (dense & sparse embeddings combined e.g. via RRF) & reranker model usage](examples/hybrid_pipeline.ipynb)
-- [[LlamaIndex] Milvus RAG also fetching native document metadata for search results](examples/native_nodes.ipynb)
-- [[LlamaIndex] Local node transformations (e.g. embeddings)](examples/node_transformations.ipynb)
+
+### LlamaIndex
+
+- [Milvus basic RAG (dense embeddings)](examples/llama_index/basic_pipeline.ipynb)
+- [Milvus hybrid RAG (dense & sparse embeddings combined, e.g. via RRF) & reranker model usage](examples/llama_index/hybrid_pipeline.ipynb)
+- [Milvus RAG, also fetching native document metadata for search results](examples/llama_index/native_nodes.ipynb)
+- [Local node transformations (e.g. embeddings)](examples/llama_index/node_transformations.ipynb)
- ...
+
+### LangChain
+- [Milvus basic RAG (dense embeddings)](examples/langchain/basic_pipeline.ipynb)
+
## Contributing
Please read [Contributing to Quackling](./CONTRIBUTING.md) for details.
diff --git a/examples/langchain/basic_pipeline.ipynb b/examples/langchain/basic_pipeline.ipynb
new file mode 100644
index 0000000..9416f4b
--- /dev/null
+++ b/examples/langchain/basic_pipeline.ipynb
@@ -0,0 +1,350 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# requirements for this example:\n",
+ "%pip install -qq \\\n",
+ " quackling \\\n",
+ " python-dotenv \\\n",
+ " langchain-text-splitters \\\n",
+ " langchain-huggingface \\\n",
+ " langchain-milvus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n",
+ "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Loader and splitter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from quackling.langchain.loaders import DoclingPDFLoader"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below we set up:\n",
+ "- a `Loader` which will be used to create LangChain documents, and\n",
+ "- a splitter, which will be used to split these documents"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Using JSON"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To leverage Docling's rich document structure format, we namely set the parse type to JSON and use a `HierarchicalJSONSplitter` accordingly:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "37a35b6883bd444293bae3a589be56e5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Fetching 7 files: 0%| | 0/7 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from quackling.langchain.loaders import DoclingPDFLoader\n",
+ "from quackling.langchain.splitters import HierarchicalJSONSplitter\n",
+ "\n",
+ "loader = DoclingPDFLoader(\n",
+ " file_path=FILE_PATH,\n",
+ " parse_type=DoclingPDFLoader.ParseType.JSON,\n",
+ ")\n",
+ "text_splitter = HierarchicalJSONSplitter()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Using Markdown"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, to just use the flat Markdown export instead of the native document format, one can uncomment and use the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
+ "\n",
+ "# loader = DoclingPDFLoader(\n",
+ "# file_path=FILE_PATH,\n",
+ "# parse_type=DoclingPDFLoader.ParseType.MARKDOWN,\n",
+ "# )\n",
+ "# text_splitter = RecursiveCharacterTextSplitter(\n",
+ "# chunk_size=1000,\n",
+ "# chunk_overlap=200,\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We now used the above-defined objects to get the document splits:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = loader.load()\n",
+ "splits = text_splitter.split_documents(docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
+ "\n",
+ "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n",
+ "embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Vector store"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tempfile import TemporaryDirectory\n",
+ "\n",
+ "from langchain_milvus import Milvus\n",
+ "\n",
+ "MILVUS_URI = os.environ.get(\n",
+ " \"MILVUS_URL\", f\"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db\"\n",
+ ")\n",
+ "\n",
+ "vectorstore = Milvus.from_documents(\n",
+ " splits,\n",
+ " embeddings,\n",
+ " connection_args={\"uri\": MILVUS_URI},\n",
+ " drop_old=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### LLM"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
+ "Token is valid (permission: write).\n",
+ "Your token has been saved to /Users/pva/.cache/huggingface/token\n",
+ "Login successful\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_huggingface import HuggingFaceEndpoint\n",
+ "\n",
+ "HF_API_KEY = os.environ.get(\"HF_API_KEY\")\n",
+ "HF_LLM_MODEL_ID = \"mistralai/Mistral-7B-Instruct-v0.3\"\n",
+ "\n",
+ "llm = HuggingFaceEndpoint(\n",
+ " repo_id=HF_LLM_MODEL_ID,\n",
+ " huggingfacehub_api_token=HF_API_KEY,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## RAG"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import Iterable\n",
+ "\n",
+ "from langchain_core.documents import Document as LCDocument\n",
+ "from langchain_core.output_parsers import StrOutputParser\n",
+ "from langchain_core.prompts import PromptTemplate\n",
+ "from langchain_core.runnables import RunnablePassthrough\n",
+ "\n",
+ "\n",
+ "def format_docs(docs: Iterable[LCDocument]):\n",
+ " return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
+ "\n",
+ "\n",
+ "retriever = vectorstore.as_retriever()\n",
+ "\n",
+ "prompt = PromptTemplate.from_template(\n",
+ " \"Context information is below.\\n---------------------\\n{context}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {question}\\nAnswer:\\n\"\n",
+ ")\n",
+ "\n",
+ "rag_chain = (\n",
+ " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
+ " | prompt\n",
+ " | llm\n",
+ " | StrOutputParser()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The number of pages annotated for DocLayNet is 80,863.'"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rag_chain.invoke(\"How many pages were human annotated for DocLayNet?\")"
+ ]
+  }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/basic_pipeline.ipynb b/examples/llama_index/basic_pipeline.ipynb
similarity index 100%
rename from examples/basic_pipeline.ipynb
rename to examples/llama_index/basic_pipeline.ipynb
diff --git a/examples/hybrid_pipeline.ipynb b/examples/llama_index/hybrid_pipeline.ipynb
similarity index 100%
rename from examples/hybrid_pipeline.ipynb
rename to examples/llama_index/hybrid_pipeline.ipynb
diff --git a/examples/native_nodes.ipynb b/examples/llama_index/native_nodes.ipynb
similarity index 100%
rename from examples/native_nodes.ipynb
rename to examples/llama_index/native_nodes.ipynb
diff --git a/examples/node_transformations.ipynb b/examples/llama_index/node_transformations.ipynb
similarity index 100%
rename from examples/node_transformations.ipynb
rename to examples/llama_index/node_transformations.ipynb
diff --git a/examples/prev_next_augmentation.ipynb b/examples/llama_index/prev_next_augmentation.ipynb
similarity index 100%
rename from examples/prev_next_augmentation.ipynb
rename to examples/llama_index/prev_next_augmentation.ipynb
diff --git a/poetry.lock b/poetry.lock
index 55c4d31..30e18c1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2231,6 +2231,20 @@ files = [
[package.dependencies]
attrs = ">=19.2.0"
+[[package]]
+name = "jsonpatch"
+version = "1.33"
+description = "Apply JSON-Patches (RFC 6902)"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
+files = [
+ {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"},
+ {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"},
+]
+
+[package.dependencies]
+jsonpointer = ">=1.9"
+
[[package]]
name = "jsonpath-ng"
version = "1.6.1"
@@ -2245,6 +2259,17 @@ files = [
[package.dependencies]
ply = "*"
+[[package]]
+name = "jsonpointer"
+version = "3.0.0"
+description = "Identify specific nodes in a JSON document (RFC 6901)"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"},
+ {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"},
+]
+
[[package]]
name = "jsonref"
version = "1.1.0"
@@ -2480,6 +2505,100 @@ files = [
{file = "kiwisolver-1.4.5.tar.gz", hash = "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec"},
]
+[[package]]
+name = "langchain-core"
+version = "0.2.38"
+description = "Building applications with LLMs through composability"
+optional = false
+python-versions = "<4.0,>=3.8.1"
+files = [
+ {file = "langchain_core-0.2.38-py3-none-any.whl", hash = "sha256:8a5729bc7e68b4af089af20eff44fe4e7ca21d0e0c87ec21cef7621981fd1a4a"},
+ {file = "langchain_core-0.2.38.tar.gz", hash = "sha256:eb69dbedd344f2ee1f15bcea6c71a05884b867588fadc42d04632e727c1238f3"},
+]
+
+[package.dependencies]
+jsonpatch = ">=1.33,<2.0"
+langsmith = ">=0.1.75,<0.2.0"
+packaging = ">=23.2,<25"
+pydantic = [
+ {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
+ {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
+]
+PyYAML = ">=5.3"
+tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
+typing-extensions = ">=4.7"
+
+[[package]]
+name = "langchain-huggingface"
+version = "0.0.3"
+description = "An integration package connecting Hugging Face and LangChain"
+optional = true
+python-versions = "<4.0,>=3.8.1"
+files = [
+ {file = "langchain_huggingface-0.0.3-py3-none-any.whl", hash = "sha256:d6827adf3c7c8fcc0bca8c43c7e900c3bf68af9a1532a83d4b8ace137e02887e"},
+ {file = "langchain_huggingface-0.0.3.tar.gz", hash = "sha256:0637acf484c47323cf3dcc46745a93467f6955989af9b7c01e2382fe1b630aaf"},
+]
+
+[package.dependencies]
+huggingface-hub = ">=0.23.0"
+langchain-core = ">=0.1.52,<0.3"
+sentence-transformers = ">=2.6.0"
+tokenizers = ">=0.19.1"
+transformers = ">=4.39.0"
+
+[[package]]
+name = "langchain-milvus"
+version = "0.1.4"
+description = "An integration package connecting Milvus and LangChain"
+optional = true
+python-versions = "<4.0,>=3.8.1"
+files = [
+ {file = "langchain_milvus-0.1.4-py3-none-any.whl", hash = "sha256:f5c1f2d023c6853d1acc22dc8d0b61ca4d99015c1b095b0cf84ec84a9ba2936e"},
+ {file = "langchain_milvus-0.1.4.tar.gz", hash = "sha256:1cd67f127d60c73ffb07cd789705766479137630d43f8ff547c69eee4775dae8"},
+]
+
+[package.dependencies]
+langchain-core = ">=0.2.20,<0.3.0"
+pymilvus = ">=2.4.3,<3.0.0"
+scipy = [
+ {version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
+ {version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""},
+]
+
+[[package]]
+name = "langchain-text-splitters"
+version = "0.2.4"
+description = "LangChain text splitting utilities"
+optional = true
+python-versions = "<4.0,>=3.8.1"
+files = [
+ {file = "langchain_text_splitters-0.2.4-py3-none-any.whl", hash = "sha256:2702dee5b7cbdd595ccbe43b8d38d01a34aa8583f4d6a5a68ad2305ae3e7b645"},
+ {file = "langchain_text_splitters-0.2.4.tar.gz", hash = "sha256:f7daa7a3b0aa8309ce248e2e2b6fc8115be01118d336c7f7f7dfacda0e89bf29"},
+]
+
+[package.dependencies]
+langchain-core = ">=0.2.38,<0.3.0"
+
+[[package]]
+name = "langsmith"
+version = "0.1.116"
+description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
+optional = false
+python-versions = "<4.0,>=3.8.1"
+files = [
+ {file = "langsmith-0.1.116-py3-none-any.whl", hash = "sha256:4b5ea64c81ba5ca309695c85dc3fb4617429a985129ed5d9eca00d1c9d6483f4"},
+ {file = "langsmith-0.1.116.tar.gz", hash = "sha256:5ccd7f5c1840f7c507ab3ee56334a1391de28c8bf72669782e2d82cafeefffa7"},
+]
+
+[package.dependencies]
+httpx = ">=0.23.0,<1"
+orjson = ">=3.9.14,<4.0.0"
+pydantic = [
+ {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
+ {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
+]
+requests = ">=2,<3"
+
[[package]]
name = "lazy-loader"
version = "0.4"
@@ -3706,6 +3825,72 @@ numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
+[[package]]
+name = "orjson"
+version = "3.10.7"
+description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "orjson-3.10.7-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:74f4544f5a6405b90da8ea724d15ac9c36da4d72a738c64685003337401f5c12"},
+ {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34a566f22c28222b08875b18b0dfbf8a947e69df21a9ed5c51a6bf91cfb944ac"},
+ {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bf6ba8ebc8ef5792e2337fb0419f8009729335bb400ece005606336b7fd7bab7"},
+ {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac7cf6222b29fbda9e3a472b41e6a5538b48f2c8f99261eecd60aafbdb60690c"},
+ {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de817e2f5fc75a9e7dd350c4b0f54617b280e26d1631811a43e7e968fa71e3e9"},
+ {file = "orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:348bdd16b32556cf8d7257b17cf2bdb7ab7976af4af41ebe79f9796c218f7e91"},
+ {file = "orjson-3.10.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:479fd0844ddc3ca77e0fd99644c7fe2de8e8be1efcd57705b5c92e5186e8a250"},
+ {file = "orjson-3.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fdf5197a21dd660cf19dfd2a3ce79574588f8f5e2dbf21bda9ee2d2b46924d84"},
+ {file = "orjson-3.10.7-cp310-none-win32.whl", hash = "sha256:d374d36726746c81a49f3ff8daa2898dccab6596864ebe43d50733275c629175"},
+ {file = "orjson-3.10.7-cp310-none-win_amd64.whl", hash = "sha256:cb61938aec8b0ffb6eef484d480188a1777e67b05d58e41b435c74b9d84e0b9c"},
+ {file = "orjson-3.10.7-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7db8539039698ddfb9a524b4dd19508256107568cdad24f3682d5773e60504a2"},
+ {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:480f455222cb7a1dea35c57a67578848537d2602b46c464472c995297117fa09"},
+ {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a9c9b168b3a19e37fe2778c0003359f07822c90fdff8f98d9d2a91b3144d8e0"},
+ {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8de062de550f63185e4c1c54151bdddfc5625e37daf0aa1e75d2a1293e3b7d9a"},
+ {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6b0dd04483499d1de9c8f6203f8975caf17a6000b9c0c54630cef02e44ee624e"},
+ {file = "orjson-3.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b58d3795dafa334fc8fd46f7c5dc013e6ad06fd5b9a4cc98cb1456e7d3558bd6"},
+ {file = "orjson-3.10.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:33cfb96c24034a878d83d1a9415799a73dc77480e6c40417e5dda0710d559ee6"},
+ {file = "orjson-3.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e724cebe1fadc2b23c6f7415bad5ee6239e00a69f30ee423f319c6af70e2a5c0"},
+ {file = "orjson-3.10.7-cp311-none-win32.whl", hash = "sha256:82763b46053727a7168d29c772ed5c870fdae2f61aa8a25994c7984a19b1021f"},
+ {file = "orjson-3.10.7-cp311-none-win_amd64.whl", hash = "sha256:eb8d384a24778abf29afb8e41d68fdd9a156cf6e5390c04cc07bbc24b89e98b5"},
+ {file = "orjson-3.10.7-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:44a96f2d4c3af51bfac6bc4ef7b182aa33f2f054fd7f34cc0ee9a320d051d41f"},
+ {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76ac14cd57df0572453543f8f2575e2d01ae9e790c21f57627803f5e79b0d3c3"},
+ {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bdbb61dcc365dd9be94e8f7df91975edc9364d6a78c8f7adb69c1cdff318ec93"},
+ {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b48b3db6bb6e0a08fa8c83b47bc169623f801e5cc4f24442ab2b6617da3b5313"},
+ {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23820a1563a1d386414fef15c249040042b8e5d07b40ab3fe3efbfbbcbcb8864"},
+ {file = "orjson-3.10.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0c6a008e91d10a2564edbb6ee5069a9e66df3fbe11c9a005cb411f441fd2c09"},
+ {file = "orjson-3.10.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d352ee8ac1926d6193f602cbe36b1643bbd1bbcb25e3c1a657a4390f3000c9a5"},
+ {file = "orjson-3.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d2d9f990623f15c0ae7ac608103c33dfe1486d2ed974ac3f40b693bad1a22a7b"},
+ {file = "orjson-3.10.7-cp312-none-win32.whl", hash = "sha256:7c4c17f8157bd520cdb7195f75ddbd31671997cbe10aee559c2d613592e7d7eb"},
+ {file = "orjson-3.10.7-cp312-none-win_amd64.whl", hash = "sha256:1d9c0e733e02ada3ed6098a10a8ee0052dd55774de3d9110d29868d24b17faa1"},
+ {file = "orjson-3.10.7-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:77d325ed866876c0fa6492598ec01fe30e803272a6e8b10e992288b009cbe149"},
+ {file = "orjson-3.10.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ea2c232deedcb605e853ae1db2cc94f7390ac776743b699b50b071b02bea6fe"},
+ {file = "orjson-3.10.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3dcfbede6737fdbef3ce9c37af3fb6142e8e1ebc10336daa05872bfb1d87839c"},
+ {file = "orjson-3.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11748c135f281203f4ee695b7f80bb1358a82a63905f9f0b794769483ea854ad"},
+ {file = "orjson-3.10.7-cp313-none-win32.whl", hash = "sha256:a7e19150d215c7a13f39eb787d84db274298d3f83d85463e61d277bbd7f401d2"},
+ {file = "orjson-3.10.7-cp313-none-win_amd64.whl", hash = "sha256:eef44224729e9525d5261cc8d28d6b11cafc90e6bd0be2157bde69a52ec83024"},
+ {file = "orjson-3.10.7-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6ea2b2258eff652c82652d5e0f02bd5e0463a6a52abb78e49ac288827aaa1469"},
+ {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:430ee4d85841e1483d487e7b81401785a5dfd69db5de01314538f31f8fbf7ee1"},
+ {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b6146e439af4c2472c56f8540d799a67a81226e11992008cb47e1267a9b3225"},
+ {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:084e537806b458911137f76097e53ce7bf5806dda33ddf6aaa66a028f8d43a23"},
+ {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4829cf2195838e3f93b70fd3b4292156fc5e097aac3739859ac0dcc722b27ac0"},
+ {file = "orjson-3.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1193b2416cbad1a769f868b1749535d5da47626ac29445803dae7cc64b3f5c98"},
+ {file = "orjson-3.10.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:4e6c3da13e5a57e4b3dca2de059f243ebec705857522f188f0180ae88badd354"},
+ {file = "orjson-3.10.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c31008598424dfbe52ce8c5b47e0752dca918a4fdc4a2a32004efd9fab41d866"},
+ {file = "orjson-3.10.7-cp38-none-win32.whl", hash = "sha256:7122a99831f9e7fe977dc45784d3b2edc821c172d545e6420c375e5a935f5a1c"},
+ {file = "orjson-3.10.7-cp38-none-win_amd64.whl", hash = "sha256:a763bc0e58504cc803739e7df040685816145a6f3c8a589787084b54ebc9f16e"},
+ {file = "orjson-3.10.7-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e76be12658a6fa376fcd331b1ea4e58f5a06fd0220653450f0d415b8fd0fbe20"},
+ {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed350d6978d28b92939bfeb1a0570c523f6170efc3f0a0ef1f1df287cd4f4960"},
+ {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:144888c76f8520e39bfa121b31fd637e18d4cc2f115727865fdf9fa325b10412"},
+ {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09b2d92fd95ad2402188cf51573acde57eb269eddabaa60f69ea0d733e789fe9"},
+ {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b24a579123fa884f3a3caadaed7b75eb5715ee2b17ab5c66ac97d29b18fe57f"},
+ {file = "orjson-3.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591bcfe7512353bd609875ab38050efe3d55e18934e2f18950c108334b4ff"},
+ {file = "orjson-3.10.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f4db56635b58cd1a200b0a23744ff44206ee6aa428185e2b6c4a65b3197abdcd"},
+ {file = "orjson-3.10.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0fa5886854673222618638c6df7718ea7fe2f3f2384c452c9ccedc70b4a510a5"},
+ {file = "orjson-3.10.7-cp39-none-win32.whl", hash = "sha256:8272527d08450ab16eb405f47e0f4ef0e5ff5981c3d82afe0efd25dcbef2bcd2"},
+ {file = "orjson-3.10.7-cp39-none-win_amd64.whl", hash = "sha256:974683d4618c0c7dbf4f69c95a979734bf183d0658611760017f6e70a145af58"},
+ {file = "orjson-3.10.7.tar.gz", hash = "sha256:75ef0640403f945f3a1f9f6400686560dbfb0fb5b16589ad62cd477043c4eee3"},
+]
+
[[package]]
name = "packaging"
version = "24.1"
@@ -7480,9 +7665,9 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
type = ["pytest-mypy"]
[extras]
-examples = ["flagembedding", "jsonpath-ng", "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-postprocessor-flag-embedding-reranker", "llama-index-vector-stores-milvus", "peft", "python-dotenv"]
+examples = ["flagembedding", "jsonpath-ng", "langchain-huggingface", "langchain-milvus", "langchain-text-splitters", "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-postprocessor-flag-embedding-reranker", "llama-index-vector-stores-milvus", "peft", "python-dotenv"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
-content-hash = "9341adde375afd06a61daae54b8b87383fb98868f8e5b2c7f32c9d3cb3bcf897"
+content-hash = "417e7168718425cb9f2c3d487329d75c7ffbd47733ce88861209f0ac5f61d509"
diff --git a/pyproject.toml b/pyproject.toml
index 1eaf8c8..ca1576a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ python = "^3.10"
docling-core = "^1.1.2"
llama-index-core = "^0.11.1"
+langchain-core = "^0.2.38"
docling = "^1.8.2"
#########
@@ -49,6 +50,9 @@ llama-index-postprocessor-flag-embedding-reranker = {version = "^0.2.0", option
flagembedding = { version = "^1.2.10", optional = true }
peft = { version = "^0.12.0", optional = true } # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
jsonpath-ng = { version = "^1.6.1", optional = true }
+langchain-huggingface = { version = "^0.0.3", optional = true }
+langchain-milvus = { version = "^0.1.4", optional = true }
+langchain-text-splitters = { version = "^0.2.4", optional = true }
##############
# constraints:
@@ -65,6 +69,7 @@ torchvision = [
[tool.poetry.extras]
examples = [
"python-dotenv",
+ # LlamaIndex examples:
"llama-index-embeddings-huggingface",
"llama-index-llms-huggingface-api",
"llama-index-vector-stores-milvus",
@@ -72,6 +77,10 @@ examples = [
"flagembedding",
"peft", # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
"jsonpath-ng",
+ # LangChain examples:
+ "langchain-huggingface",
+ "langchain-milvus",
+ "langchain-text-splitters",
]
[tool.poetry.group.dev.dependencies]
diff --git a/quackling/langchain/__init__.py b/quackling/langchain/__init__.py
new file mode 100644
index 0000000..bacc58a
--- /dev/null
+++ b/quackling/langchain/__init__.py
@@ -0,0 +1,4 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
diff --git a/quackling/langchain/loaders/__init__.py b/quackling/langchain/loaders/__init__.py
new file mode 100644
index 0000000..31e2d18
--- /dev/null
+++ b/quackling/langchain/loaders/__init__.py
@@ -0,0 +1,6 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+from quackling.langchain.loaders.docling_pdf_loader import DoclingPDFLoader # noqa
diff --git a/quackling/langchain/loaders/base.py b/quackling/langchain/loaders/base.py
new file mode 100644
index 0000000..aa756bd
--- /dev/null
+++ b/quackling/langchain/loaders/base.py
@@ -0,0 +1,43 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+from enum import Enum
+
+from docling.document_converter import DocumentConverter
+from docling_core.types import Document as DLDocument
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document as LCDocument
+from pydantic import BaseModel
+
+
+class DocumentMetadata(BaseModel):
+ dl_doc_hash: str
+ # source: str
+
+
+class BaseDoclingLoader(BaseLoader):
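+    """Base class for Docling-based LangChain loaders.
+
+    Each input file is converted with Docling's `DocumentConverter` and exported
+    either as Markdown or as lossless Docling JSON, depending on `parse_type`.
+    """
+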
+ class ParseType(str, Enum):
+ MARKDOWN = "markdown"
+ JSON = "json"
+
+ def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:
+ self._file_paths = file_path if isinstance(file_path, list) else [file_path]
+ self._parse_type = parse_type
+ self._converter = DocumentConverter()
+
+ def _create_lc_doc_from_dl_doc(self, dl_doc: DLDocument) -> LCDocument:
+ if self._parse_type == self.ParseType.MARKDOWN:
+ text = dl_doc.export_to_markdown()
+ elif self._parse_type == self.ParseType.JSON:
+ text = dl_doc.model_dump_json()
+ else:
+ raise RuntimeError(f"Unexpected parse type encountered: {self._parse_type}")
+ lc_doc = LCDocument(
+ page_content=text,
+ metadata=DocumentMetadata(
+ dl_doc_hash=dl_doc.file_info.document_hash,
+ ).model_dump(),
+ )
+ return lc_doc
diff --git a/quackling/langchain/loaders/docling_pdf_loader.py b/quackling/langchain/loaders/docling_pdf_loader.py
new file mode 100644
index 0000000..70a4eec
--- /dev/null
+++ b/quackling/langchain/loaders/docling_pdf_loader.py
@@ -0,0 +1,19 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+from typing import Iterator
+
+from langchain_core.documents import Document as LCDocument
+
+from quackling.langchain.loaders.base import BaseDoclingLoader
+
+
+class DoclingPDFLoader(BaseDoclingLoader):
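+    """Loads PDFs via Docling and lazily yields them as LangChain documents."""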
+
+ def lazy_load(self) -> Iterator[LCDocument]:
+ for source in self._file_paths:
+ dl_doc = self._converter.convert_single(source).output
+ lc_doc = self._create_lc_doc_from_dl_doc(dl_doc=dl_doc)
+ yield lc_doc
diff --git a/quackling/langchain/splitters/__init__.py b/quackling/langchain/splitters/__init__.py
new file mode 100644
index 0000000..2ea44be
--- /dev/null
+++ b/quackling/langchain/splitters/__init__.py
@@ -0,0 +1,8 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+from quackling.langchain.splitters.hier_json_splitter import ( # noqa
+ HierarchicalJSONSplitter,
+)
diff --git a/quackling/langchain/splitters/hier_json_splitter.py b/quackling/langchain/splitters/hier_json_splitter.py
new file mode 100644
index 0000000..d4ff65e
--- /dev/null
+++ b/quackling/langchain/splitters/hier_json_splitter.py
@@ -0,0 +1,48 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+from typing import Iterable
+
+from docling_core.types import Document as DLDocument
+from langchain_core.documents import Document as LCDocument
+from pydantic import BaseModel
+
+from quackling.core.chunkers.base import BaseChunker
+from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
+
+
+class ChunkDocMetadata(BaseModel):
+ dl_doc_id: str
+ path: str
+
+
+class HierarchicalJSONSplitter:
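+    """Splits documents whose content is Docling JSON into hierarchy-aware chunks."""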
+
+ def __init__(
+ self,
+ chunker: BaseChunker | None = None,
+ ) -> None:
+ self.chunker: BaseChunker = chunker or HierarchicalChunker()
+
+    def split_documents(self, documents: Iterable[LCDocument]) -> list[LCDocument]:
+ all_chunk_docs: list[LCDocument] = []
+ for doc in documents:
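+            # each input document is expected to carry a Docling JSON export
+            # as its page_content (i.e. loaded with ParseType.JSON)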
+ lc_doc: LCDocument = LCDocument.parse_obj(doc)
+ dl_doc: DLDocument = DLDocument.model_validate_json(lc_doc.page_content)
+ chunk_iter = self.chunker.chunk(dl_doc=dl_doc)
+ chunk_docs = [
+ LCDocument(
+ page_content=chunk.text,
+ metadata=ChunkDocMetadata(
+ dl_doc_id=dl_doc.file_info.document_hash,
+ path=chunk.path,
+ ).model_dump(),
+ )
+ for chunk in chunk_iter
+ ]
+ all_chunk_docs.extend(chunk_docs)
+
+ return all_chunk_docs