From 38401512bfde3932e989babadc3cde7f489b6018 Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Tue, 24 Sep 2024 15:55:45 -0400 Subject: [PATCH] Converting docs --- .github/workflows/jekyll.yml | 62 ++++ README.md | 31 +- docs/.gitignore | 5 + docs/Gemfile | 32 ++ docs/Gemfile.lock | 286 ++++++++++++++++++ docs/_config.yml | 20 ++ .../README.md => docs/embedding.md | 56 ++-- docs/index.md | 41 +++ docs/rag-examples/rag-examples.md | 10 + .../rag-examples/rag-java.md | 59 ++-- .../rag-examples/rag-javascript.md | 25 +- .../rag-examples/rag-python.md | 69 +++-- setup/README.md => docs/setup.md | 13 +- .../README.md => docs/splitting.md | 47 +-- 14 files changed, 623 insertions(+), 133 deletions(-) create mode 100644 .github/workflows/jekyll.yml create mode 100644 docs/.gitignore create mode 100644 docs/Gemfile create mode 100644 docs/Gemfile.lock create mode 100644 docs/_config.yml rename embedding-langchain-java/README.md => docs/embedding.md (70%) create mode 100644 docs/index.md create mode 100644 docs/rag-examples/rag-examples.md rename rag-langchain-java/README.md => docs/rag-examples/rag-java.md (86%) rename rag-langchain-js/README.md => docs/rag-examples/rag-javascript.md (88%) rename rag-langchain-python/README.md => docs/rag-examples/rag-python.md (78%) rename setup/README.md => docs/setup.md (88%) rename splitting-langchain-java/README.md => docs/splitting.md (86%) diff --git a/.github/workflows/jekyll.yml b/.github/workflows/jekyll.yml new file mode 100644 index 0000000..424bd9e --- /dev/null +++ b/.github/workflows/jekyll.yml @@ -0,0 +1,62 @@ +# Started with: https://github.com/actions/starter-workflows/blob/main/pages/jekyll.yml . +# Have to use a custom Jekyll workflow as the default GitHub Jekyll workflow does not whitelist the +# "jekyll-tabs" plugin that lets us show code tabs in the documentation. 
+ +name: Deploy Jekyll site to Pages + +on: + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Ruby and install Gemfile bundles + uses: ruby/setup-ruby@8575951200e472d5f2d95c625da0c7bec8217c42 # v1.161.0 + with: + ruby-version: '3.1' # Not needed with a .ruby-version file + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + working-directory: ./docs + cache-version: 0 # Increment this number if you need to re-download cached gems + - name: Setup Pages + id: pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + working-directory: ./docs + # Outputs to the './_site' directory by default + run: bundle exec jekyll build --verbose --baseurl "${{ steps.pages.outputs.base_path }}" + env: + JEKYLL_ENV: production + - name: Upload artifact + # Automatically uploads an artifact from the './_site' directory by default + uses: actions/upload-pages-artifact@v3 + with: + path: "./docs/_site" + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/README.md b/README.md index bf44fa5..c1f949e 100644 --- a/README.md +++ b/README.md @@ -4,33 +4,4 @@ This repository contains a set of examples for demonstrating common AI use cases MarkLogic. These examples are intended to serve as a starting point for your own applications; you are encouraged to copy and modify the code as needed. -The examples in this repository depend on the -[Azure OpenAI Service](https://azure.microsoft.com/en-us/products/ai-services/openai-service). They can be easily -tailored to work with any LLM supported by the LLM framework used by each example. Note though that if you wish to -execute these examples as-is, you will need an Azure OpenAI account and API key. - -## Setup - -If you would like to try out the example programs, please [follow these instructions](setup/README.md). - -## RAG Examples - -MarkLogic excels at supporting RAG, or ["Retrieval-Augmented Generation"](https://python.langchain.com/docs/tutorials/rag/), -via its schema-agnostic nature as well as it's powerful and flexible indexing. This repository contains the following -examples of RAG with MarkLogic: - -- The [rag-langchain-python](rag-langchain-python/README.md) project demonstrates RAG with Python, langchain, and MarkLogic. -- The [rag-langchain-java](rag-langchain-java/README.md) project demonstrates RAG with Java, langchain4j, and MarkLogic. -- The [rag-langchain-js](rag-langchain-js/README.md) project demonstrates RAG with JavaScript, langchain.js, and MarkLogic. - -## Splitting / Chunking Examples - -A RAG approach typically benefits from sending multiple smaller segments or "chunks" of text to an LLM. Please -see [this guide on splitting documents](splitting-langchain-java/README.md) for more information on how to split -your documents and why you may wish to do so. 
- -## Embedding examples - -To utilize the vector queries shown in the RAG Examples listed above, embeddings - vector representations of text - -should be added to your documents in MarkLogic. -See [this guide on adding embeddings](embedding-langchain-java/README.md) for more information. +For more information, please see [the user guide](TODO This will be changed once the docs are being published). diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..f40fbd8 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,5 @@ +_site +.sass-cache +.jekyll-cache +.jekyll-metadata +vendor diff --git a/docs/Gemfile b/docs/Gemfile new file mode 100644 index 0000000..9a3fa31 --- /dev/null +++ b/docs/Gemfile @@ -0,0 +1,32 @@ +source "https://rubygems.org" +# Hello! This is where you manage which Jekyll version is used to run. +# When you want to use a different version, change it below, save the +# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: +# +# bundle exec jekyll serve +# +# This will help ensure the proper Jekyll version is running. +# Happy Jekylling! + +gem "github-pages", "~> 229", group: :jekyll_plugins + +gem "webrick" + +# If you have any plugins, put them here! +group :jekyll_plugins do + gem "jekyll-tabs" +end + +# Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem +# and associated library. +platforms :mingw, :x64_mingw, :mswin, :jruby do + gem "tzinfo", ">= 1", "< 3" + gem "tzinfo-data" +end + +# Performance-booster for watching directories on Windows +gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] + +# Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem +# do not have a Java counterpart. +gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby] diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock new file mode 100644 index 0000000..7c98b46 --- /dev/null +++ b/docs/Gemfile.lock @@ -0,0 +1,286 @@ +GEM + remote: https://rubygems.org/ + specs: + activesupport (7.1.3) + base64 + bigdecimal + concurrent-ruby (~> 1.0, >= 1.0.2) + connection_pool (>= 2.2.5) + drb + i18n (>= 1.6, < 2) + minitest (>= 5.1) + mutex_m + tzinfo (~> 2.0) + addressable (2.8.6) + public_suffix (>= 2.0.2, < 6.0) + base64 (0.2.0) + bigdecimal (3.1.6) + coffee-script (2.4.1) + coffee-script-source + execjs + coffee-script-source (1.12.2) + colorator (1.1.0) + commonmarker (0.23.10) + concurrent-ruby (1.2.3) + connection_pool (2.4.1) + dnsruby (1.70.0) + simpleidn (~> 0.2.1) + drb (2.2.0) + ruby2_keywords + em-websocket (0.5.3) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0) + ethon (0.16.0) + ffi (>= 1.15.0) + eventmachine (1.2.7) + execjs (2.9.1) + faraday (2.9.0) + faraday-net_http (>= 2.0, < 3.2) + faraday-net_http (3.1.0) + net-http + ffi (1.16.3) + forwardable-extended (2.6.0) + gemoji (4.1.0) + github-pages (229) + github-pages-health-check (= 1.18.2) + jekyll (= 3.9.4) + jekyll-avatar (= 0.8.0) + jekyll-coffeescript (= 1.2.2) + jekyll-commonmark-ghpages (= 0.4.0) + jekyll-default-layout (= 0.1.5) + jekyll-feed (= 0.17.0) + jekyll-gist (= 1.5.0) + jekyll-github-metadata (= 2.16.1) + jekyll-include-cache (= 0.2.1) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) + jekyll-paginate (= 1.1.0) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.7.0) + jekyll-remote-theme (= 0.4.3) + jekyll-sass-converter (= 1.5.2) + jekyll-seo-tag (= 2.8.0) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.2.0) + 
jekyll-theme-cayman (= 0.2.0) + jekyll-theme-dinky (= 0.2.0) + jekyll-theme-hacker (= 0.2.0) + jekyll-theme-leap-day (= 0.2.0) + jekyll-theme-merlot (= 0.2.0) + jekyll-theme-midnight (= 0.2.0) + jekyll-theme-minimal (= 0.2.0) + jekyll-theme-modernist (= 0.2.0) + jekyll-theme-primer (= 0.6.0) + jekyll-theme-slate (= 0.2.0) + jekyll-theme-tactile (= 0.2.0) + jekyll-theme-time-machine (= 0.2.0) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.13.0) + kramdown (= 2.4.0) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.4) + mercenary (~> 0.3) + minima (= 2.5.1) + nokogiri (>= 1.13.6, < 2.0) + rouge (= 3.30.0) + terminal-table (~> 1.4) + github-pages-health-check (1.18.2) + addressable (~> 2.3) + dnsruby (~> 1.60) + octokit (>= 4, < 8) + public_suffix (>= 3.0, < 6.0) + typhoeus (~> 1.3) + html-pipeline (2.14.3) + activesupport (>= 2) + nokogiri (>= 1.4) + http_parser.rb (0.8.0) + i18n (1.14.1) + concurrent-ruby (~> 1.0) + jekyll (3.9.4) + addressable (~> 2.4) + colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (>= 0.7, < 2) + jekyll-sass-converter (~> 1.0) + jekyll-watch (~> 2.0) + kramdown (>= 1.17, < 3) + liquid (~> 4.0) + mercenary (~> 0.3.3) + pathutil (~> 0.9) + rouge (>= 1.7, < 4) + safe_yaml (~> 1.0) + jekyll-avatar (0.8.0) + jekyll (>= 3.0, < 5.0) + jekyll-coffeescript (1.2.2) + coffee-script (~> 2.2) + coffee-script-source (~> 1.12) + jekyll-commonmark (1.4.0) + commonmarker (~> 0.22) + jekyll-commonmark-ghpages (0.4.0) + commonmarker (~> 0.23.7) + jekyll (~> 3.9.0) + jekyll-commonmark (~> 1.4.0) + rouge (>= 2.0, < 5.0) + jekyll-default-layout (0.1.5) + jekyll (>= 3.0, < 5.0) + jekyll-feed (0.17.0) + jekyll (>= 3.7, < 5.0) + jekyll-gist (1.5.0) + octokit (~> 4.2) + jekyll-github-metadata (2.16.1) + jekyll (>= 3.4, < 5.0) + octokit (>= 4, < 7, != 4.4.0) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-mentions (1.6.0) + html-pipeline (~> 2.3) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) + jekyll-paginate (1.1.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.7.0) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.3) + addressable (~> 2.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) + jekyll-sass-converter (1.5.2) + sass (~> 3.4) + jekyll-seo-tag (2.8.0) + jekyll (>= 3.8, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-tabs (1.2.1) + jekyll (>= 3.0, < 5.0) + jekyll-theme-architect (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-cayman (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-dinky (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-hacker (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-leap-day (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-merlot (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-midnight (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-minimal (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-modernist (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-primer (0.6.0) + jekyll (> 3.5, < 5.0) + jekyll-github-metadata (~> 2.9) + jekyll-seo-tag (~> 2.0) + jekyll-theme-slate (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-tactile (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag 
(~> 2.0) + jekyll-theme-time-machine (0.2.0) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + jemoji (0.13.0) + gemoji (>= 3, < 5) + html-pipeline (~> 2.2) + jekyll (>= 3.0, < 5.0) + kramdown (2.4.0) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.4) + listen (3.8.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + mercenary (0.3.6) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) + jekyll-feed (~> 0.9) + jekyll-seo-tag (~> 2.1) + minitest (5.22.2) + mutex_m (0.2.0) + net-http (0.4.1) + uri + nokogiri (1.16.5-arm64-darwin) + racc (~> 1.4) + nokogiri (1.16.5-x86_64-linux) + racc (~> 1.4) + octokit (4.25.1) + faraday (>= 1, < 3) + sawyer (~> 0.9) + pathutil (0.16.2) + forwardable-extended (~> 2.6) + public_suffix (5.0.4) + racc (1.8.0) + rb-fsevent (0.11.2) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.3.6) + strscan + rouge (3.30.0) + ruby2_keywords (0.0.5) + rubyzip (2.3.2) + safe_yaml (1.0.5) + sass (3.7.4) + sass-listen (~> 4.0.0) + sass-listen (4.0.0) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + sawyer (0.9.2) + addressable (>= 2.3.5) + faraday (>= 0.17.3, < 3) + simpleidn (0.2.1) + unf (~> 0.1.4) + strscan (3.1.0) + terminal-table (1.8.0) + unicode-display_width (~> 1.1, >= 1.1.1) + typhoeus (1.4.1) + ethon (>= 0.9.0) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.9.1) + unicode-display_width (1.8.0) + uri (0.13.0) + webrick (1.8.1) + +PLATFORMS + arm64-darwin-23 + x86_64-linux + +DEPENDENCIES + github-pages (~> 229) + http_parser.rb (~> 0.6.0) + jekyll-tabs + tzinfo (>= 1, < 3) + tzinfo-data + wdm (~> 0.1.1) + webrick + +BUNDLED WITH + 2.4.7 diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..31572f8 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,20 @@ +title: MarkLogic AI Examples +repository: marklogic/marklogic-ai-examples +remote_theme: just-the-docs/just-the-docs +plugins: + - jekyll-remote-theme + - jekyll-tabs # See https://github.com/Ovski4/jekyll-tabs. + +heading_anchors: true + +logo: "/assets/ProgressMarkLogic_PrimaryLogo_Stacked.svg" + +# Aux links for the upper right navigation +aux_links: + "marklogic/marklogic-ai-examples": + - "https://github.com/marklogic/marklogic-ai-examples" + +# Makes Aux links open in a new tab. Default is false +aux_links_new_tab: false + +enable_copy_code_button: true diff --git a/embedding-langchain-java/README.md b/docs/embedding.md similarity index 70% rename from embedding-langchain-java/README.md rename to docs/embedding.md index 585e18d..ee5696d 100644 --- a/embedding-langchain-java/README.md +++ b/docs/embedding.md @@ -1,21 +1,33 @@ -# Adding embeddings with langchain4j +--- +layout: default +title: Embedding Examples +nav_order: 5 +--- -The vector queries shown in the [langchain](../rag-langchain-python/README.md), +## Table of contents +{: .no_toc .text-delta } + +- TOC +{:toc} + +## Adding embeddings with langchain4j + +The vector queries shown in the [langchain](../rag-langchain-python/README.md), [langchain4j](../rag-langchain-java), and [langchain.js](../rag-langchain-js/README.md) RAG examples -depend on embeddings - vector representations of text - being added to documents in MarkLogic. Vector queries can -then be implemented using [the new vector functions](https://docs.marklogic.com/12.0/js/vec) in MarkLogic 12. 
-This project demonstrates the use of a
-[langchain4j in-process embedding model](https://docs.langchain4j.dev/integrations/embedding-models/in-process) and
-the [MarkLogic Data Movement SDK](https://docs.marklogic.com/guide/java/data-movement) for adding embeddings to
+documents in MarkLogic.
+This project demonstrates the use of a
+[langchain4j in-process embedding model](https://docs.langchain4j.dev/integrations/embedding-models/in-process) and
+the [MarkLogic Data Movement SDK](https://docs.marklogic.com/guide/java/data-movement) for adding embeddings to
documents in MarkLogic.

## Setup

-This example depends both on the [main setup for all examples](../setup/README.md) and also on having run the
-"Split to multiple documents" example program in the
+This example depends both on the [main setup for all examples](setup.md) and also on having run the
+"Split to multiple documents" example program in the
[document splitting examples](splitting.md). That example program used langchain4j to split
the text in Enron email documents and write each chunk of text to a separate document. This example will then use
-langchain4j to generate an embedding for the chunk of text and add it to each chunk document.
+langchain4j to generate an embedding for the chunk of text and add it to each chunk document.

## Add embeddings example

@@ -23,15 +35,15 @@ To try the embedding example, run the following Gradle task:

../gradlew addEmbeddings

-After the task completes, each document in the `enron-chunk` collection will now have an `embedding` field
-consisting of an array of floating point numbers. Each document will also have been added to the
-`enron-chunk-with-embedding` collection.
+After the task completes, each document in the `enron-chunk` collection will now have an `embedding` field
+consisting of an array of floating point numbers. Each document will also have been added to the
+`enron-chunk-with-embedding` collection.

-As a next step, you would likely create a [MarkLogic TDE view](https://docs.marklogic.com/guide/app-dev/TDE) that
+As a next step, you would likely create a [MarkLogic TDE view](https://docs.marklogic.com/guide/app-dev/TDE) that
allows you to use the [MarkLogic Optic API](https://docs.marklogic.com/guide/app-dev/OpticAPI) for querying for rows
-with similar embeddings. This is the exact approach used in the vector queries for each of the RAG examples mentioned
-above. Your TDE could look like the one shown below. Note that the value of `dimension` for the `embedding` column
-must match that of the embedding model that you used. In this example, the langchain4j in-process embedding model
+with similar embeddings. This is the exact approach used in the vector queries for each of the RAG examples mentioned
+above. Your TDE could look like the one shown below. Note that the value of `dimension` for the `embedding` column
+must match that of the embedding model that you used. In this example, the langchain4j in-process embedding model
requires a value of 384 for the `dimension` column.

```
}
```

-When performing a vector query with MarkLogic, you need to ensure that the embedding that you compare to the values
+When performing a vector query with MarkLogic, you need to ensure that the embedding that you compare to the values
in the `vector` column defined in your TDE has the same dimension value. Otherwise, MarkLogic will throw an
-`XDMP-DIMMISMATCH` error. For example, since an in-process langchain4j embedding model is used in this example program,
-you would want to use the same embedding model to generate an embedding of a user's chat question. If you wished to
-use an [Azure OpenAI embedding model](https://docs.langchain4j.dev/integrations/embedding-models/azure-open-ai)
-in the above example program, you would then need to use the same embedding model when generating an embedding of a
+`XDMP-DIMMISMATCH` error. For example, since an in-process langchain4j embedding model is used in this example program,
+you would want to use the same embedding model to generate an embedding of a user's chat question. If you wished to
+use an [Azure OpenAI embedding model](https://docs.langchain4j.dev/integrations/embedding-models/azure-open-ai)
+in the above example program, you would then need to use the same embedding model when generating an embedding of a
user's chat question.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..e382da5
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,41 @@
+---
+layout: default
+title: Overview
+nav_order: 1
+---
+
+This repository contains a set of examples for demonstrating common AI use cases for applications built on top of
+MarkLogic. These examples are intended to serve as a starting point for your own applications; you are encouraged to
+copy and modify the code as needed.
+
+The examples in this repository depend on the
+[Azure OpenAI Service](https://azure.microsoft.com/en-us/products/ai-services/openai-service). They can be easily
+tailored to work with any LLM supported by the LLM framework used by each example. Note though that if you wish to
+execute these examples as-is, you will need an Azure OpenAI account and API key.
+
+## Setup
+
+If you would like to try out the example programs, please [follow these instructions](setup.md).
+
+## RAG Examples
+
+MarkLogic excels at supporting RAG, or ["Retrieval-Augmented Generation"](https://python.langchain.com/docs/tutorials/rag/),
+via its schema-agnostic nature as well as its powerful and flexible indexing. This repository contains the following
+examples of RAG with MarkLogic:
+
+- The [rag-langchain-python](rag-examples/rag-python.md) project demonstrates RAG with Python, langchain, and MarkLogic.
+- The [rag-langchain-java](rag-examples/rag-java.md) project demonstrates RAG with Java, langchain4j, and MarkLogic.
+- The [rag-langchain-js](rag-examples/rag-javascript.md) project demonstrates RAG with JavaScript, langchain.js, and MarkLogic.
+
+## Splitting / Chunking Examples
+
+A RAG approach typically benefits from sending multiple smaller segments or "chunks" of text to an LLM. Please
+see [this guide on splitting documents](splitting.md) for more information on how to split
+your documents and why you may wish to do so.
+
+## Embedding examples
+
+To utilize the vector queries shown in the RAG Examples listed above, embeddings - vector representations of text -
+should be added to your documents in MarkLogic.
+See [this guide on adding embeddings](embedding.md) for more information.
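+To give a quick sense of what an embedding is, the sketch below uses the langchain4j in-process embedding model that
+the embedding guide relies on. It is an illustrative assumption - not code from the example programs - and it assumes
+the `langchain4j-embeddings-all-minilm-l6-v2` dependency is on the classpath:
+
+```java
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.model.embedding.AllMiniLmL6V2EmbeddingModel;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+
+public class EmbeddingSketch {
+
+    public static void main(String[] args) {
+        // In-process model; runs locally and requires no API key.
+        EmbeddingModel model = new AllMiniLmL6V2EmbeddingModel();
+
+        // Convert a chat question into a vector of floating point numbers.
+        Embedding embedding = model.embed("What disturbances has Jane Doe caused?").content();
+
+        // This model produces 384-dimension vectors, which is why the TDE shown in
+        // the embedding guide declares a dimension of 384.
+        System.out.println("Dimensions: " + embedding.vector().length);
+    }
+}
+```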
+
diff --git a/docs/rag-examples/rag-examples.md b/docs/rag-examples/rag-examples.md
new file mode 100644
index 0000000..d498381
--- /dev/null
+++ b/docs/rag-examples/rag-examples.md
@@ -0,0 +1,10 @@
+---
+layout: default
+title: RAG Examples
+nav_order: 3
+has_children: true
+---
+
+MarkLogic excels at supporting RAG, or ["Retrieval-Augmented Generation"](https://python.langchain.com/docs/tutorials/rag/),
+via its schema-agnostic nature as well as its powerful and flexible indexing. See the pages below for examples of RAG
+in different languages.
diff --git a/rag-langchain-java/README.md b/docs/rag-examples/rag-java.md
similarity index 86%
rename from rag-langchain-java/README.md
rename to docs/rag-examples/rag-java.md
index b607ee5..d2c3e63 100644
--- a/rag-langchain-java/README.md
+++ b/docs/rag-examples/rag-java.md
@@ -1,22 +1,33 @@
-# RAG with langchain4j and MarkLogic
-
-[Retrieval Augmented Generation (RAG)](https://docs.langchain4j.dev/tutorials/rag) is implemented with
+---
+layout: default
+title: RAG with langchain4j
+parent: RAG Examples
+nav_order: 2
+---
+
+[Retrieval Augmented Generation (RAG)](https://docs.langchain4j.dev/tutorials/rag) can be implemented in Java with
[langchain4j](https://docs.langchain4j.dev/intro) and MarkLogic via a "retriever". The examples in this
directory demonstrate three different kinds of retrievers that you can consider for your own AI application.

+## Table of contents
+{: .no_toc .text-delta }
+
+- TOC
+{:toc}
+
## Setup

-The only system requirement for running these examples is Java 8 or higher. You can run the examples via an IDE such as
+The only system requirement for running these examples is Java 8 or higher. You can run the examples via an IDE such as
Visual Studio Code or IntelliJ. You can also use [Gradle](https://gradle.org/) to run the examples, but you do not
-need Gradle installed - this repository uses the [Gradle wrapper](https://docs.gradle.org/current/userguide/gradle_wrapper.html)
+need Gradle installed - this repository uses the [Gradle wrapper](https://docs.gradle.org/current/userguide/gradle_wrapper.html)
to download an appropriate version of Gradle.

## RAG with a simple word query

A key feature of MarkLogic is its ability to index all text in a document during ingest. Thus, a simple approach to RAG
-with MarkLogic is to select documents based on the words in a user's question.
+with MarkLogic is to select documents based on the words in a user's question.

-To demonstrate this, you can run the Gradle `askWordQuery` task with any question. This example program uses a custom
+To demonstrate this, you can run the Gradle `askWordQuery` task with any question. This example program uses a custom
langchain4j retriever that selects documents in the `ai-examples-content` MarkLogic database containing one or more
words in the given question. It then includes the top 10 most relevant documents in the request that it sends to Azure OpenAI.
For example:

@@ -26,19 +37,19 @@ For example:

Running this will yield an answer similar to the one below (the answer can vary based on the LLM in use and the nature
of the configured deployment model):

-> Jane Johnson is a suspect in several incidents, including cybercrime, public intoxication, vandalism, assault,
-> looting, and shoplifting. She is described as a Caucasian female in her mid-30s, with blonde hair and blue eyes.
-> She is approximately 5'6" and has a slim build. In some incidents, she was wearing a black hoodie and jeans,
-> while in others she wore a red coat or a black jacket.
The motives for her actions are unclear, but there are
-> some speculations that she is struggling financially, dealing with personal issues, or protesting against
+> Jane Johnson is a suspect in several incidents, including cybercrime, public intoxication, vandalism, assault,
+> looting, and shoplifting. She is described as a Caucasian female in her mid-30s, with blonde hair and blue eyes.
+> She is approximately 5'6" and has a slim build. In some incidents, she was wearing a black hoodie and jeans,
+> while in others she wore a red coat or a black jacket. The motives for her actions are unclear, but there are
+> some speculations that she is struggling financially, dealing with personal issues, or protesting against
> certain actions. The police have been notified and are investigating the incidents.

You can alter the value of the `-Pquestion=` parameter to be any question you wish.

Note as well that if you have tried the [Python langchain examples](rag-python.md), you will notice
-some differences in the results. These differences are primarily due to the different prompts used by langchain and
+some differences in the results. These differences are primarily due to the different prompts used by langchain and
langchain4j. See [the langchain4j documentation](https://docs.langchain4j.dev/intro) for more information on prompt
-templates when using langchain4j.
+templates when using langchain4j.

## RAG with a contextual query

@@ -57,18 +68,18 @@ Try running the following:

../gradlew askContextualQuery -Pquestion="What disturbances has Jane Doe caused?"

The answer will be similar to the one below. You can see how the results are based only on documents involving public
-intoxication as opposed to the entire set of fictional crime events. In addition, due to the equal weighting of the
-word query and the combined query, the retriever is likely to pull in documents involving public intoxication but
+intoxication as opposed to the entire set of fictional crime events. In addition, due to the equal weighting of the
+word query and the combined query, the retriever is likely to pull in documents involving public intoxication but
not involving Jane Doe:

-> Regarding Jane Doe, Ashley Frazier reports a public intoxication incident in which the suspect is wearing a
-> red dress and has long blonde hair while stumbling around and slurring her words. The suspect is alone and
-> causing a disturbance. Ashley's motive for calling 911 is to ensure everyone's safety and to get the suspect
-> the help she needs.
+> Regarding Jane Doe, Ashley Frazier reports a public intoxication incident in which the suspect is wearing a
+> red dress and has long blonde hair while stumbling around and slurring her words. The suspect is alone and
+> causing a disturbance. Ashley's motive for calling 911 is to ensure everyone's safety and to get the suspect
+> the help she needs.
In the case of John Smith, Christina Mahoney reports a public intoxication incident at
+> 754 Main Road in San Francisco. John is stumbling around, slurring his words, and causing a disturbance
+> while wearing a red shirt. Christina's motive for calling 911 is to report the incident to the authorities
+> as she was worried about John's safety and that of others. Rachel Sandoval reports a public intoxication
+> incident involving Jane Smith at 542 Hill Lane in San Francisco. Jane is stumbling around and slurring
> her words and causing a disturbance. Her motive for reporting the incident is also to ensure everyone's safety.

diff --git a/rag-langchain-js/README.md b/docs/rag-examples/rag-javascript.md
similarity index 88%
rename from rag-langchain-js/README.md
rename to docs/rag-examples/rag-javascript.md
index 722681e..31fbdee 100644
--- a/rag-langchain-js/README.md
+++ b/docs/rag-examples/rag-javascript.md
@@ -1,18 +1,29 @@
-# RAG with LangChainJS and MarkLogic
-
-[Retrieval Augmented Generation (RAG)](https://docs.langchain4j.dev/tutorials/rag) is implemented with
-[LangChainJS](https://js.langchain.com/docs/introduction/) and MarkLogic via a "retriever". The example in this
+---
+layout: default
+title: RAG with LangChain.js
+parent: RAG Examples
+nav_order: 3
+---
+
+[Retrieval Augmented Generation (RAG)](https://docs.langchain4j.dev/tutorials/rag) can be implemented in JavaScript with
+[LangChain.js](https://js.langchain.com/docs/introduction/) and MarkLogic via a "retriever". The example in this
directory demonstrates one kind of retriever that you can consider for your own AI application.

+## Table of contents
+{: .no_toc .text-delta }
+
+- TOC
+{:toc}
+
## Setup

The only system requirements for running these examples are Node 18.x, 19.x, or 20.x
-(see [LangChainJS Installation]https://js.langchain.com/v0.1/docs/get_started/installation/) and npm.
+(see [LangChain.js Installation](https://js.langchain.com/v0.1/docs/get_started/installation/)) and npm.
Minimum versions of npm are dependent on the version of Node.
See [Node Releases](https://nodejs.org/en/about/previous-releases#looking-for-latest-release-of-a-version-branch)
for more information.

-For this LangChainJS example, in addition to the environment variables in the `.env` file described in the README in the
+For this LangChain.js example, in addition to the environment variables in the `.env` file described in the README in the
root directory of this project, you'll also need to add the `AZURE_OPENAI_API_INSTANCE_NAME` setting to the `.env` file.
```
OPENAI_API_VERSION=2023-12-01-preview
@@ -33,7 +44,7 @@ npm install

A key feature of MarkLogic is its ability to index all text in a document during ingest. Thus, a simple approach to RAG
with MarkLogic is to select documents based on the words in a user's question.

-To demonstrate this, you can run the `askWordQuery.js` module with any question. The module uses a custom LangChainJS
+To demonstrate this, you can run the `askWordQuery.js` module with any question. The module uses a custom LangChain.js
retriever that selects documents in the `ai-examples-content` MarkLogic database containing one or more of the words
in the given question. It then includes the top 10 most relevant documents in the request that it sends to Azure OpenAI.
For example: diff --git a/rag-langchain-python/README.md b/docs/rag-examples/rag-python.md similarity index 78% rename from rag-langchain-python/README.md rename to docs/rag-examples/rag-python.md index 3993855..7cd07d5 100644 --- a/rag-langchain-python/README.md +++ b/docs/rag-examples/rag-python.md @@ -1,13 +1,24 @@ -# RAG with langchain and MarkLogic - -[Retrieval Augmented Generation (RAG)](https://python.langchain.com/docs/tutorials/rag/) is implemented with -[langchain](https://python.langchain.com/docs/introduction/) and MarkLogic via a "retriever". The examples in this +--- +layout: default +title: RAG with LangChain +parent: RAG Examples +nav_order: 1 +--- + +[Retrieval Augmented Generation (RAG)](https://python.langchain.com/docs/tutorials/rag/) can be implemented in Python +with [langchain](https://python.langchain.com/docs/introduction/) and MarkLogic via a "retriever". The examples in this directory demonstrate three different kinds of retrievers that you can consider for your own AI application. +## Table of contents +{: .no_toc .text-delta } + +- TOC +{:toc} + ## Setup -To try these examples, you should first create a new Python virtual environment. There are many ways to do this; -you can use a tool such as [pyenv](https://github.com/pyenv/pyenv), or just follow these simple steps that +To try these examples, you should first create a new Python virtual environment. There are many ways to do this; +you can use a tool such as [pyenv](https://github.com/pyenv/pyenv), or just follow these simple steps that [create a virtual environment using `venv`](https://docs.python.org/3/library/venv.html): ``` @@ -17,21 +28,21 @@ python -m venv .venv source .venv/bin/activate ``` -Once you have a virtual environment created, run the following to install the necessary langchain dependencies along +Once you have a virtual environment created, run the following to install the necessary langchain dependencies along with the [MarkLogic Python client](https://pypi.org/project/marklogic-python-client/): pip install --quiet --upgrade langchain langchain-community langchain_openai marklogic_python_client -You are now ready to execute the example RAG programs. +You are now ready to execute the example RAG programs. ## RAG with a simple word query A key feature of MarkLogic is its ability to index all text in a document during ingest. Thus, a simple approach to RAG -with MarkLogic is to select documents based on the words in a user's question. +with MarkLogic is to select documents based on the words in a user's question. To demonstrate this, you can run the `ask_word_query.py` module with any question. The module uses a custom langchain retriever that selects documents in the `ai-examples-content` MarkLogic database containing one or more of the words -in the given question. It then includes the top 10 most relevant documents in the request that it sends to Azure OpenAI. +in the given question. It then includes the top 10 most relevant documents in the request that it sends to Azure OpenAI. For example: python ask_word_query.py "What disturbances has Jane Doe caused?" @@ -45,15 +56,15 @@ of the configured deployment model): ## RAG with a contextual query -In many applications built on MarkLogic, a user will search the documents in a database by leveraging a variety of +In many applications built on MarkLogic, a user will search the documents in a database by leveraging a variety of indexes in MarkLogic, such as the universal text index, date range indexes, and geospatial indexes. 
This query - which -can feature any of the many dozens of different query functions supported by MarkLogic - is referred to as a +can feature any of the many dozens of different query functions supported by MarkLogic - is referred to as a "contextual query" - it captures the user's context in terms of what documents they are interested in. A RAG approach -can then account for both this contextual query and a user's question by enhancing the contextual query with a word -query based on the words in a user's question. +can then account for both this contextual query and a user's question by enhancing the contextual query with a word +query based on the words in a user's question. -The `ask_contextual_query.py` module demonstrates this approach by defining a simple contextual query that only -selects documents containing a JSON property named `type` with a value of `public intoxication`. +The `ask_contextual_query.py` module demonstrates this approach by defining a simple contextual query that only +selects documents containing a JSON property named `type` with a value of `public intoxication`. Try running the following: python ask_contextual_query.py "What disturbances has Jane Doe caused?" @@ -61,25 +72,25 @@ Try running the following: The answer will be similar to the one below. You can see how the results are based only on documents involving public intoxication as opposed to the entire set of fictional crime events: -> Jane Doe has caused disturbances by stumbling around, slurring her words, and causing a disturbance in -> public areas. She has been reported to be yelling at people passing by and blocking the entrance to a +> Jane Doe has caused disturbances by stumbling around, slurring her words, and causing a disturbance in +> public areas. She has been reported to be yelling at people passing by and blocking the entrance to a > nearby store. There are concerns for her safety and the safety of others around her. -## RAG with a vector query +## RAG with a vector query -MarkLogic 12 has +MarkLogic 12 has [new support for generative AI capabilities](https://investors.progress.com/news-releases/news-release-details/progress-announces-powerful-new-generative-ai-capabilities) -via a set of [vector operations](https://docs.marklogic.com/12.0/vec/vector-operations). With this approach, +via a set of [vector operations](https://docs.marklogic.com/12.0/vec/vector-operations). With this approach, documents are first selected in a manner similar to the approaches shown above - by leveraging the powerful and flexible -set of indexes that have long been available in MarkLogic. The documents are then further filtered and sorted via +set of indexes that have long been available in MarkLogic. The documents are then further filtered and sorted via the following process: -1. An embedding of the user's question is generated using [langchain and Azure OpenAI](https://python.langchain.com/docs/integrations/text_embedding/). -2. Using MarkLogic's new vector API, the generated embedding is compared against the embeddings in each -selected crime event document to generate a similarity score for each document. +1. An embedding of the user's question is generated using [langchain and Azure OpenAI](https://python.langchain.com/docs/integrations/text_embedding/). +2. Using MarkLogic's new vector API, the generated embedding is compared against the embeddings in each + selected crime event document to generate a similarity score for each document. 3. 
The documents with the highest similarity scores are sent to the LLM to augment the user's question.

-To try the `ask_vector_query.py` module, you will need to have installed MarkLogic 12 and also have defined
+To try the `ask_vector_query.py` module, you will need to have installed MarkLogic 12 and also have defined
`AZURE_EMBEDDING_DEPLOYMENT_NAME` in your `.env` file. Please see the [setup instructions](../setup.md)
for more information.

You can now run `ask_vector_query.py`:

An example result is shown below:

-> Jane Doe has caused disturbances of the peace, including yelling, screaming, banging on doors and windows,
-> and vandalism. The motives for her behavior are unclear, but it may be related to personal issues or
+> Jane Doe has caused disturbances of the peace, including yelling, screaming, banging on doors and windows,
+> and vandalism. The motives for her behavior are unclear, but it may be related to personal issues or
> mental health problems. She has been described as agitated, upset, and heavily intoxicated.

The results are similar to, but slightly different from, the results shown above for a simple word query. You can compare

For an example of how to add embeddings to your data, please see [this embedding

The three RAG approaches shown above - a simple word query, a contextual query, and a vector query - demonstrate how
easily data can be queried and retrieved from MarkLogic using langchain. Identifying the optimal approach for your own
data will require testing the approaches you choose and possibly leveraging additional MarkLogic indexes and/or
further enriching your data.
diff --git a/setup/README.md b/docs/setup.md
similarity index 88%
rename from setup/README.md
rename to docs/setup.md
index c7e9069..be55ae5 100644
--- a/setup/README.md
+++ b/docs/setup.md
@@ -1,7 +1,11 @@
-# Setup
+---
+layout: default
+title: Setup
+nav_order: 2
+---

-If you would like to try out the example programs, please follow these steps to set up a local MarkLogic instance
-and to deploy a small application to it:
+If you would like to try out the example programs in this repository, please follow these steps to set up a local
+MarkLogic instance and to deploy a small application to it:

1. Ensure you have Java 8 or higher installed.
2. In a terminal window, run `cd setup`.

MarkLogic via Docker instead of the command above:

docker compose -f docker-compose-12.yml up -d --build

-### Azure OpenAI configuration
+## Azure OpenAI configuration

Each AI example program needs to connect to an AI service. The examples in this repository depend on
[the Azure OpenAI Service](https://azure.microsoft.com/en-us/products/ai-services/openai-service), though they can be

to add the following to your `.env` file.
```
AZURE_EMBEDDING_DEPLOYMENT_NAME=
```
+
diff --git a/splitting-langchain-java/README.md b/docs/splitting.md
similarity index 86%
rename from splitting-langchain-java/README.md
rename to docs/splitting.md
index 1bc6c8c..262a121 100644
--- a/splitting-langchain-java/README.md
+++ b/docs/splitting.md
@@ -1,47 +1,59 @@
-# Splitting documents with langchain4j
+---
+layout: default
+title: Splitting Examples
+nav_order: 4
+---
+
+## Table of contents
+{: .no_toc .text-delta }
+
+- TOC
+{:toc}
+
+## Splitting documents with langchain4j

A RAG approach typically benefits from sending multiple smaller segments or "chunks" of text to an LLM. While MarkLogic
can efficiently ingest and index large documents, sending all the text in even a single document may either exceed
-the number of tokens allowed by your LLM or may result in slower and more expensive responses from the LLM. Thus,
-when importing or reprocessing documents in MarkLogic, your RAG approach may benefit from splitting the searchable
+the number of tokens allowed by your LLM or may result in slower and more expensive responses from the LLM. Thus,
+when importing or reprocessing documents in MarkLogic, your RAG approach may benefit from splitting the searchable
text in a document into smaller segments or "chunks" that allow for much smaller and more relevant segments of text
-to be sent to the LLM.
+to be sent to the LLM.

This project demonstrates two different approaches to splitting documents:

1. Splitting the text in a document and storing each chunk in a new separate document.
-2. Splitting the text in a document and storing the set of chunks in a new separate document.
+2. Splitting the text in a document and storing the set of chunks in a new separate document.

-You are not limited to these approaches. For example, you may find it beneficial to not create a new document but
+You are not limited to these approaches. For example, you may find it beneficial to not create a new document but
rather store the set of chunks in the same document containing the searchable text. These two approaches are intended
-to show how easily you can split and store chunks of text and thus get you started with splitting your own data.
+to show how easily you can split and store chunks of text and thus get you started with splitting your own data.

## Setup

Assuming you have followed the [setup instructions for these examples](setup.md), then you already have a
-database in your MarkLogic cluster named `ai-examples-content`. This database contains a small set - specifically,
-3,034 text documents - of the
-[Enron email dataset](https://www.loc.gov/item/2018487913/) in a collection named `enron`. These documents are good
+database in your MarkLogic cluster named `ai-examples-content`. This database contains a small set - specifically,
+3,034 text documents - of the
+[Enron email dataset](https://www.loc.gov/item/2018487913/) in a collection named `enron`. These documents are good
candidates for splitting as many of them have amounts of text large enough to exceed common LLM token limits. As the
documents are text, they also are good candidates for the two approaches shown here - i.e. creating separate documents
-and leaving the original text documents untouched.
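+If you would like to confirm that the dataset is in place before running the splitting programs, a quick check with
+the [MarkLogic Java Client](https://docs.marklogic.com/guide/java) can count the documents in the `enron` collection.
+This is a hedged sketch - the host, port, and credentials are placeholder assumptions to adjust to the values from
+your setup:
+
+```java
+import com.marklogic.client.DatabaseClient;
+import com.marklogic.client.DatabaseClientFactory;
+import com.marklogic.client.io.SearchHandle;
+import com.marklogic.client.query.QueryManager;
+import com.marklogic.client.query.StructuredQueryBuilder;
+
+public class VerifyEnronData {
+
+    public static void main(String[] args) {
+        // Placeholder connection details; use the values from your own setup.
+        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8000, "ai-examples-content",
+            new DatabaseClientFactory.DigestAuthContext("your-username", "your-password"));
+        try {
+            // Count the documents in the 'enron' collection; this should report 3,034.
+            QueryManager queryManager = client.newQueryManager();
+            StructuredQueryBuilder queryBuilder = queryManager.newStructuredQueryBuilder();
+            SearchHandle results = queryManager.search(queryBuilder.collection("enron"), new SearchHandle());
+            System.out.println("Documents in 'enron' collection: " + results.getTotalResults());
+        } finally {
+            client.release();
+        }
+    }
+}
+```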
-You also need Java 8 in order to run these examples, which is the same version of Java needed by the aforementioned
+You also need Java 8 or higher in order to run these examples, which is the same version of Java needed by the aforementioned
setup instructions.

## Splitting chunks to separate documents

In this approach, the [langchain4j document splitter API](https://docs.langchain4j.dev/tutorials/rag#document-splitter)
and the [MarkLogic Data Movement SDK](https://docs.marklogic.com/guide/java/data-movement)
-are used to create chunks of no more than 1,000 characters each. Each chunk is then saved to a new JSON document in a
+are used to create chunks of no more than 1,000 characters each. Each chunk is then saved to a new JSON document in a
collection named `enron-chunk` with the following fields:

- `sourceUri` = the URI of the document that the chunk was extracted from.
- `text` = the chunk of text extracted from the document identified by `sourceUri`.

-By limiting the number of tokens in each chunk, a RAG approach can typically send a dozen or more chunks to the LLM
+By limiting the number of tokens in each chunk, a RAG approach can typically send a dozen or more chunks to the LLM
without exceeding a token limit. The exact number of chunks will depend on the max number of characters you specify
-along with your LLM token limit.
+along with your LLM token limit.

To create these chunks in separate documents, run the following Gradle task:

@@ -81,7 +93,7 @@ is used to create chunks of no more than 1,000 characters each. Each chunk is th
collection named `enron-chunk` with the following fields:

- `sourceUri` = the URI of the document that the chunks were extracted from.
-- `chunks` = a JSON array containing each chunk.
+- `chunks` = a JSON array containing each chunk.

To create these documents, run the following Gradle task:

@@ -92,7 +104,7 @@ following format:

(source URI)-chunks-(number of chunks).json

-An example document, with two chunks, is shown below. Note that it includes some overlap between the two chunks,
+An example document, with two chunks, is shown below. Note that it includes some overlap between the two chunks,
as the program defaults to 100 characters of overlap between chunks:

```
@@ -163,3 +175,4 @@ as the program defaults to 100 characters of overlap between chunks:
]
}
```
+
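+As a rough illustration of the splitting step described above, the following langchain4j sketch splits a string into
+chunks of no more than 1,000 characters with 100 characters of overlap. It is a simplified assumption of what the
+example programs do - reading the Enron documents from MarkLogic and writing each chunk back via the Data Movement
+SDK is omitted:
+
+```java
+import dev.langchain4j.data.document.Document;
+import dev.langchain4j.data.document.splitter.DocumentSplitters;
+import dev.langchain4j.data.segment.TextSegment;
+
+import java.util.List;
+
+public class SplitSketch {
+
+    public static void main(String[] args) {
+        // In the example programs, this text would come from a document in the
+        // 'enron' collection rather than a hard-coded string.
+        String emailText = "Placeholder email text that would normally be much longer...";
+
+        // Chunks of no more than 1,000 characters with 100 characters of overlap,
+        // matching the defaults described above.
+        List<TextSegment> segments = DocumentSplitters.recursive(1000, 100)
+            .split(Document.from(emailText));
+
+        for (TextSegment segment : segments) {
+            System.out.println(segment.text());
+        }
+    }
+}
+```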