Skip to content

Commit

Permalink
Merge branch 'main' of github.com:DS4SD/docling into cau/optimize-tab…
Browse files Browse the repository at this point in the history
…le-quality
  • Loading branch information
cau-git committed Jul 17, 2024
2 parents 5acb7b5 + 0dfa454 commit 1f77dc1
Show file tree
Hide file tree
Showing 12 changed files with 258 additions and 77 deletions.
19 changes: 19 additions & 0 deletions .github/actions/setup-poetry/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Composite action: installs a pinned Poetry through pipx, sets up the
# requested Python interpreter with setup-python's Poetry cache, and then
# installs the project together with all of its extras.
name: Set up Poetry and install
description: Set up a specific version of Poetry and install dependencies using caching.

inputs:
  python-version:
    description: >-
      Version range or exact version of Python or PyPy to use,
      using SemVer's version range syntax.
    default: "3.11"

runs:
  using: composite
  steps:
    # NOTE(review): Poetry is installed before setup-python runs, presumably
    # so that the `cache: poetry` option below can locate it - confirm.
    - name: Install poetry
      shell: bash
      run: pipx install poetry==1.8.3

    - uses: actions/setup-python@v4
      with:
        python-version: ${{ inputs.python-version }}
        cache: poetry

    - name: Install dependencies
      shell: bash
      run: poetry install --all-extras
39 changes: 39 additions & 0 deletions .github/scripts/release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
#
# Cut a release of the package:
#   1. bump the version in pyproject.toml via poetry,
#   2. collect the unreleased notes from semantic-release,
#   3. prepend a new section with those notes to the changelog,
#   4. commit and push the bump to main,
#   5. create the GitHub release (which also creates the Git tag).
#
# Environment:
#   TARGET_VERSION  version to release, without the leading "v" (required)
#   CHGLOG_FILE     path of the changelog file (default: CHANGELOG.md)
#
# Calls poetry, git and the gh CLI; gh must already be authenticated.

set -e  # trigger failure on error - do not remove!
set -x  # display command on output

if [ -z "${TARGET_VERSION}" ]; then
    >&2 echo "No TARGET_VERSION specified"
    exit 1
fi
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"

# Scratch files for the release notes and the rebuilt changelog. The trap
# removes whatever is left of them on every exit path, including failures
# triggered by `set -e`; after the `mv` below TMP_CHGLOG no longer exists and
# `rm -f` silently ignores it.
REL_NOTES=$(mktemp)
TMP_CHGLOG=$(mktemp)
trap 'rm -f -- "${REL_NOTES}" "${TMP_CHGLOG}"' EXIT

# update package version
poetry version "${TARGET_VERSION}"

# collect release notes
poetry run semantic-release changelog --unreleased >> "${REL_NOTES}"

# update changelog: new heading, then the fresh notes, then the old content
TARGET_TAG_NAME="v${TARGET_VERSION}"
RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}"
# Values are passed as printf arguments rather than spliced into the format
# string, so a '%' or backslash in the tag or URL is written out literally.
printf '## [%s](%s) - %s\n\n' "${TARGET_TAG_NAME}" "${RELEASE_URL}" "$(date -Idate)" >> "${TMP_CHGLOG}"
cat "${REL_NOTES}" >> "${TMP_CHGLOG}"
if [ -f "${CHGLOG_FILE}" ]; then
    # blank line between the new section and the previous changelog content
    printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}"
fi
mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"

# push changes
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git add pyproject.toml "${CHGLOG_FILE}"
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
git commit -m "${COMMIT_MSG}"
git push origin main

# create GitHub release (incl. Git tag)
gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}"
61 changes: 61 additions & 0 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Continuous delivery: on every push to main, run the shared checks, ask
# semantic-release which version would be released and, when that version is
# non-empty, run the release script.
name: Run CD

on:
  push:
    branches: [main]

env:
  # disable keyring (https://github.com/actions/runner-images/issues/6185):
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring

jobs:
  # To be enabled when we add docs
  # docs:
  #   permissions:
  #     contents: write
  #   runs-on: ubuntu-latest
  #   steps:
  #     - uses: actions/checkout@v3
  #     - uses: ./.github/actions/setup-poetry
  #     - name: Build and push docs
  #       run: poetry run mkdocs gh-deploy --force

  code-checks:
    uses: ./.github/workflows/checks.yml

  pre-release-check:
    runs-on: ubuntu-latest
    outputs:
      # Re-exported under a job-level name for the `release` job below.
      TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }}
    steps:
      - uses: actions/checkout@v3
        with:
          # full history so tags are available (required for semantic-release)
          fetch-depth: 0
      - uses: ./.github/actions/setup-poetry
      - id: version_check
        name: Check version of potential release
        run: |
          TRGT_VERSION=$(poetry run semantic-release print-version)
          echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT
          echo "${TRGT_VERSION}"
      - name: Check notes of potential release
        run: poetry run semantic-release changelog --unreleased

  release:
    needs:
      - code-checks
      - pre-release-check
    # Skipped when the pre-release check produced an empty version.
    if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
    environment: auto-release
    runs-on: ubuntu-latest
    concurrency: release
    steps:
      - uses: actions/checkout@v3
        with:
          token: ${{ secrets.GH_PAT }}
          # full history so tags are available (required for semantic-release)
          fetch-depth: 0
      - uses: ./.github/actions/setup-poetry
      - name: Run release script
        shell: bash
        run: ./.github/scripts/release.sh
        env:
          GH_TOKEN: ${{ secrets.GH_PAT }}
          TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }}
          CHGLOG_FILE: CHANGELOG.md
16 changes: 16 additions & 0 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Reusable workflow (triggered only through `workflow_call`): runs the
# pre-commit checks once per Python version in the matrix.
on:
  workflow_call:

jobs:
  run-checks:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so the versions stay strings.
        python-version:
          - "3.11"
          - "3.12"
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup-poetry
        with:
          python-version: ${{ matrix.python-version }}
      - name: Run styling check
        run: poetry run pre-commit run --all-files
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Continuous integration: runs the shared checks for pull requests and for
# pushes to every branch except main and gh-pages.
name: Run CI

on:
  pull_request:
    types:
      - opened
      - reopened
      - synchronize
      - ready_for_review
  push:
    branches:
      - '**'
      - '!main'
      - '!gh-pages'

env:
  # disable keyring (https://github.com/actions/runner-images/issues/6185):
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring

jobs:
  code-checks:
    uses: ./.github/workflows/checks.yml

  # To enable when we add the ./docs
  # build-docs:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - uses: actions/checkout@v3
  #     - uses: ./.github/actions/setup-poetry
  #     - name: Build docs
  #       run: poetry run mkdocs build --verbose --clean

21 changes: 21 additions & 0 deletions .github/workflows/pypi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Builds the package and publishes it to PyPI whenever a GitHub release is
# published.
name: "Build and publish package"

on:
  release:
    types: [published]

permissions:
  contents: read

env:
  # disable keyring (https://github.com/actions/runner-images/issues/6185):
  PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring

jobs:
  build-and-publish:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup-poetry
      - name: Build and publish
        env:
          # Hand the token to Poetry through its documented environment
          # variable instead of templating the secret into the command line,
          # where it would become part of the generated shell script and of
          # the process arguments.
          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
        run: poetry publish --build --no-interaction
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,6 @@ tags
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

Expand Down
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## [v0.3.0](https://github.com/DS4SD/docling/releases/tag/v0.3.0) - 2024-07-17

### Feature

* Enable python 3.12 support by updating glm ([#8](https://github.com/DS4SD/docling/issues/8)) ([`fb72688`](https://github.com/DS4SD/docling/commit/fb72688ff7413083c864fe62d2dbfc420c1e5268))

### Documentation

* Add setup with pypi to Readme ([#7](https://github.com/DS4SD/docling/issues/7)) ([`2803222`](https://github.com/DS4SD/docling/commit/2803222ee1708481c779d435dbf1c031929d3cf6))

## [v0.2.0](https://github.com/DS4SD/docling/releases/tag/v0.2.0) - 2024-07-16

### Feature

* Build with ci ([#6](https://github.com/DS4SD/docling/issues/6)) ([`b1479cf`](https://github.com/DS4SD/docling/commit/b1479cf4ecf8a586703b31c7cf6917b3293c6a85))
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get clean

RUN --mount=type=ssh \
pip install --no-cache-dir https://github.com/DS4SD/docling.git
RUN pip install --no-cache-dir docling

ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/
Expand Down
24 changes: 15 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
<p align="center">
<a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="logo.png" width="150" /> </a>
<a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" /> </a>
</p>

# Docling

Dockling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.

## Features
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
Expand All @@ -14,23 +14,29 @@ Dockling bundles PDF document conversion to JSON and Markdown in an easy, self-c

## Setup

You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).
For general usage, you can simply install `docling` through `pip` from the PyPI package index.
```
pip install docling
```

**Notes**:
* Works on macOS and Linux environments. Windows platforms are currently not tested.

### Development setup

Once you have `poetry` installed, create an environment and install the package:
To develop for `docling`, you need Python 3.11 and `poetry`. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).

Once you have installed `poetry` and cloned this repo, create an environment and install `docling` from the repo root:

```bash
poetry env use $(which python3.11)
poetry shell
poetry install
```

**Notes**:
* Works on macOS and Linux environments. Windows platforms are currently not tested.


## Usage

For basic usage, see the [convert.py](examples/convert.py) example module. Run with:
For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:

```
python examples/convert.py
Expand Down
Loading

0 comments on commit 1f77dc1

Please sign in to comment.