Fix lora single device fine tune checkpoint saving & nan loss when use_dora=True #2381
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# GPU test workflow: installs the project against stable and nightly PyTorch
# builds on a GPU runner and runs the recipe + unit test suite with coverage.
name: GPU tests

# `on` is the GitHub Actions trigger key (generic YAML 1.1 parsers may read it
# as a boolean; GitHub's loader does not).
on:
  schedule:
    # Runs at midnight every day
    - cron: '0 0 * * *'
  push:
    branches: [main]
  # Empty values are intentional: trigger with the default activity types.
  pull_request:
  workflow_dispatch:

concurrency:
  # On main the group key includes the run number, so every run gets its own
  # group and is never cancelled. On any other ref the key is the ref itself,
  # so a newer run on the same ref cancels the one still in progress.
  group: gpu-test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

permissions:
  # NOTE(review): id-token: write allows the job to request an OIDC token;
  # presumably required by the GPU runner infrastructure - confirm.
  id-token: write
  contents: read

defaults:
  run:
    # Login shell (-l) so the conda environment activated by the
    # setup-miniconda step is picked up by every `run` step; -e / pipefail
    # make a step fail on the first failing command or pipeline stage.
    shell: bash -l -eo pipefail {0}

jobs:
  gpu_test:
    # Skip the job when the workflow runs outside the pytorch organisation.
    if: github.repository_owner == 'pytorch'
    runs-on: linux.8xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so that 3.10 is not parsed as the float 3.1.
        python-version: ['3.9', '3.10', '3.11']
        torch-version: ['stable', 'nightly']
        # Do not run against nightlies on PR.
        # On pull_request the expression yields 'nightly', removing those
        # combinations; on every other event it yields false, which matches
        # no torch-version value, so nothing is excluded.
        exclude:
          - torch-version: ${{ github.event_name == 'pull_request' && 'nightly' }}
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install --upgrade pip
      # Exactly one of the two torch install steps runs per matrix entry.
      - name: Install torch nightly
        if: ${{ matrix.torch-version == 'nightly' }}
        run: python -m pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121
      - name: Install torch stable
        if: ${{ matrix.torch-version == 'stable' }}
        run: python -m pip install torch torchvision torchao
      - name: Install remaining dependencies
        run: |
          python -m pip install -e ".[dev]"
          python -m pip install lm-eval==0.4.5
      - name: Run recipe and unit tests with coverage
        run: pytest tests --with-integration --cov=. --cov-report=xml --durations=20 -vv
      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3