From 6794722ae3b01bf2c843efa3d9e09f77f9af7205 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Sat, 19 Oct 2024 10:57:24 -0700 Subject: [PATCH 1/5] Add asset integrity check for tokenizer downloads --- src/fairseq2/assets/download_manager.py | 20 +++++++++++++++++++- src/fairseq2/data/text/text_tokenizer.py | 7 ++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/fairseq2/assets/download_manager.py b/src/fairseq2/assets/download_manager.py index 2900f4d41..1e2851811 100644 --- a/src/fairseq2/assets/download_manager.py +++ b/src/fairseq2/assets/download_manager.py @@ -66,6 +66,7 @@ def download_checkpoint( def download_tokenizer( self, uri: str, + checksum: str, model_name: str, *, tokenizer_name: str | None = None, @@ -156,6 +157,7 @@ def download_checkpoint( def download_tokenizer( self, uri: str, + checksum: str, model_name: str, *, tokenizer_name: str | None = None, @@ -169,7 +171,10 @@ def download_tokenizer( op = _AssetDownloadOp(self._cache_dir, uri, display_name, force, progress) - return op.run() + path = op.run() + self._validate_asset_integrity(path, checksum) + + return path @override def download_dataset( @@ -186,6 +191,19 @@ def download_dataset( return op.run() + def _validate_asset_integrity(self, path: Path, checksum: str) -> None: + BYTES_PER_CHUNK = 65536 + sha = sha1() + + with open(path, "rb") as file: + while data := file.read(BYTES_PER_CHUNK): + sha.update(data) + + if sha.hexdigest() != checksum: + raise AssetDownloadError( + "Downloaded asset checksum does not match the expected checksum." + ) + class _AssetDownloadOp: _cache_dir: Path diff --git a/src/fairseq2/data/text/text_tokenizer.py b/src/fairseq2/data/text/text_tokenizer.py index 62dedfe58..bc80ba307 100644 --- a/src/fairseq2/data/text/text_tokenizer.py +++ b/src/fairseq2/data/text/text_tokenizer.py @@ -223,10 +223,15 @@ def __call__( return self(tokenizer_ref, force=force, progress=progress) tokenizer_uri = card.field("tokenizer").as_uri() + tokenizer_checksum = card.field("checksum").get_as_(str) try: path = self._download_manager.download_tokenizer( - tokenizer_uri, card.name, force=force, progress=progress + tokenizer_uri, + tokenizer_checksum, + card.name, + force=force, + progress=progress, ) except ValueError as ex: raise AssetCardError( From 4d0adf5d7dfdc96052c24974c312bc69764c1037 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 23 Oct 2024 07:35:48 -0700 Subject: [PATCH 2/5] Make asset integrity check optional --- ci/docker/build-manylinux-images.sh | 0 .../build-scripts/install-cuda-11.6.sh | 0 .../build-scripts/install-cuda-11.7.sh | 0 .../build-scripts/install-cuda-11.8.sh | 0 .../build-scripts/install-cuda-12.1.sh | 0 .../manylinux_x86_64/build-scripts/install-llvm.sh | 0 src/fairseq2/assets/download_manager.py | 10 ++++++++-- tools/run-shellcheck.sh | 0 tools/set-project-version.sh | 0 9 files changed, 8 insertions(+), 2 deletions(-) mode change 100755 => 100644 ci/docker/build-manylinux-images.sh mode change 100755 => 100644 ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.6.sh mode change 100755 => 100644 ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.7.sh mode change 100755 => 100644 ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.8.sh mode change 100755 => 100644 ci/docker/manylinux_x86_64/build-scripts/install-cuda-12.1.sh mode change 100755 => 100644 ci/docker/manylinux_x86_64/build-scripts/install-llvm.sh mode change 100755 => 100644 tools/run-shellcheck.sh mode change 100755 => 100644 tools/set-project-version.sh diff --git a/ci/docker/build-manylinux-images.sh b/ci/docker/build-manylinux-images.sh old mode 100755 new mode 100644 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.6.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.6.sh old mode 100755 new mode 100644 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.7.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.7.sh old mode 100755 new mode 100644 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.8.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.8.sh old mode 100755 new mode 100644 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-12.1.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-12.1.sh old mode 100755 new mode 100644 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-llvm.sh b/ci/docker/manylinux_x86_64/build-scripts/install-llvm.sh old mode 100755 new mode 100644 diff --git a/src/fairseq2/assets/download_manager.py b/src/fairseq2/assets/download_manager.py index 1e2851811..5d90282dc 100644 --- a/src/fairseq2/assets/download_manager.py +++ b/src/fairseq2/assets/download_manager.py @@ -191,7 +191,13 @@ def download_dataset( return op.run() - def _validate_asset_integrity(self, path: Path, checksum: str) -> None: + def _validate_asset_integrity(self, path: Path, checksum: str | None) -> None: + if checksum is None: + log.warning( + f"Asset at {path} has no recorded checksum, skipping integrity check." + ) + return + BYTES_PER_CHUNK = 65536 sha = sha1() @@ -201,7 +207,7 @@ def _validate_asset_integrity(self, path: Path, checksum: str) -> None: if sha.hexdigest() != checksum: raise AssetDownloadError( - "Downloaded asset checksum does not match the expected checksum." + f"Checksum for {path} checksum does not match the expected checksum." ) diff --git a/tools/run-shellcheck.sh b/tools/run-shellcheck.sh old mode 100755 new mode 100644 diff --git a/tools/set-project-version.sh b/tools/set-project-version.sh old mode 100755 new mode 100644 From 074bc47724e4420e989c984ee558f23cf357d4e5 Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 23 Oct 2024 08:55:37 -0700 Subject: [PATCH 3/5] Revert "Make asset integrity check optional" This reverts commit 4d0adf5d7dfdc96052c24974c312bc69764c1037. --- ci/docker/build-manylinux-images.sh | 0 .../build-scripts/install-cuda-11.6.sh | 0 .../build-scripts/install-cuda-11.7.sh | 0 .../build-scripts/install-cuda-11.8.sh | 0 .../build-scripts/install-cuda-12.1.sh | 0 .../manylinux_x86_64/build-scripts/install-llvm.sh | 0 src/fairseq2/assets/download_manager.py | 10 ++-------- tools/run-shellcheck.sh | 0 tools/set-project-version.sh | 0 9 files changed, 2 insertions(+), 8 deletions(-) mode change 100644 => 100755 ci/docker/build-manylinux-images.sh mode change 100644 => 100755 ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.6.sh mode change 100644 => 100755 ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.7.sh mode change 100644 => 100755 ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.8.sh mode change 100644 => 100755 ci/docker/manylinux_x86_64/build-scripts/install-cuda-12.1.sh mode change 100644 => 100755 ci/docker/manylinux_x86_64/build-scripts/install-llvm.sh mode change 100644 => 100755 tools/run-shellcheck.sh mode change 100644 => 100755 tools/set-project-version.sh diff --git a/ci/docker/build-manylinux-images.sh b/ci/docker/build-manylinux-images.sh old mode 100644 new mode 100755 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.6.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.6.sh old mode 100644 new mode 100755 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.7.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.7.sh old mode 100644 new mode 100755 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.8.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-11.8.sh old mode 100644 new mode 100755 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-cuda-12.1.sh b/ci/docker/manylinux_x86_64/build-scripts/install-cuda-12.1.sh old mode 100644 new mode 100755 diff --git a/ci/docker/manylinux_x86_64/build-scripts/install-llvm.sh b/ci/docker/manylinux_x86_64/build-scripts/install-llvm.sh old mode 100644 new mode 100755 diff --git a/src/fairseq2/assets/download_manager.py b/src/fairseq2/assets/download_manager.py index 5d90282dc..1e2851811 100644 --- a/src/fairseq2/assets/download_manager.py +++ b/src/fairseq2/assets/download_manager.py @@ -191,13 +191,7 @@ def download_dataset( return op.run() - def _validate_asset_integrity(self, path: Path, checksum: str | None) -> None: - if checksum is None: - log.warning( - f"Asset at {path} has no recorded checksum, skipping integrity check." - ) - return - + def _validate_asset_integrity(self, path: Path, checksum: str) -> None: BYTES_PER_CHUNK = 65536 sha = sha1() @@ -207,7 +201,7 @@ def _validate_asset_integrity(self, path: Path, checksum: str | None) -> None: if sha.hexdigest() != checksum: raise AssetDownloadError( - f"Checksum for {path} checksum does not match the expected checksum." + "Downloaded asset checksum does not match the expected checksum." ) diff --git a/tools/run-shellcheck.sh b/tools/run-shellcheck.sh old mode 100644 new mode 100755 diff --git a/tools/set-project-version.sh b/tools/set-project-version.sh old mode 100644 new mode 100755 From 26298db215a93a7ace0a9b0fecec1ec42d297d5f Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Wed, 23 Oct 2024 11:01:32 -0700 Subject: [PATCH 4/5] Make asset integrity check optional --- src/fairseq2/assets/download_manager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/fairseq2/assets/download_manager.py b/src/fairseq2/assets/download_manager.py index 1e2851811..2389ac500 100644 --- a/src/fairseq2/assets/download_manager.py +++ b/src/fairseq2/assets/download_manager.py @@ -191,7 +191,13 @@ def download_dataset( return op.run() - def _validate_asset_integrity(self, path: Path, checksum: str) -> None: + def _validate_asset_integrity(self, path: Path, checksum: str | None) -> None: + if checksum is None: + log.warning( + f"Asset at {path} has no recorded checksum, skipping integrity check." + ) + return + BYTES_PER_CHUNK = 65536 sha = sha1() @@ -201,7 +207,7 @@ def _validate_asset_integrity(self, path: Path, checksum: str) -> None: if sha.hexdigest() != checksum: raise AssetDownloadError( - "Downloaded asset checksum does not match the expected checksum." + f"Checksum for {path} does not match the expected checksum." ) From faeefecf59e8a1032a5374ecafb2140db896803b Mon Sep 17 00:00:00 2001 From: Eric Brown Date: Fri, 25 Oct 2024 10:39:13 -0700 Subject: [PATCH 5/5] Add integrity check for remaining assets --- src/fairseq2/assets/download_manager.py | 14 ++++++++++++-- src/fairseq2/datasets/loader.py | 3 ++- src/fairseq2/models/loader.py | 2 ++ tests/integration/models/test_llama.py | 5 ++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/fairseq2/assets/download_manager.py b/src/fairseq2/assets/download_manager.py index 2389ac500..fc97e1d7e 100644 --- a/src/fairseq2/assets/download_manager.py +++ b/src/fairseq2/assets/download_manager.py @@ -39,6 +39,7 @@ class AssetDownloadManager(ABC): def download_checkpoint( self, uri: str, + checksum: str | None, model_name: str, *, shard_idx: int | None = None, @@ -66,7 +67,7 @@ def download_checkpoint( def download_tokenizer( self, uri: str, - checksum: str, + checksum: str | None, model_name: str, *, tokenizer_name: str | None = None, @@ -94,6 +95,7 @@ def download_tokenizer( def download_dataset( self, uri: str, + checksum: str | None, dataset_name: str, *, force: bool = False, @@ -136,6 +138,7 @@ def __init__(self) -> None: def download_checkpoint( self, uri: str, + checksum: str | None, model_name: str, *, shard_idx: int | None = None, @@ -151,13 +154,16 @@ def download_checkpoint( self._cache_dir, uri, display_name, force, progress, shard_idx ) + path = op.run() + self._validate_asset_integrity(path, checksum) + return op.run() @override def download_tokenizer( self, uri: str, - checksum: str, + checksum: str | None, model_name: str, *, tokenizer_name: str | None = None, @@ -180,6 +186,7 @@ def download_tokenizer( def download_dataset( self, uri: str, + checksum: str | None, dataset_name: str, *, force: bool = False, @@ -189,6 +196,9 @@ def download_dataset( op = _AssetDownloadOp(self._cache_dir, uri, display_name, force, progress) + path = op.run() + self._validate_asset_integrity(path, checksum) + return op.run() def _validate_asset_integrity(self, path: Path, checksum: str | None) -> None: diff --git a/src/fairseq2/datasets/loader.py b/src/fairseq2/datasets/loader.py index 973a1f6cd..3d92a0e45 100644 --- a/src/fairseq2/datasets/loader.py +++ b/src/fairseq2/datasets/loader.py @@ -85,10 +85,11 @@ def __call__( card = self._asset_store.retrieve_card(dataset_name_or_card) dataset_uri = card.field("data").as_uri() + dataset_checksum = card.field("checksum").get_as_(str) try: path = self._download_manager.download_dataset( - dataset_uri, card.name, force=force, progress=progress + dataset_uri, dataset_checksum, card.name, force=force, progress=progress ) except ValueError as ex: raise AssetCardError( diff --git a/src/fairseq2/models/loader.py b/src/fairseq2/models/loader.py index 83cdb125a..6c6685b0d 100644 --- a/src/fairseq2/models/loader.py +++ b/src/fairseq2/models/loader.py @@ -275,12 +275,14 @@ def __call__( # Load the checkpoint. checkpoint_uri = card.field("checkpoint").as_uri() + checkpoint_checksum = card.field("checksum").get_as_(str) shard_idx = gang.rank if gang is not None and gang.size != 1 else None try: path = self._download_manager.download_checkpoint( checkpoint_uri, + checkpoint_checksum, card.name, shard_idx=shard_idx, force=force, diff --git a/tests/integration/models/test_llama.py b/tests/integration/models/test_llama.py index 45a110546..5148ebbd7 100644 --- a/tests/integration/models/test_llama.py +++ b/tests/integration/models/test_llama.py @@ -25,8 +25,11 @@ def test_convert_to_reference_checkpoint() -> None: card = default_asset_store.retrieve_card("llama2_7b") + checkpoint_uri = card.field("checkpoint").as_uri() + checkpoint_checksum = card.field("checksum").get_as_(str) + path = default_download_manager.download_checkpoint( - card.field("checkpoint").as_uri(), model_name="llama2_7b", progress=False + checkpoint_uri, checkpoint_checksum, model_name="llama2_7b", progress=False ) tensor_loader = StandardTensorLoader()