From 62165de22d992cff1508e8d93ad3f410fee9bcd0 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:11:31 -0700 Subject: [PATCH] Relax hf hub pin (#1314) --- llmfoundry/data/finetuning/tasks.py | 4 +++- setup.py | 2 +- tests/data/test_dataloader.py | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 9a0f680bd7..0adad8af4e 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -114,6 +114,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ), ) SUPPORTED_EXTENSIONS = ['.csv', '.json', '.jsonl', '.parquet'] +HUGGINGFACE_FOLDER_EXTENSIONS = ['.lock', '.metadata'] PromptResponseDict = Mapping[str, str] ChatFormattedDict = Mapping[str, List[Dict[str, str]]] @@ -886,7 +887,8 @@ def build_from_hf( f for _, _, files in os.walk(dataset_name) for f in files ] if not all( - Path(f).suffix in SUPPORTED_EXTENSIONS + Path(f).suffix in SUPPORTED_EXTENSIONS + + HUGGINGFACE_FOLDER_EXTENSIONS or f == '.gitignore' for f in dataset_files ): raise InvalidFileExtensionError( diff --git a/setup.py b/setup.py index e4dc861ae6..b2720d168e 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ 'onnx==1.14.0', 'onnxruntime==1.15.1', 'boto3>=1.21.45,<2', - 'huggingface-hub>=0.19.0,<0.23', + 'huggingface-hub>=0.19.0,<0.24', 'beautifulsoup4>=4.12.2,<5', # required for model download utils 'tenacity>=8.2.3,<9', 'catalogue>=2,<3', diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index ec27df8121..a489002399 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -29,6 +29,7 @@ ) from llmfoundry.data.finetuning.tasks import ( DOWNLOADED_FT_DATASETS_DIRPATH, + HUGGINGFACE_FOLDER_EXTENSIONS, SUPPORTED_EXTENSIONS, dataset_constructor, is_valid_ift_example, @@ -471,14 +472,15 @@ def test_finetuning_dataloader_safe_load( ) # If no raised errors, we should expect downloaded files with only safe file types. - if expectation == does_not_raise(): + if isinstance(expectation, does_not_raise): download_dir = os.path.join(DOWNLOADED_FT_DATASETS_DIRPATH, hf_name) downloaded_files = [ file for _, _, files in os.walk(download_dir) for file in files ] assert len(downloaded_files) > 0 assert all( - Path(file).suffix in SUPPORTED_EXTENSIONS + Path(file).suffix in SUPPORTED_EXTENSIONS + + HUGGINGFACE_FOLDER_EXTENSIONS or file == '.gitignore' for file in downloaded_files )