Adds support for converting from safetensors #740

Status: Open, wants to merge 2 commits into main

CHANGELOG.md (4 changes: 3 additions & 1 deletion)
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

- Added support for safetensors in `hf_olmo` conversion script.

## [v0.5.1](https://github.com/allenai/OLMo/releases/tag/v0.5.1) - 2024-10-17

### Added
@@ -45,7 +47,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Swapped in correct flan data mix.
- Fix bug where the attention norm, when applied before the attention block, was modifying the residual stream.
- Fixed `OLMo.from_checkpoint()` so that it correctly loads `olmo_core` and `torch_new` style checkpoints.
- Fixed `preserve_rng_state` being incorrectly set to False when doing gradient checkpointing with dropout


## [v0.4.0](https://github.com/allenai/OLMo/releases/tag/v0.4.0) - 2024-07-11
hf_olmo/convert_olmo_to_hf.py (23 changes: 16 additions & 7 deletions)
@@ -9,15 +9,16 @@
from urllib.parse import urlparse

import torch
from olmo import ModelConfig, Tokenizer, TrainConfig
from olmo.checkpoint import build_sharded_checkpointer
from olmo.util import _get_s3_client
from omegaconf import OmegaConf as om
from safetensors.torch import load_file
from tqdm import tqdm

from hf_olmo.configuration_olmo import OLMoConfig
from hf_olmo.modeling_olmo import OLMoForCausalLM
from hf_olmo.tokenization_olmo_fast import OLMoTokenizerFast
from olmo import ModelConfig, Tokenizer, TrainConfig
from olmo.checkpoint import build_sharded_checkpointer
from olmo.util import _get_s3_client

logger = logging.getLogger(__name__)

@@ -67,10 +68,16 @@ def write_model(checkpoint_dir: str, ignore_olmo_compatibility: bool = False):
# For device_map = "auto", etc. the models are loaded in a way that start_prefix is not computed correctly.
# So, we explicitly store the model with the expected prefix.

old_model_path = os.path.join(checkpoint_dir, "model.pt")
new_model_path = os.path.join(checkpoint_dir, "pytorch_model.bin")
if os.path.exists(os.path.join(checkpoint_dir, "model.pt")):
    old_model_path = os.path.join(checkpoint_dir, "model.pt")
    state_dict = torch.load(old_model_path, map_location="cpu")
elif os.path.exists(os.path.join(checkpoint_dir, "model.safetensors")):
    old_model_path = os.path.join(checkpoint_dir, "model.safetensors")
    state_dict = load_file(old_model_path, device="cpu")
Review comment (Collaborator): Have you tested this? If you are getting the safetensors file using unshard.py then you should probably use safetensors_file_to_state_dict instead. See https://github.com/allenai/OLMo/blob/main/docs/Safetensors.md for context.

else:
    raise ValueError(f"No model found in {checkpoint_dir}")

state_dict = torch.load(old_model_path, map_location="cpu")
new_model_path = os.path.join(checkpoint_dir, "pytorch_model.bin")

# this takes care of the case where the model was saved with a different prefix,
# typically due to unsharding.
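
For readers skimming the diff, the new loading logic amounts to a small fallback between the two checkpoint formats. The sketch below is illustrative only: `load_state_dict_from_checkpoint` is a hypothetical name, not a function in the script, and it simply mirrors the branch added above.

```python
import os

import torch
from safetensors.torch import load_file


def load_state_dict_from_checkpoint(checkpoint_dir: str) -> dict:
    """Mirror of the fallback above: prefer model.pt, then model.safetensors."""
    pt_path = os.path.join(checkpoint_dir, "model.pt")
    st_path = os.path.join(checkpoint_dir, "model.safetensors")
    if os.path.exists(pt_path):
        # Legacy unsharded checkpoint: a pickled torch state dict.
        return torch.load(pt_path, map_location="cpu")
    if os.path.exists(st_path):
        # Safetensors checkpoint: a flat mapping from tensor names to tensors.
        return load_file(st_path, device="cpu")
    raise ValueError(f"No model found in {checkpoint_dir}")
```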
@@ -233,7 +240,9 @@ def upload_local_checkpoint(local_checkpoint_dir: str, destination_dir: str):


def maybe_unshard(checkpoint_dir: str):
    if os.path.exists(os.path.join(checkpoint_dir, "model.pt")):
    if os.path.exists(os.path.join(checkpoint_dir, "model.pt")) or os.path.exists(
        os.path.join(checkpoint_dir, "model.safetensors")
    ):
        return

    print(f"Unsharding {checkpoint_dir}...")