Skip to content

Commit

Permalink
omg it works
Browse files (browse the repository at this point in the history)
  • Loading branch information
nancyhung committed Oct 26, 2024
1 parent bb0dd6a commit c5ae4ff
Showing 1 changed file with 2 additions and 15 deletions.
17 changes: 2 additions & 15 deletions llmfoundry/callbacks/hf_checkpointer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -147,7 +147,6 @@ def _log_model_multiprocess(
)
logging.getLogger('llmfoundry').setLevel(python_logging_level)

log.info("----------------- REACHED MLFLOW LOG MODEL -----------------")
# monkey patch to prevent duplicate tokenizer upload
import mlflow
mlflow.start_run(
Expand All @@ -156,20 +155,13 @@ def _log_model_multiprocess(
original_save_model = mlflow.transformers.save_model
def save_model_patch(*args: Any, **kwargs: Any):
original_save_model(*args, **kwargs)
log.info(f"List of root path: {os.listdir(kwargs['path'])}")
components_path = os.path.join(kwargs['path'], 'components')
if os.path.exists(components_path):
log.info(f"List of components path: {components_path}: {os.listdir(components_path)}")
tokenizer_path = os.path.join(kwargs['path'], 'components', 'tokenizer')
tokenizer_files = []
if os.path.exists(tokenizer_path):
tokenizer_files = os.listdir(os.path.join(kwargs['path'], 'components', 'tokenizer'))
log.info(f"Tokenizer files: {tokenizer_files}")
# Check if there are duplicate tokenizer files in the model directory and remove them.
try:
for tokenizer_file_name in tokenizer_files:
dupe_file = os.path.isfile(os.path.join(kwargs['path'], 'model', tokenizer_file_name))
if dupe_file:
log.info(f"Removing duplicate tokenizer file: {tokenizer_file_name}")
log.debug(f"Removing duplicate tokenizer file: {tokenizer_file_name}")
os.remove(os.path.join(kwargs['path'], 'model', tokenizer_file_name))
except Exception as e:
log.error(f"Exception when removing duplicate tokenizer files in the model directory", e)
Expand Down Expand Up @@ -403,7 +395,6 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None:
if self._any_register_processes_error(
state.device,
) and self.final_register_only:
time.sleep(60) # give me some debugging time
log.error(
'An error occurred in one or more registration processes. Fallback to saving the HuggingFace checkpoint.',
)
Expand Down Expand Up @@ -682,7 +673,6 @@ def tensor_hook(

log.debug('Saving Hugging Face checkpoint to disk')

log.debug(f"UPLOAD_TO_SAVE_FOLDER: {upload_to_save_folder}")
# This context manager casts the TE extra state in io.BytesIO format to tensor format
# Needed for proper hf ckpt saving.
context_manager = te.onnx_export(
Expand Down Expand Up @@ -784,7 +774,6 @@ def tensor_hook(

# Spawn a new process to register the model.
# Slower method to register the model via log_model.
log.info(f'USING MY BRANCH!!!!!!!!!!!!!! REGISTERED MODEL NAME: {self.mlflow_registered_model_name}')
process = SpawnProcess(
target=_log_model_multiprocess,
kwargs={
Expand All @@ -801,8 +790,6 @@ def tensor_hook(
'log_model_metadata': self.mlflow_logging_config['metadata'],
'registered_model_name':
self.mlflow_registered_model_name,
# 'model_name':
# self.pretrained_model_name,
'input_example':
self.mlflow_logging_config['input_example'],
'await_creation_for':
Expand Down

0 comments on commit c5ae4ff

Please sign in to comment.