Auto eval actually works #310

Merged: 15 commits, Aug 30, 2024
7 changes: 7 additions & 0 deletions .github/workflows/push-image.yml
@@ -31,6 +31,7 @@ on:
# - '!open_instruct/README.md'
# - 'requirements.txt'
# - 'Dockerfile'
# - '.github/workflows/push-image.yml'
workflow_dispatch: # This allows us to manually trigger a build through the GitHub UI.

env:
@@ -45,6 +46,12 @@ jobs:
steps:
- uses: actions/checkout@v3

- uses: actions/checkout@v3
with:
repository: allenai/oe-eval-internal
path: './oe-eval-internal'
ssh-key: ${{ secrets.OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY }}

- name: Setup environment
uses: ./.github/actions/setup
with:
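The new checkout step clones the private allenai/oe-eval-internal repository into the build context, authenticating with an SSH deploy key stored in the OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY secret. A rough local equivalent, sketched with a hypothetical key path in place of the CI secret:

```python
# Sketch only: what the extra actions/checkout step does, expressed as a plain
# git clone. ~/.ssh/oe_eval_deploy is a hypothetical stand-in for the CI secret.
import os
import subprocess

env = {**os.environ, "GIT_SSH_COMMAND": "ssh -i ~/.ssh/oe_eval_deploy -o IdentitiesOnly=yes"}
subprocess.run(
    ["git", "clone", "git@github.com:allenai/oe-eval-internal.git", "oe-eval-internal"],
    env=env,
    check=True,
)
```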
3 changes: 2 additions & 1 deletion Dockerfile
@@ -95,15 +95,16 @@ RUN pip install -r requirements.txt
# NLTK download
RUN python -m nltk.downloader punkt
COPY open_instruct open_instruct
COPY oe-eval-internal oe-eval-internal

# install the package in editable mode
COPY pyproject.toml .
RUN pip install -e .
COPY .git/ ./.git/

COPY eval eval
COPY configs configs
COPY scripts scripts
COPY mason.py mason.py
RUN chmod +x scripts/*

# for interactive session
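With the workflow change above, the build context now contains the oe-eval-internal checkout, which the Dockerfile copies into the image alongside the repository's .git directory. Copying .git presumably lets tooling inside the container identify the source commit; a minimal sketch of reading it at runtime:

```python
# Sketch: recover the source commit inside the container, assuming the working
# directory contains the .git/ that the Dockerfile change copies in.
import subprocess

commit = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
print(f"image built from commit {commit}")
```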
49 changes: 32 additions & 17 deletions mason.py
@@ -103,17 +103,29 @@ def parse_commands(command_args: List[str]) -> List[List[str]]:
return commands


def get_env_vars(pure_docker_mode, no_mount_hf_cache):
env_vars = [
beaker.EnvVar(
name="HF_TOKEN",
secret="HF_TOKEN",
),
beaker.EnvVar(
name="WANDB_API_KEY",
secret="WANDB_API_KEY",
),
def get_env_vars(pure_docker_mode, no_mount_hf_cache, beaker_secrets, whoami):
env_vars = []
useful_secrets = [
"HF_TOKEN",
"WANDB_API_KEY",
"BEAKER_TOKEN",
]
for useful_secret in useful_secrets:
if f"{whoami}_{useful_secret}" in beaker_secrets:
env_vars.append(
beaker.EnvVar(
name=useful_secret,
secret=f"{whoami}_{useful_secret}",
)
)
elif useful_secret in beaker_secrets:
env_vars.append(
beaker.EnvVar(
name=useful_secret,
secret=useful_secret,
)
)

# use the user's PATH; including the conda / python PATH
if not pure_docker_mode:
env_vars.extend([
@@ -161,7 +173,7 @@ def get_datasets(beaker_datasets, no_mount_nfs):
return res


def make_task_spec(args, command, i):
def make_task_spec(args, command, i, beaker_secrets, whoami):
full_command = command
command = ['/bin/bash', '-c']
setup_commands = (
@@ -184,7 +196,7 @@ def make_task_spec(args, command, i):
context=beaker.TaskContext(priority=beaker.Priority(args.priority),
preemptible=args.preemptible),
constraints=beaker.Constraints(cluster=args.cluster),
env_vars=get_env_vars(args.pure_docker_mode, args.no_hf_cache_env),
env_vars=get_env_vars(args.pure_docker_mode, args.no_hf_cache_env, beaker_secrets, whoami),
resources=beaker.TaskResources(gpu_count=args.gpus),
)

@@ -193,16 +205,19 @@ def make_task_spec(args, command, i):

def main():
args, commands = get_args()
experiment_spec = beaker.ExperimentSpec(
description=args.description,
tasks=[make_task_spec(args, command, i) for i, command in enumerate(commands)],
budget=args.budget,
)
if args.workspace:
beaker_client = beaker.Beaker.from_env(default_workspace=args.workspace)
else:
beaker_client = beaker.Beaker.from_env()

beaker_secrets = [secret.name for secret in beaker_client.workspace.secrets()]
whoami = beaker_client.account.whoami().name
experiment_spec = beaker.ExperimentSpec(
description=args.description,
tasks=[make_task_spec(args, command, i, beaker_secrets, whoami) for i, command in enumerate(commands)],
budget=args.budget,
)

exp = beaker_client.experiment.create(spec=experiment_spec)
print(f"Kicked off Beaker job. https://beaker.org/ex/{exp.id}")

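get_env_vars no longer hard-codes which secrets to bind: it checks the workspace's secret list and prefers a user-prefixed copy (e.g. <username>_HF_TOKEN) over a workspace-wide one, silently skipping secrets that exist under neither name. make_task_spec and main() thread the secret list and username through to it. A minimal sketch of the lookup, reusing the same beaker-py calls that main() now makes (the workspace name is illustrative):

```python
# Sketch of the resolution rule from get_env_vars, using the beaker-py calls
# shown in main(). The workspace name below is illustrative.
import beaker

client = beaker.Beaker.from_env(default_workspace="ai2/tulu-2-improvements")
whoami = client.account.whoami().name
secret_names = [secret.name for secret in client.workspace.secrets()]

def resolve_secret(name: str) -> str | None:
    """Prefer the user-scoped secret; fall back to the shared one."""
    if f"{whoami}_{name}" in secret_names:
        return f"{whoami}_{name}"
    if name in secret_names:
        return name
    return None  # skipped entirely, matching the loop in get_env_vars

for name in ("HF_TOKEN", "WANDB_API_KEY", "BEAKER_TOKEN"):
    print(name, "->", resolve_secret(name))
```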
94 changes: 55 additions & 39 deletions open_instruct/dpo_tune.py
@@ -21,6 +21,7 @@
import math
import os
import random
import subprocess
import time
from copy import deepcopy
from dataclasses import dataclass, field
@@ -71,7 +72,6 @@
maybe_get_beaker_config,
maybe_use_ai2_hf_entity,
maybe_use_ai2_wandb_entity,
submit_beaker_eval_jobs,
upload_metadata_to_hf,
)

@@ -853,6 +853,8 @@ def load_model():

# We need to initialize the trackers we use, and also store our configuration.
# The trackers initialize automatically on the main process.
if is_beaker_job():
beaker_config = maybe_get_beaker_config()
if args.with_tracking:
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
@@ -861,8 +863,6 @@ def load_model():
# (Optional) Ai2 internal tracking
if args.wandb_entity is None:
args.wandb_entity = maybe_use_ai2_wandb_entity()
if is_beaker_job():
beaker_config = maybe_get_beaker_config()
experiment_config.update(vars(beaker_config))
accelerator.init_trackers(
"open_instruct_internal",
@@ -1012,7 +1012,7 @@ def load_model():
os.path.join(get_last_checkpoint_path(args, incomplete=True), "COMPLETED"), "w"
) as f:
f.write("COMPLETED") # annoyingly, empty files arent uploaded by beaker.
if accelerator.is_main_process:
if accelerator.is_local_main_process:
clean_last_n_checkpoints(args.output_dir, args.keep_last_n_checkpoints)
accelerator.wait_for_everyone()

Expand All @@ -1027,7 +1027,7 @@ def load_model():
# use this to mark the checkpoint as completely saved, to avoid restoring from garbled checkpoints
with open(os.path.join(get_last_checkpoint_path(args, incomplete=True), "COMPLETED"), "w") as f:
f.write("COMPLETED") # annoyingly, empty files arent uploaded by beaker.
if accelerator.is_main_process:
if accelerator.is_local_main_process:
clean_last_n_checkpoints(args.output_dir, args.keep_last_n_checkpoints)
accelerator.wait_for_everyone()

@@ -1041,49 +1041,65 @@ def load_model():
)

# remove all checkpoints to save space
if accelerator.is_main_process:
if accelerator.is_local_main_process:
clean_last_n_checkpoints(args.output_dir, keep_last_n_checkpoints=0)

if is_beaker_job() and accelerator.is_main_process:
if args.hf_metadata_dataset:
# dpo script only supports these two options right now for datasets
if args.dataset_mixer:
dataset_list = args.dataset_mixer.keys()
elif args.dataset_mixer_list:
dataset_list = args.dataset_mixer_list[::2] # even indices
elif args.dataset_name:
dataset_list = [args.dataset_name]
else:
dataset_list = [args.train_file]
# mainly just focussing here on what would be useful for the leaderboard.
# wandb will have even more useful information.
metadata_blob = {
"model_name": args.exp_name,
"model_type": "sft",
"datasets": dataset_list,
"base_model": args.model_name_or_path,
"wandb_path": wandb_tracker.run.get_url(),
"beaker_experiment": beaker_config.beaker_experiment_url,
"beaker_datasets": beaker_config.beaker_dataset_id_urls,
}
upload_metadata_to_hf(
metadata_blob,
"metadata.json",
args.hf_metadata_dataset,
"results/" + args.hf_repo_revision, # to match what the auto-evals name as.
)

if args.try_launch_beaker_eval_jobs:
command = f"""\
python mason.py \
--cluster ai2/allennlp-cirrascale ai2/general-cirrascale-a5000 ai2/general-cirrascale-a5000 ai2/s2-cirrascale ai2/general-cirrascale \
--priority low \
--preemptible \
--budget ai2/allennlp \
--workspace ai2/tulu-2-improvements \
--image nathanl/open_instruct_auto \
--pure_docker_mode \
--gpus 0 -- python scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py \
--beaker_workload_id {beaker_config.beaker_workload_id} \
--model_name {args.hf_repo_revision}
"""
process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
print(f"Submit jobs after model training is finished - Stdout:\n{stdout.decode()}")
print(f"Submit jobs after model training is finished - Stderr:\n{stderr.decode()}")
print(f"Submit jobs after model training is finished - process return code: {process.returncode}")

if args.push_to_hub:
push_folder_to_hub(
accelerator,
args.output_dir,
args.hf_repo_id,
args.hf_repo_revision,
)
if accelerator.is_main_process and is_beaker_job() and args.try_launch_beaker_eval_jobs:
submit_beaker_eval_jobs(
model_name=f"hf-{args.hf_repo_revision}",
location=args.hf_repo_id,
hf_repo_revision=args.hf_repo_revision,
)
if args.hf_metadata_dataset and accelerator.is_main_process and is_beaker_job():
if args.dataset_mixer:
dataset_list = args.dataset_mixer.keys()
elif args.dataset_mixer_list:
dataset_list = args.dataset_mixer_list[::2] # even indices
elif args.dataset_name:
dataset_list = [args.dataset_name]
else:
dataset_list = [args.train_file]
# mainly just focussing here on what would be useful for the leaderboard.
# wandb will have even more useful information.
metadata_blob = {
"model_name": args.exp_name,
"model_type": "sft",
"datasets": dataset_list,
"base_model": args.model_name_or_path,
"wandb_path": wandb_tracker.run.get_url(),
"beaker_experiment": beaker_config.beaker_experiment_url,
"beaker_datasets": beaker_config.beaker_dataset_id_urls,
}
upload_metadata_to_hf(
metadata_blob,
"metadata.json",
args.hf_metadata_dataset,
"results/" + args.hf_repo_revision, # to match what the auto-evals name as.
)

accelerator.wait_for_everyone()
if args.with_tracking:
accelerator.end_training()
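Two details in the dpo_tune.py change are worth spelling out. First, the eval launch now shells out to mason.py from inside the training job, submitting a zero-GPU Beaker job that runs wait_beaker_dataset_model_upload_then_evaluate_model.py, so evaluation waits for the model upload instead of going through the removed submit_beaker_eval_jobs helper. Second, dataset_list = args.dataset_mixer_list[::2] relies on the flag being a flat alternating list; assuming a [name, weight, name, weight, ...] layout (inferred from the "even indices" comment), the slice recovers the dataset names:

```python
# Sketch: why [::2] yields dataset names, assuming an alternating name/weight
# layout. The dataset names below are illustrative only.
dataset_mixer_list = ["allenai/dataset-a", "0.5", "allenai/dataset-b", "1.0"]

names = dataset_mixer_list[::2]     # ['allenai/dataset-a', 'allenai/dataset-b']
weights = dataset_mixer_list[1::2]  # ['0.5', '1.0']
print(names, weights)
```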