Commit 80b5cf1: add docs

natolambert committed Oct 24, 2024
1 parent b37cf0c
Showing 3 changed files with 21 additions and 4 deletions.
5 changes: 2 additions & 3 deletions scripts/README.md
@@ -67,12 +67,11 @@ python scripts/submit_eval_jobs.py --model_name llama_31_tulu_2_8b --location 01
python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location allenai/llama-3-tulu-2-8b --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals
python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location vwxyzjn/online_dpo_tulu_2 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals


python scripts/submit_eval_jobs.py --model_name hf-online-dpo-llama-tulu2-longer --beaker_image costah/open_instruct_test --location vwxyzjn/online_dpo_vllm__allenai_llama-3-tulu-2-8b --hf_revision online_dpo_vllm__1__1724038538 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --upload_to_hf allenai/tulu-3-evals


https://huggingface.co/vwxyzjn/online_dpo_vllm__allenai_llama-3-tulu-2-8b/tree/online_dpo_vllm__1__1724038538
```
Here, note that when using `oe-eval`, we normally pass `--skip_oi_evals`, `--run_safety_evaluations`, and `--run_oe_eval_experiments`.
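Concretely, a typical `oe-eval` submission combines those three flags with the usual model arguments. This is a sketch reusing a model already shown above, not a canonical invocation:

```shell
# Skip the legacy open-instruct evals, and opt in to the oe-eval
# and safety suites instead.
python scripts/submit_eval_jobs.py \
    --model_name hf-llama_31_tulu_2_8b \
    --location allenai/llama-3-tulu-2-8b \
    --workspace tulu-3-results \
    --preemptible \
    --use_hf_tokenizer_template \
    --beaker_image nathanl/open_instruct_auto \
    --upload_to_hf allenai/tulu-3-evals \
    --skip_oi_evals \
    --run_safety_evaluations \
    --run_oe_eval_experiments
```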

2. `submit_finetune_jobs.py`: **Core script** for submitting multiple, configurable instruction tuning jobs. The script works for both single- and multi-node configurations. By default it reads configs from `configs/train_configs`, but it can also take CLI arguments matching those in the `FlatArguments` class in `open_instruct/utils.py`.
An example of running this is in `scripts/submit_finetune_jobs.sh`.
```
11 changes: 11 additions & 0 deletions scripts/eval/tulu3_baselines.sh
@@ -0,0 +1,11 @@
python scripts/submit_eval_jobs.py --model_name hf-NousResearch-Hermes-3-Llama-3.1-8B --location NousResearch/Hermes-3-Llama-3.1-8B --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-NousResearch-Hermes-3-Llama-3.1-70B --location NousResearch/Hermes-3-Llama-3.1-70B --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-llama_3_1_nemotron_70B_instruct_hf --location nvidia/Llama-3.1-Nemotron-70B-Instruct-HF --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-athene_70b --location Nexusflow/Athene-70B --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-llama_3_1_supernova_lite --location arcee-ai/Llama-3.1-SuperNova-Lite --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-gemma_2_9b_it_simpo --location princeton-nlp/gemma-2-9b-it-SimPO --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-magpielm_8b_chat_v0_1 --location Magpie-Align/MagpieLM-8B-Chat-v0.1 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-qwen2_5_72b_instruct --location Qwen/Qwen2.5-72B-Instruct --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-qwen2_5_7b_instruct --location Qwen/Qwen2.5-7B-Instruct --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-ministral_8b_instruct_2410 --location mistralai/Ministral-8B-Instruct-2410 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-llama_3_tulu_2_dpo_70b --location allenai/llama-3-tulu-2-dpo-70b --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals --oe_eval_max_length 2048
9 changes: 8 additions & 1 deletion scripts/submit_eval_jobs.py
@@ -587,7 +587,14 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
     if args.hf_revision:
         oe_eval_cmd += f" --revision {args.hf_revision}"
     # add string with number of gpus
-    oe_eval_cmd += f" --num_gpus {task_spec['resources']['gpuCount']}"
+    num_gpus = task_spec['resources']['gpuCount']
+    # if num_gpus > 1, double it again for oe-eval configs
+    # the open_instruct GPU adjustment wasn't quite enough
+    # adjusted here so the GPU configs in open-instruct eval are not impacted by the change
+    # tested reasonably extensively with 70B models
+    if num_gpus > 1:
+        num_gpus *= 2
+    oe_eval_cmd += f" --num_gpus {num_gpus}"
     if args.oe_eval_max_length:
         oe_eval_cmd += f" --max-length {args.oe_eval_max_length}"
     print(f"Running OE eval with command: {oe_eval_cmd}")
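The GPU-doubling change above can be isolated as a small helper. This is a minimal sketch, assuming nothing beyond the diff itself; the function names and standalone framing are illustrative, not part of the repo:

```python
def adjust_oe_eval_gpus(gpu_count):
    """Return the GPU count to pass to oe-eval.

    Multi-GPU requests are doubled because the open-instruct GPU
    adjustment alone was not enough for oe-eval workloads; single-GPU
    jobs are left untouched so small configs are unaffected.
    """
    if gpu_count > 1:
        return gpu_count * 2
    return gpu_count


def build_oe_eval_cmd(base_cmd, gpu_count, max_length=None):
    """Append optional flags to the oe-eval command only when set,
    mirroring the pattern in submit_eval_jobs.py."""
    cmd = base_cmd + f" --num_gpus {adjust_oe_eval_gpus(gpu_count)}"
    if max_length is not None:
        cmd += f" --max-length {max_length}"
    return cmd
```

For example, `build_oe_eval_cmd("oe-eval", 2, 2048)` yields `"oe-eval --num_gpus 4 --max-length 2048"`, while a single-GPU job keeps its count unchanged.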
