
Eval submission fixes for 70b / other models #404

Merged 8 commits on Oct 25, 2024
Changes from 4 commits
5 changes: 2 additions & 3 deletions scripts/README.md
@@ -67,12 +67,11 @@ python scripts/submit_eval_jobs.py --model_name llama_31_tulu_2_8b --location 01
python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location allenai/llama-3-tulu-2-8b --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals
python scripts/submit_eval_jobs.py --model_name hf-llama_31_tulu_2_8b --location vwxyzjn/online_dpo_tulu_2 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals


python scripts/submit_eval_jobs.py --model_name hf-online-dpo-llama-tulu2-longer --beaker_image costah/open_instruct_test --location vwxyzjn/online_dpo_vllm__allenai_llama-3-tulu-2-8b --hf_revision online_dpo_vllm__1__1724038538 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --upload_to_hf allenai/tulu-3-evals


https://huggingface.co/vwxyzjn/online_dpo_vllm__allenai_llama-3-tulu-2-8b/tree/online_dpo_vllm__1__1724038538
```
Note that when using `oe-eval`, we normally pass `--skip_oi_evals`, `--run_safety_evaluations`, and `--run_oe_eval_experiments`.
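For instance, this invocation from `scripts/eval/tulu3_baselines.sh` (added in this PR) combines all three flags:

```bash
python scripts/submit_eval_jobs.py \
    --model_name hf-qwen2_5_7b_instruct \
    --location Qwen/Qwen2.5-7B-Instruct \
    --is_tuned \
    --workspace tulu-3-results \
    --preemptible \
    --use_hf_tokenizer_template \
    --beaker_image nathanl/open_instruct_auto \
    --upload_to_hf allenai/tulu-3-evals \
    --run_oe_eval_experiments \
    --run_safety_evaluations \
    --skip_oi_evals
```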

2. `submit_finetune_jobs.py`: **Core script** for submitting multiple, configurable instruction-tuning jobs. It works for both single- and multi-node configurations. By default it reads configs from `configs/train_configs`, but it can also take CLI arguments matching those in the `FlatArguments` class in `open_instruct/utils.py`.
An example of running it is in `scripts/submit_finetune_jobs.sh`.
```
14 changes: 10 additions & 4 deletions scripts/eval/oe-eval.sh
@@ -36,7 +36,7 @@ set -ex

# Function to print usage
usage() {
    echo "Usage: $0 --model-name MODEL_NAME --model-location MODEL_LOCATION [--hf-upload] [--revision REVISION] [--max-length <max_length>]"
    echo "Usage: $0 --model-name MODEL_NAME --model-location MODEL_LOCATION [--num_gpus GPUS] [--hf-upload] [--revision REVISION] [--max-length <max_length>]"
    exit 1
}
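A hypothetical invocation with the new flag (model name and Beaker location are placeholders, not real artifacts):

```bash
bash scripts/eval/oe-eval.sh \
    --model-name tulu-3-70b-test \
    --model-location beaker://my-org/my-model-dataset \
    --num_gpus 4 \
    --max-length 4096
```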

Expand All @@ -45,13 +45,18 @@ while [[ "$#" -gt 0 ]]; do
    case $1 in
        --model-name) MODEL_NAME="$2"; shift ;;
        --model-location) MODEL_LOCATION="$2"; shift ;;
        --num_gpus) NUM_GPUS="$2"; shift ;;
        --hf-upload) HF_UPLOAD="true" ;;
        --revision) REVISION="$2"; shift ;;
        --max-length) MAX_LENGTH="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# Default to 1 GPU if --num_gpus is not specified
NUM_GPUS="${NUM_GPUS:-1}"
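The `${NUM_GPUS:-1}` form is plain Bash parameter expansion: it yields the variable's value when set, and the literal default otherwise. A standalone sketch:

```bash
unset NUM_GPUS
echo "${NUM_GPUS:-1}"   # prints 1 (fallback)
NUM_GPUS=4
echo "${NUM_GPUS:-1}"   # prints 4 (set value wins)
```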

# Check required arguments
if [[ -z "$MODEL_NAME" || -z "$MODEL_LOCATION" ]]; then
    echo "Error: --model-name and --model-location are required."
@@ -95,8 +100,9 @@ TASKS=(
MODEL_TYPE="--model-type vllm"
BATCH_SIZE_VLLM=10000
BATCH_SIZE_OTHER=1
GPU_COUNT=1
GPU_COUNT_OTHER=2
# Set GPU_COUNT and GPU_COUNT_OTHER based on NUM_GPUS
GPU_COUNT="$NUM_GPUS"
GPU_COUNT_OTHER=$((NUM_GPUS * 2))
MODEL_TYPE_OTHER=""

for TASK in "${TASKS[@]}"; do
@@ -108,7 +114,7 @@ for TASK in "${TASKS[@]}"; do
    else
        BATCH_SIZE=$BATCH_SIZE_VLLM
        MODEL_TYPE="--model-type vllm"
        GPU_COUNT=$NUM_GPUS   # reset to the vLLM GPU count; GPU_COUNT=$GPU_COUNT would be a no-op and could carry over a doubled value from a previous task
    fi

    python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-results" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key"}' ${REVISION_ARG}
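The escaped JSON in `--model-args` is the fiddly part of this command; with illustrative values, the interpolation expands as follows:

```bash
MODEL_LOCATION="/path/to/model"   # illustrative
MAX_LENGTH=4096
echo "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}"
# -> {"model_path":"/path/to/model", "max_length": 4096}
```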
11 changes: 11 additions & 0 deletions scripts/eval/tulu3_baselines.sh
@@ -0,0 +1,11 @@
python scripts/submit_eval_jobs.py --model_name hf-NousResearch-Hermes-3-Llama-3.1-8B --location NousResearch/Hermes-3-Llama-3.1-8B --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-NousResearch-Hermes-3-Llama-3.1-70B --location NousResearch/Hermes-3-Llama-3.1-70B --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-llama_3_1_nemotron_70B_instruct_hf --location nvidia/Llama-3.1-Nemotron-70B-Instruct-HF --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-athene_70b --location Nexusflow/Athene-70B --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-llama_3_1_supernova_lite --location arcee-ai/Llama-3.1-SuperNova-Lite --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-gemma_2_9b_it_simpo --location princeton-nlp/gemma-2-9b-it-SimPO --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-magpielm_8b_chat_v0_1 --location Magpie-Align/MagpieLM-8B-Chat-v0.1 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-qwen2_5_72b_instruct --location Qwen/Qwen2.5-72B-Instruct --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-qwen2_5_7b_instruct --location Qwen/Qwen2.5-7B-Instruct --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-ministral_8b_instruct_2410 --location mistralai/Ministral-8B-Instruct-2410 --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals
python scripts/submit_eval_jobs.py --model_name hf-llama_3_tulu_2_dpo_70b --location allenai/llama-3-tulu-2-dpo-70b --is_tuned --workspace tulu-3-results --preemptible --use_hf_tokenizer_template --beaker_image nathanl/open_instruct_auto --upload_to_hf allenai/tulu-3-evals --run_oe_eval_experiments --run_safety_evaluations --skip_oi_evals --oe_eval_max_length 2048
17 changes: 15 additions & 2 deletions scripts/submit_eval_jobs.py
@@ -15,7 +15,7 @@
def adjust_batch_size(task_spec, model_name, batch_size_reduction):
    "Adjust batch size using heuristics that are good for A100-size GPUs."
    reduce_by_2 = ["13B"]
    reduce_by_4 = ["30B", "34B", "40B", "65B", "70B"]
    reduce_by_4 = ["30B", "34B", "40B", "65B", "70B", "70b", "72B"]
    # If not given, choose a value based on the model name.
    if batch_size_reduction is None:
        if any([pattern in model_name for pattern in reduce_by_2]):
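The pattern match is a case-sensitive substring test, which is why lowercase `"70b"` is added alongside `"70B"`. A quick check, using a model name from `tulu3_baselines.sh`:

```python
reduce_by_4 = ["30B", "34B", "40B", "65B", "70B", "70b", "72B"]

model_name = "hf-llama_3_tulu_2_dpo_70b"
assert not any(p in model_name for p in ["70B"])   # uppercase alone misses this name
assert any(p in model_name for p in reduce_by_4)   # lowercase "70b" catches it
```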
@@ -37,7 +37,7 @@ def adjust_batch_size(task_spec, model_name, batch_size_reduction):
def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
    "Adjust GPU count using heuristics that are good for A100-size GPUs."
    medium = ["30B", "34B"]
    large = ["40B", "65B", "70B"]
    large = ["40B", "65B", "70B", "70b", "72B"]
    # If not given, choose a value based on model name.
    if gpu_multiplier is None:
        if any([pattern in model_name for pattern in medium]):
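For intuition, a minimal sketch of how such a tiering heuristic resolves to a GPU multiplier; the concrete multiplier values below are assumptions for illustration, since the real ones live in the elided body of `adjust_gpus`:

```python
def pick_gpu_multiplier(model_name: str) -> int:
    """Assumed tiering: bigger models get proportionally more GPUs per task."""
    medium = ["30B", "34B"]
    large = ["40B", "65B", "70B", "70b", "72B"]
    if any(pattern in model_name for pattern in medium):
        return 2  # assumed value
    if any(pattern in model_name for pattern in large):
        return 4  # assumed value
    return 1

print(pick_gpu_multiplier("Qwen/Qwen2.5-72B-Instruct"))  # -> 4
```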
@@ -101,6 +101,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.")
parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.")
parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.")
parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.")
args = parser.parse_args()


@@ -585,6 +586,18 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
oe_eval_cmd += f" --model-location beaker://{model_info[1]}"
if args.hf_revision:
    oe_eval_cmd += f" --revision {args.hf_revision}"
# Append the number of GPUs to the command.
num_gpus = task_spec['resources']['gpuCount']
# If num_gpus > 1, double it again for the oe-eval configs: the open_instruct
# GPU adjustment wasn't quite enough on its own. Doubling here, rather than in
# adjust_gpus, leaves the GPU configs of the open-instruct evals unaffected.
# Tested reasonably extensively with 70B models.
if num_gpus > 1:
    num_gpus *= 2
oe_eval_cmd += f" --num_gpus {num_gpus}"
if args.oe_eval_max_length:
    oe_eval_cmd += f" --max-length {args.oe_eval_max_length}"
print(f"Running OE eval with command: {oe_eval_cmd}")
subprocess.Popen(oe_eval_cmd, shell=True)
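A worked trace of the doubling (the 4-GPU starting point is illustrative; it would come from `adjust_gpus` for a 70B-class model):

```python
task_spec = {"resources": {"gpuCount": 4}}  # illustrative 70B-sized allocation

num_gpus = task_spec["resources"]["gpuCount"]
if num_gpus > 1:
    num_gpus *= 2  # 4 -> 8: oe-eval gets twice the open-instruct allocation

oe_eval_cmd = "python oe-eval-internal/oe_eval/launch.py ..."  # truncated for the sketch
oe_eval_cmd += f" --num_gpus {num_gpus}"
print(oe_eval_cmd)  # ... --num_gpus 8
```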

# create an experiment that runs the safety eval tasks