allenai · liujch1998 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,8 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`.
 - Added support for flash attention and gradient checkpointing to `hf_olmo`.
+- Added an eval-only script that evaluates existing checkpoints on specified tasks.
 - Added `effective_n_kv_heads` to OLMoConfig for hacky VLLM support.
 
+
 ## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26
 
 - Fixed conversion to HuggingFace model for DDP-trained models.
@@ -45,7 +47,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Swapped in correct flan data mix.
 - Fix bug where the attention norm, when applied before the attention block, was modifying the residual stream.
 - Fixed `OLMo.from_checkpoint()` so that it correctly loads `olmo_core` and `torch_new` style checkpoints.
-- Fixed `preserve_rng_state` being incorrectly set to False when doing gradient checkpointing with dropout 
+- Fixed `preserve_rng_state` being incorrectly set to False when doing gradient checkpointing with dropout.
 
 
 ## [v0.4.0](https://github.com/allenai/OLMo/releases/tag/v0.4.0) - 2024-07-11

diff --git a/configs/peteish1-weka.yaml b/configs/peteish1-weka.yaml
@@ -108,35 +108,35 @@ eval_interval: 1000
 eval_subset_num_batches: -1
 device_eval_batch_size: ${device_train_microbatch_size}
 evaluators:
-  # - label: all-small-ppl-validation
-  #   data:
-  #     num_workers: 0
-  #     drop_last: true
-  #     # generate_doc_lengths: true
-  #     memmap_dtype: uint32
-  #     datasets:
-  #       c4_en-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
-  #       dolma_books-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
-  #       dolma_common-crawl-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
-  #       dolma_pes2o-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
-  #       dolma_reddit-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
-  #       dolma_stack-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
-  #       dolma_wiki-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
-  #       ice-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
-  #       m2d2_s2orc-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
-  #       pile-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
-  #       wikitext_103-validation:
-  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy
+  - label: all-small-ppl-validation
+    data:
+      num_workers: 0
+      drop_last: true
+      # generate_doc_lengths: true
+      memmap_dtype: uint32
+      datasets:
+        c4_en-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
+        dolma_books-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
+        dolma_common-crawl-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
+        dolma_pes2o-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
+        dolma_reddit-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
+        dolma_stack-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
+        dolma_wiki-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
+        ice-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
+        m2d2_s2orc-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
+        pile-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
+        wikitext_103-validation:
+          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy
 
   ##########################
   # Downstream evaluations #
@@ -155,7 +155,7 @@ evaluators:
 
   - label: boolq
     type: downstream
- 
+
   - label: sciq
     type: downstream
 
@@ -231,6 +231,228 @@ evaluators:
   - label: arc_easy_ppl
     type: downstream
 
+  - label: piqa_rc_0shot
+    type: downstream
+
+  - label: piqa_rc_0shot_bpb
+    type: downstream
+
+  - label: piqa_rc_5shot
+    type: downstream
+
+  - label: piqa_rc_5shot_bpb
+    type: downstream
+
+  - label: piqa_mc_5shot
+    type: downstream
+
+  - label: piqa_mc_5shot_bpb
+    type: downstream
+
+  - label: hellaswag_rc_0shot
+    type: downstream
+
+  - label: hellaswag_rc_0shot_bpb
+    type: downstream
+
+  - label: hellaswag_rc_5shot
+    type: downstream
+
+  - label: hellaswag_rc_5shot_bpb
+    type: downstream
+
+  - label: hellaswag_mc_5shot
+    type: downstream
+
+  - label: hellaswag_mc_5shot_bpb
+    type: downstream
+
+  - label: winogrande_rc_0shot
+    type: downstream
+
+  - label: winogrande_rc_0shot_bpb
+    type: downstream
+
+  - label: winogrande_rc_5shot
+    type: downstream
+
+  - label: winogrande_rc_5shot_bpb
+    type: downstream
+
+  - label: winogrande_mc_5shot
+    type: downstream
+
+  - label: winogrande_mc_5shot_bpb
+    type: downstream
+
+  - label: openbookqa_rc_0shot
+    type: downstream
+
+  - label: openbookqa_rc_0shot_bpb
+    type: downstream
+
+  - label: openbookqa_rc_5shot
+    type: downstream
+
+  - label: openbookqa_rc_5shot_bpb
+    type: downstream
+
+  - label: openbookqa_mc_5shot
+    type: downstream
+
+  - label: openbookqa_mc_5shot_bpb
+    type: downstream
+
+  - label: boolq_rc_0shot
+    type: downstream
+
+  - label: boolq_rc_0shot_bpb
+    type: downstream
+
+  - label: boolq_rc_5shot
+    type: downstream
+
+  - label: boolq_rc_5shot_bpb
+    type: downstream
+
+  - label: boolq_mc_5shot
+    type: downstream
+
+  - label: boolq_mc_5shot_bpb
+    type: downstream
+
+  - label: sciq_rc_0shot
+    type: downstream
+
+  - label: sciq_rc_0shot_bpb
+    type: downstream
+
+  # - label: sciq_rc_5shot
+  #   type: downstream
+
+  # - label: sciq_rc_5shot_bpb
+  #   type: downstream
+
+  # - label: sciq_mc_5shot
+  #   type: downstream
+
+  # - label: sciq_mc_5shot_bpb
+  #   type: downstream
+
+  - label: arc_easy_rc_0shot
+    type: downstream
+
+  - label: arc_easy_rc_0shot_bpb
+    type: downstream
+
+  - label: arc_easy_rc_5shot
+    type: downstream
+
+  - label: arc_easy_rc_5shot_bpb
+    type: downstream
+
+  - label: arc_easy_mc_5shot
+    type: downstream
+
+  - label: arc_easy_mc_5shot_bpb
+    type: downstream
+
+  - label: arc_challenge_rc_0shot
+    type: downstream
+
+  - label: arc_challenge_rc_0shot_bpb
+    type: downstream
+
+  - label: arc_challenge_rc_5shot
+    type: downstream
+
+  - label: arc_challenge_rc_5shot_bpb
+    type: downstream
+
+  - label: arc_challenge_mc_5shot
+    type: downstream
+
+  - label: arc_challenge_mc_5shot_bpb
+    type: downstream
+
+  - label: copa_rc_0shot
+    type: downstream
+
+  - label: copa_rc_0shot_bpb
+    type: downstream
+
+  # - label: copa_rc_5shot
+  #   type: downstream
+
+  # - label: copa_rc_5shot_bpb
+  #   type: downstream
+
+  # - label: copa_mc_5shot
+  #   type: downstream
+
+  # - label: copa_mc_5shot_bpb
+  #   type: downstream
+
+  - label: csqa_rc_0shot
+    type: downstream
+
+  - label: csqa_rc_0shot_bpb
+    type: downstream
+
+  - label: csqa_rc_5shot
+    type: downstream
+
+  - label: csqa_rc_5shot_bpb
+    type: downstream
+
+  - label: csqa_mc_5shot
+    type: downstream
+
+  - label: csqa_mc_5shot_bpb
+    type: downstream
+
+  - label: socialiqa_rc_0shot
+    type: downstream
+
+  - label: socialiqa_rc_0shot_bpb
+    type: downstream
+
+  - label: socialiqa_rc_5shot
+    type: downstream
+
+  - label: socialiqa_rc_5shot_bpb
+    type: downstream
+
+  - label: socialiqa_mc_5shot
+    type: downstream
+
+  - label: socialiqa_mc_5shot_bpb
+    type: downstream
+
+  - label: mmlu_stem_var_bpb
+    type: downstream
+
+  - label: mmlu_humanities_var_bpb
+    type: downstream
+
+  - label: mmlu_social_sciences_var_bpb
+    type: downstream
+
+  - label: mmlu_other_var_bpb
+    type: downstream
+
+  - label: mmlu_stem_bpb
+    type: downstream
+
+  - label: mmlu_humanities_bpb
+    type: downstream
+
+  - label: mmlu_social_sciences_bpb
+    type: downstream
+
+  - label: mmlu_other_bpb
+    type: downstream
+
 data:
   pad_direction: right
   # generate_doc_lengths: true

diff --git a/configs/peteish7-weka.yaml b/configs/peteish7-weka.yaml
@@ -154,7 +154,7 @@ evaluators:
 
   - label: boolq
     type: downstream
- 
+
   - label: sciq
     type: downstream
 

diff --git a/olmo/train.py b/olmo/train.py
@@ -1368,8 +1368,8 @@ def close(self, exit_code: int = 0) -> None:
             gc.enable()
         else:
             gc.disable()
-        if wandb.run is not None:
-            wandb.finish(exit_code=exit_code, quiet=True)
+        # if wandb.run is not None:
+        #     wandb.finish(exit_code=exit_code, quiet=True)
 
     def __enter__(self) -> Trainer:
         return self

diff --git a/scripts/beaker/peteish/peteish1-eval-launch.sh b/scripts/beaker/peteish/peteish1-eval-launch.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -ex  # launcher for the peteish1 eval job: abort on the first failing command and echo each command as it runs
+
+NUM_NODES=16  # node count: passed to --replicas below and forwarded as the 2nd argument of peteish1-eval.sh
+
+gantry run \
+  --allow-dirty \
+  --workspace ai2/OLMo-pretraining-stability \
+  --task-name peteish1-eval \
+  --description "Pete-ish 1B eval" \
+  --priority high \
+  --preemptible \
+  --beaker-image petew/olmo-torch23-gantry \
+  --cluster ai2/jupiter-cirrascale-2 \
+  --gpus 8 \
+  --replicas "${NUM_NODES}" \
+  --leader-selection \
+  --host-networking \
+  --propagate-failure \
+  --propagate-preemption \
+  --synchronized-start-timeout 90m \
+  --budget ai2/oe-training \
+  --no-nfs \
+  --weka oe-training-default:/weka/oe-training-default \
+  --no-python \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env OLMO_TASK=model \
+  --env R2_PROFILE=R2 \
+  --env S3_PROFILE=S3 \
+  --env WEKA_PROFILE=WEKA \
+  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
+  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
+  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
+  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
+  --env-secret WANDB_API_KEY=JIACHENGL_WANDB_API_KEY \
+  --shared-memory 10GiB \
+  --yes \
+  --timeout=-1 \
+  -- /bin/bash -c "scripts/beaker/peteish/peteish1-eval.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"  # backslash-escaped BEAKER_* variables are left for the inner bash -c to expand; NUM_NODES is expanded here by the launcher