Revert Peteish7 changes

allenai · Oct 24, 2024
commit 97d78ed (1 parent: a36b9e0)

Showing 5 changed files with 50 additions and 267 deletions.
diff --git a/configs/peteish7-weka.yaml b/configs/peteish7-weka.yaml
@@ -107,35 +107,35 @@ eval_interval: 1000
 eval_subset_num_batches: -1
 device_eval_batch_size: ${device_train_microbatch_size}
 evaluators:
-  - label: all-small-ppl-validation
-    data:
-      num_workers: 0
-      drop_last: true
-      # generate_doc_lengths: true
-      memmap_dtype: uint32
-      datasets:
-        c4_en-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
-        dolma_books-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
-        dolma_common-crawl-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
-        dolma_pes2o-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
-        dolma_reddit-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
-        dolma_stack-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
-        dolma_wiki-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
-        ice-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
-        m2d2_s2orc-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
-        pile-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
-        wikitext_103-validation:
-          - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy
+  # - label: all-small-ppl-validation
+  #   data:
+  #     num_workers: 0
+  #     drop_last: true
+  #     # generate_doc_lengths: true
+  #     memmap_dtype: uint32
+  #     datasets:
+  #       c4_en-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
+  #       dolma_books-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
+  #       dolma_common-crawl-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
+  #       dolma_pes2o-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
+  #       dolma_reddit-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
+  #       dolma_stack-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
+  #       dolma_wiki-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
+  #       ice-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
+  #       m2d2_s2orc-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
+  #       pile-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
+  #       wikitext_103-validation:
+  #         - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy
 
   ##########################
   # Downstream evaluations #
@@ -230,228 +230,6 @@ evaluators:
   - label: arc_easy_ppl
     type: downstream
 
-  - label: piqa_rc_0shot
-    type: downstream
-
-  - label: piqa_rc_0shot_bpb
-    type: downstream
-
-  - label: piqa_rc_5shot
-    type: downstream
-
-  - label: piqa_rc_5shot_bpb
-    type: downstream
-
-  - label: piqa_mc_5shot
-    type: downstream
-
-  - label: piqa_mc_5shot_bpb
-    type: downstream
-
-  - label: hellaswag_rc_0shot
-    type: downstream
-
-  - label: hellaswag_rc_0shot_bpb
-    type: downstream
-
-  - label: hellaswag_rc_5shot
-    type: downstream
-
-  - label: hellaswag_rc_5shot_bpb
-    type: downstream
-
-  - label: hellaswag_mc_5shot
-    type: downstream
-
-  - label: hellaswag_mc_5shot_bpb
-    type: downstream
-
-  - label: winogrande_rc_0shot
-    type: downstream
-
-  - label: winogrande_rc_0shot_bpb
-    type: downstream
-
-  - label: winogrande_rc_5shot
-    type: downstream
-
-  - label: winogrande_rc_5shot_bpb
-    type: downstream
-
-  - label: winogrande_mc_5shot
-    type: downstream
-
-  - label: winogrande_mc_5shot_bpb
-    type: downstream
-
-  - label: openbookqa_rc_0shot
-    type: downstream
-
-  - label: openbookqa_rc_0shot_bpb
-    type: downstream
-
-  - label: openbookqa_rc_5shot
-    type: downstream
-
-  - label: openbookqa_rc_5shot_bpb
-    type: downstream
-
-  - label: openbookqa_mc_5shot
-    type: downstream
-
-  - label: openbookqa_mc_5shot_bpb
-    type: downstream
-
-  - label: boolq_rc_0shot
-    type: downstream
-
-  - label: boolq_rc_0shot_bpb
-    type: downstream
-
-  - label: boolq_rc_5shot
-    type: downstream
-
-  - label: boolq_rc_5shot_bpb
-    type: downstream
-
-  - label: boolq_mc_5shot
-    type: downstream
-
-  - label: boolq_mc_5shot_bpb
-    type: downstream
-
-  - label: sciq_rc_0shot
-    type: downstream
-
-  - label: sciq_rc_0shot_bpb
-    type: downstream
-
-  # - label: sciq_rc_5shot
-  #   type: downstream
-
-  # - label: sciq_rc_5shot_bpb
-  #   type: downstream
-
-  # - label: sciq_mc_5shot
-  #   type: downstream
-
-  # - label: sciq_mc_5shot_bpb
-  #   type: downstream
-
-  - label: arc_easy_rc_0shot
-    type: downstream
-
-  - label: arc_easy_rc_0shot_bpb
-    type: downstream
-
-  - label: arc_easy_rc_5shot
-    type: downstream
-
-  - label: arc_easy_rc_5shot_bpb
-    type: downstream
-
-  - label: arc_easy_mc_5shot
-    type: downstream
-
-  - label: arc_easy_mc_5shot_bpb
-    type: downstream
-
-  - label: arc_challenge_rc_0shot
-    type: downstream
-
-  - label: arc_challenge_rc_0shot_bpb
-    type: downstream
-
-  - label: arc_challenge_rc_5shot
-    type: downstream
-
-  - label: arc_challenge_rc_5shot_bpb
-    type: downstream
-
-  - label: arc_challenge_mc_5shot
-    type: downstream
-
-  - label: arc_challenge_mc_5shot_bpb
-    type: downstream
-
-  - label: copa_rc_0shot
-    type: downstream
-
-  - label: copa_rc_0shot_bpb
-    type: downstream
-
-  # - label: copa_rc_5shot
-  #   type: downstream
-
-  # - label: copa_rc_5shot_bpb
-  #   type: downstream
-
-  # - label: copa_mc_5shot
-  #   type: downstream
-
-  # - label: copa_mc_5shot_bpb
-  #   type: downstream
-
-  - label: csqa_rc_0shot
-    type: downstream
-
-  - label: csqa_rc_0shot_bpb
-    type: downstream
-
-  - label: csqa_rc_5shot
-    type: downstream
-
-  - label: csqa_rc_5shot_bpb
-    type: downstream
-
-  - label: csqa_mc_5shot
-    type: downstream
-
-  - label: csqa_mc_5shot_bpb
-    type: downstream
-
-  - label: socialiqa_rc_0shot
-    type: downstream
-
-  - label: socialiqa_rc_0shot_bpb
-    type: downstream
-
-  - label: socialiqa_rc_5shot
-    type: downstream
-
-  - label: socialiqa_rc_5shot_bpb
-    type: downstream
-
-  - label: socialiqa_mc_5shot
-    type: downstream
-
-  - label: socialiqa_mc_5shot_bpb
-    type: downstream
-
-  - label: mmlu_stem_var_bpb
-    type: downstream
-
-  - label: mmlu_humanities_var_bpb
-    type: downstream
-
-  - label: mmlu_social_sciences_var_bpb
-    type: downstream
-
-  - label: mmlu_other_var_bpb
-    type: downstream
-
-  - label: mmlu_stem_bpb
-    type: downstream
-
-  - label: mmlu_humanities_bpb
-    type: downstream
-
-  - label: mmlu_social_sciences_bpb
-    type: downstream
-
-  - label: mmlu_other_bpb
-    type: downstream
-
 data:
   pad_direction: right
   # generate_doc_lengths: true

diff --git a/scripts/beaker/peteish/peteish1-launch.sh b/scripts/beaker/peteish/peteish1-launch.sh
@@ -37,4 +37,4 @@ gantry run \
   --shared-memory 10GiB \
   --yes \
   --timeout=-1 \
-  -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
+  -- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
diff --git a/scripts/beaker/peteish/peteish1.sh b/scripts/beaker/peteish/peteish1.sh
@@ -54,4 +54,4 @@ torchrun \
       --save_interval_ephemeral=null \
       --save_overwrite
 
-     # '--load_path=${path.last_checkpoint:${save_folder}}' \
+     # '--load_path=${path.last_checkpoint:${save_folder}}' \
diff --git a/scripts/beaker/peteish/peteish7-launch.sh b/scripts/beaker/peteish/peteish7-launch.sh
@@ -2,14 +2,13 @@
 
 set -ex
 
-NUM_NODES=8
+NUM_NODES=16
 
 gantry run \
-  --allow-dirty \
-  --workspace ai2/OLMo-tiny \
-  --task-name peteish7-anneal-eval \
+  --workspace ai2/OLMo-pretraining-stability \
+  --task-name peteish7 \
   --description "Pete-ish 7B" \
-  --priority high \
+  --priority urgent \
   --preemptible \
   --beaker-image petew/olmo-torch23-gantry \
   --cluster ai2/jupiter-cirrascale-2 \
@@ -27,9 +26,14 @@ gantry run \
   --env LOG_FILTER_TYPE=local_rank0_only \
   --env OMP_NUM_THREADS=8 \
   --env OLMO_TASK=model \
-  --env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \
-  --env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \
-  --env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \
+  --env R2_PROFILE=R2 \
+  --env S3_PROFILE=S3 \
+  --env WEKA_PROFILE=WEKA \
+  --env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
+  --env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
+  --env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
+  --env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
+  --env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \
   --shared-memory 10GiB \
   --yes \
   --timeout=-1 \

diff --git a/scripts/beaker/peteish/peteish7.sh b/scripts/beaker/peteish/peteish7.sh
@@ -25,9 +25,9 @@ pip install '.[train]'
 pip freeze
 
 # Move AWS credentials from env to relevant files
-# mkdir -p ~/.aws
-# printenv AWS_CONFIG > ~/.aws/config
-# printenv AWS_CREDENTIALS > ~/.aws/credentials
+mkdir -p ~/.aws
+printenv AWS_CONFIG > ~/.aws/config
+printenv AWS_CREDENTIALS > ~/.aws/credentials
 
 # Force processes to synchronize at init_process_group
 export TORCH_DIST_INIT_BARRIER=1
@@ -48,9 +48,10 @@ torchrun \
   --rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
   --node_rank "${BEAKER_REPLICA_RANK}" \
   --rdzv_conf 'read_timeout=420' \
-  scripts/eval.py \
+  scripts/train.py \
     configs/peteish7-weka.yaml \
       --run_name="${GANTRY_TASK_NAME}" \
       --save_interval_ephemeral=500 \
-      --save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-no-warmup" \
-     '--load_path=${path.last_checkpoint:${save_folder}}' \
+      --save_overwrite
+
+     # '--load_path=${path.last_checkpoint:${save_folder}}' \