Skip to content

Commit

Permalink
Revert Peteish7 changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
liujch1998 committed Oct 24, 2024
1 parent: a36b9e0; commit: 97d78ed
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 267 deletions.
280 changes: 29 additions & 251 deletions configs/peteish7-weka.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,35 +107,35 @@ eval_interval: 1000
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
- label: all-small-ppl-validation
data:
num_workers: 0
drop_last: true
# generate_doc_lengths: true
memmap_dtype: uint32
datasets:
c4_en-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
dolma_books-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
dolma_common-crawl-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
dolma_pes2o-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
dolma_reddit-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
dolma_stack-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
dolma_wiki-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
ice-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
m2d2_s2orc-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
pile-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
wikitext_103-validation:
- /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy
# - label: all-small-ppl-validation
# data:
# num_workers: 0
# drop_last: true
# # generate_doc_lengths: true
# memmap_dtype: uint32
# datasets:
# c4_en-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/c4_en/val/part-0-00000.npy
# dolma_books-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_books/val/part-0-00000.npy
# dolma_common-crawl-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_common-crawl/val/part-0-00000.npy
# dolma_pes2o-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_pes2o/val/part-0-00000.npy
# dolma_reddit-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_reddit/val/part-0-00000.npy
# dolma_stack-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_stack/val/part-0-00000.npy
# dolma_wiki-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/dolma_wiki/val/part-0-00000.npy
# ice-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/ice/val/part-0-00000.npy
# m2d2_s2orc-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/m2d2_s2orc/val/part-0-00000.npy
# pile-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/pile/val/part-0-00000.npy
# wikitext_103-validation:
# - /weka/oe-training-default/ai2-llm/eval-data/perplexity/v3_small_dolma2-tokenizer/wikitext_103/val/part-0-00000.npy

##########################
# Downstream evaluations #
Expand Down Expand Up @@ -230,228 +230,6 @@ evaluators:
- label: arc_easy_ppl
type: downstream

- label: piqa_rc_0shot
type: downstream

- label: piqa_rc_0shot_bpb
type: downstream

- label: piqa_rc_5shot
type: downstream

- label: piqa_rc_5shot_bpb
type: downstream

- label: piqa_mc_5shot
type: downstream

- label: piqa_mc_5shot_bpb
type: downstream

- label: hellaswag_rc_0shot
type: downstream

- label: hellaswag_rc_0shot_bpb
type: downstream

- label: hellaswag_rc_5shot
type: downstream

- label: hellaswag_rc_5shot_bpb
type: downstream

- label: hellaswag_mc_5shot
type: downstream

- label: hellaswag_mc_5shot_bpb
type: downstream

- label: winogrande_rc_0shot
type: downstream

- label: winogrande_rc_0shot_bpb
type: downstream

- label: winogrande_rc_5shot
type: downstream

- label: winogrande_rc_5shot_bpb
type: downstream

- label: winogrande_mc_5shot
type: downstream

- label: winogrande_mc_5shot_bpb
type: downstream

- label: openbookqa_rc_0shot
type: downstream

- label: openbookqa_rc_0shot_bpb
type: downstream

- label: openbookqa_rc_5shot
type: downstream

- label: openbookqa_rc_5shot_bpb
type: downstream

- label: openbookqa_mc_5shot
type: downstream

- label: openbookqa_mc_5shot_bpb
type: downstream

- label: boolq_rc_0shot
type: downstream

- label: boolq_rc_0shot_bpb
type: downstream

- label: boolq_rc_5shot
type: downstream

- label: boolq_rc_5shot_bpb
type: downstream

- label: boolq_mc_5shot
type: downstream

- label: boolq_mc_5shot_bpb
type: downstream

- label: sciq_rc_0shot
type: downstream

- label: sciq_rc_0shot_bpb
type: downstream

# - label: sciq_rc_5shot
# type: downstream

# - label: sciq_rc_5shot_bpb
# type: downstream

# - label: sciq_mc_5shot
# type: downstream

# - label: sciq_mc_5shot_bpb
# type: downstream

- label: arc_easy_rc_0shot
type: downstream

- label: arc_easy_rc_0shot_bpb
type: downstream

- label: arc_easy_rc_5shot
type: downstream

- label: arc_easy_rc_5shot_bpb
type: downstream

- label: arc_easy_mc_5shot
type: downstream

- label: arc_easy_mc_5shot_bpb
type: downstream

- label: arc_challenge_rc_0shot
type: downstream

- label: arc_challenge_rc_0shot_bpb
type: downstream

- label: arc_challenge_rc_5shot
type: downstream

- label: arc_challenge_rc_5shot_bpb
type: downstream

- label: arc_challenge_mc_5shot
type: downstream

- label: arc_challenge_mc_5shot_bpb
type: downstream

- label: copa_rc_0shot
type: downstream

- label: copa_rc_0shot_bpb
type: downstream

# - label: copa_rc_5shot
# type: downstream

# - label: copa_rc_5shot_bpb
# type: downstream

# - label: copa_mc_5shot
# type: downstream

# - label: copa_mc_5shot_bpb
# type: downstream

- label: csqa_rc_0shot
type: downstream

- label: csqa_rc_0shot_bpb
type: downstream

- label: csqa_rc_5shot
type: downstream

- label: csqa_rc_5shot_bpb
type: downstream

- label: csqa_mc_5shot
type: downstream

- label: csqa_mc_5shot_bpb
type: downstream

- label: socialiqa_rc_0shot
type: downstream

- label: socialiqa_rc_0shot_bpb
type: downstream

- label: socialiqa_rc_5shot
type: downstream

- label: socialiqa_rc_5shot_bpb
type: downstream

- label: socialiqa_mc_5shot
type: downstream

- label: socialiqa_mc_5shot_bpb
type: downstream

- label: mmlu_stem_var_bpb
type: downstream

- label: mmlu_humanities_var_bpb
type: downstream

- label: mmlu_social_sciences_var_bpb
type: downstream

- label: mmlu_other_var_bpb
type: downstream

- label: mmlu_stem_bpb
type: downstream

- label: mmlu_humanities_bpb
type: downstream

- label: mmlu_social_sciences_bpb
type: downstream

- label: mmlu_other_bpb
type: downstream

data:
pad_direction: right
# generate_doc_lengths: true
Expand Down
2 changes: 1 addition & 1 deletion scripts/beaker/peteish/peteish1-launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ gantry run \
--shared-memory 10GiB \
--yes \
--timeout=-1 \
-- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
-- /bin/bash -c "scripts/beaker/peteish/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME ${NUM_NODES} \$BEAKER_REPLICA_RANK"
2 changes: 1 addition & 1 deletion scripts/beaker/peteish/peteish1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ torchrun \
--save_interval_ephemeral=null \
--save_overwrite

# '--load_path=${path.last_checkpoint:${save_folder}}' \
# '--load_path=${path.last_checkpoint:${save_folder}}' \
20 changes: 12 additions & 8 deletions scripts/beaker/peteish/peteish7-launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@

set -ex

NUM_NODES=8
NUM_NODES=16

gantry run \
--allow-dirty \
--workspace ai2/OLMo-tiny \
--task-name peteish7-anneal-eval \
--workspace ai2/OLMo-pretraining-stability \
--task-name peteish7 \
--description "Pete-ish 7B" \
--priority high \
--priority urgent \
--preemptible \
--beaker-image petew/olmo-torch23-gantry \
--cluster ai2/jupiter-cirrascale-2 \
Expand All @@ -27,9 +26,14 @@ gantry run \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env OLMO_TASK=model \
--env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \
--env R2_PROFILE=R2 \
--env S3_PROFILE=S3 \
--env WEKA_PROFILE=WEKA \
--env-secret AWS_CONFIG=PETEW_AWS_CONFIG \
--env-secret AWS_CREDENTIALS=PETEW_AWS_CREDENTIALS \
--env-secret R2_ENDPOINT_URL=R2_ENDPOINT_URL \
--env-secret WEKA_ENDPOINT_URL=WEKA_ENDPOINT_URL \
--env-secret WANDB_API_KEY=PETEW_WANDB_API_KEY \
--shared-memory 10GiB \
--yes \
--timeout=-1 \
Expand Down
13 changes: 7 additions & 6 deletions scripts/beaker/peteish/peteish7.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ pip install '.[train]'
pip freeze

# Move AWS credentials from env to relevant files
# mkdir -p ~/.aws
# printenv AWS_CONFIG > ~/.aws/config
# printenv AWS_CREDENTIALS > ~/.aws/credentials
mkdir -p ~/.aws
printenv AWS_CONFIG > ~/.aws/config
printenv AWS_CREDENTIALS > ~/.aws/credentials

# Force processes to synchronize at init_process_group
export TORCH_DIST_INIT_BARRIER=1
Expand All @@ -48,9 +48,10 @@ torchrun \
--rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
--node_rank "${BEAKER_REPLICA_RANK}" \
--rdzv_conf 'read_timeout=420' \
scripts/eval.py \
scripts/train.py \
configs/peteish7-weka.yaml \
--run_name="${GANTRY_TASK_NAME}" \
--save_interval_ephemeral=500 \
--save_folder="/weka/oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-no-warmup" \
'--load_path=${path.last_checkpoint:${save_folder}}' \
--save_overwrite

# '--load_path=${path.last_checkpoint:${save_folder}}' \

0 comments on commit 97d78ed

Please sign in to comment.