[UX] Error out for null env var (#3557)
* [UX] Error out for null env var

* format

* Fix examples for env, including HF_TOKEN and WANDB_API_KEY

* fix

* Add test

* format

* fix

* type

* fix

* remove print

* add doc

* fix comment

* minor fix
Michaelvll authored May 16, 2024
1 parent 4a66806 commit eae8fc5
Showing 23 changed files with 73 additions and 33 deletions.
14 changes: 14 additions & 0 deletions docs/source/running-jobs/environment-variables.rst
@@ -12,6 +12,20 @@ You can specify environment variables to be made available to a task in two ways
 - The ``envs`` field (dict) in a :ref:`task YAML <yaml-spec>`
 - The ``--env`` flag in the ``sky launch/exec`` :ref:`CLI <cli>` (takes precedence over the above)
 
+.. tip::
+
+  If an environment variable must be specified with ``--env`` during
+  ``sky launch/exec``, set it to ``null`` in the task YAML so that an error
+  is raised whenever it is left unspecified. For example, ``WANDB_API_KEY``
+  and ``HF_TOKEN`` in the following task YAML:
+
+  .. code-block:: yaml
+
+    envs:
+      WANDB_API_KEY:
+      HF_TOKEN: null
+      MYVAR: val
+
 The ``file_mounts``, ``setup``, and ``run`` sections of a task YAML can access the variables via the ``${MYVAR}`` syntax.
 
 Using in ``file_mounts``
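The behavior the new docs tip describes — YAML ``envs`` merged with CLI ``--env`` overrides, and any variable still null afterwards rejected — can be sketched in plain Python (``resolve_envs`` is a hypothetical helper for illustration, not SkyPilot's API):

```python
from typing import Dict, Optional


def resolve_envs(yaml_envs: Dict[str, Optional[str]],
                 cli_envs: Dict[str, str]) -> Dict[str, str]:
    """Merge YAML envs with CLI --env values; reject any leftover nulls."""
    merged: Dict[str, Optional[str]] = dict(yaml_envs)
    merged.update(cli_envs)  # --env takes precedence over the YAML value.
    missing = [k for k, v in merged.items() if v is None]
    if missing:
        raise ValueError(f'Environment variable(s) {missing} are null; '
                         'set them in the task YAML or pass --env.')
    # All values are non-None after the check above.
    return {k: v for k, v in merged.items() if v is not None}


# HF_TOKEN is null in the YAML but supplied on the CLI, so this passes;
# dropping the CLI override would raise the ValueError instead.
envs = resolve_envs({'HF_TOKEN': None, 'MYVAR': 'val'},
                    {'HF_TOKEN': 'hf_xxx'})
```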
2 changes: 1 addition & 1 deletion examples/serve/llama2/llama2.yaml
@@ -25,7 +25,7 @@ resources:
 
 envs:
   MODEL_SIZE: 7
-  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 setup: |
   conda activate chatbot
4 changes: 2 additions & 2 deletions examples/spot_pipeline/bert_qa_train_eval.yaml
@@ -42,7 +42,7 @@ run: |
   echo Model saved to /checkpoint/bert_qa/$SKYPILOT_TASK_ID
 envs:
-  WANDB_API_KEY: # NOTE: Fill in your wandb key
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
 
 ---
 
@@ -84,4 +84,4 @@ run: |
     --save_steps 1000
 envs:
-  WANDB_API_KEY: # NOTE: Fill in your wandb key
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
4 changes: 2 additions & 2 deletions llm/axolotl/axolotl-spot.yaml
@@ -38,8 +38,8 @@ run: |
   accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml
 
 envs:
-  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
-  BUCKET: <a-unique-bucket-name-to-use>
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
+  BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass.
2 changes: 1 addition & 1 deletion llm/axolotl/axolotl.yaml
@@ -26,7 +26,7 @@ run: |
   accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml
 
 envs:
-  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
2 changes: 1 addition & 1 deletion llm/dbrx/README.md
@@ -22,7 +22,7 @@ In this recipe, you will serve `databricks/dbrx-instruct` on your own infra --
 ```yaml
 envs:
   MODEL_NAME: databricks/dbrx-instruct
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 service:
   replicas: 2
2 changes: 1 addition & 1 deletion llm/dbrx/dbrx.yaml
@@ -31,7 +31,7 @@
 
 envs:
   MODEL_NAME: databricks/dbrx-instruct
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 service:
   replicas: 2
4 changes: 2 additions & 2 deletions llm/falcon/falcon.yaml
@@ -7,7 +7,7 @@ workdir: .
 
 envs:
   MODEL_NAME: tiiuae/falcon-7b # [ybelkada/falcon-7b-sharded-bf16, tiiuae/falcon-7b, tiiuae/falcon-40b]
-  WANDB_API_KEY: $WANDB_KEY # Change to your own wandb key
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
   OUTPUT_BUCKET_NAME: # Set a unique name for the bucket which will store model weights
 
 file_mounts:
@@ -39,4 +39,4 @@ run: |
     --bnb_4bit_compute_dtype bfloat16 \
     --max_steps 500 \
     --dataset_name timdettmers/openassistant-guanaco \
-    --output_dir /results
\ No newline at end of file
+    --output_dir /results
2 changes: 1 addition & 1 deletion llm/gemma/serve.yaml
@@ -17,7 +17,7 @@ service:
 
 envs:
   MODEL_NAME: google/gemma-7b-it
-  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 resources:
   accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
2 changes: 1 addition & 1 deletion llm/llama-2/README.md
@@ -33,7 +33,7 @@ Fill the access token in the [chatbot-hf.yaml](https://github.com/skypilot-org/s
 ```yaml
 envs:
   MODEL_SIZE: 7
-  HF_TOKEN: <your-huggingface-token>
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 ```
2 changes: 1 addition & 1 deletion llm/llama-2/chatbot-hf.yaml
@@ -6,7 +6,7 @@ resources:
 
 envs:
   MODEL_SIZE: 7
-  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 setup: |
   conda activate chatbot
2 changes: 1 addition & 1 deletion llm/llama-2/chatbot-meta.yaml
@@ -6,7 +6,7 @@ resources:
 
 envs:
   MODEL_SIZE: 7
-  HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 setup: |
   set -ex
2 changes: 1 addition & 1 deletion llm/llama-3/README.md
@@ -44,7 +44,7 @@
 envs:
   MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
   # MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 service:
   replicas: 2
2 changes: 1 addition & 1 deletion llm/llama-3/llama3.yaml
@@ -59,7 +59,7 @@
 envs:
   MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
   # MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 service:
   replicas: 2
2 changes: 1 addition & 1 deletion llm/sglang/llama2.yaml
@@ -6,7 +6,7 @@ service:
 
 envs:
   MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 resources:
   accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
2 changes: 1 addition & 1 deletion llm/vicuna-llama-2/README.md
@@ -31,7 +31,7 @@ cd skypilot/llm/vicuna-llama-2
 Paste the access token into [train.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna-llama-2/train.yaml):
 ```yaml
 envs:
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 ```
 
 ## Train your own Vicuna on Llama-2
6 changes: 3 additions & 3 deletions llm/vicuna-llama-2/train.yaml
@@ -1,7 +1,7 @@
 envs:
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
-  ARTIFACT_BUCKET_NAME: YOUR_OWN_BUCKET_NAME # Change to your own bucket name
-  WANDB_API_KEY: "" # Change to your own wandb api key
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
+  ARTIFACT_BUCKET_NAME: # TODO: Fill with your unique bucket name, or use --env to pass.
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
   MODEL_SIZE: 7
   USE_XFORMERS: 1
14 changes: 7 additions & 7 deletions llm/vicuna/train.yaml
@@ -1,3 +1,10 @@
+envs:
+  MODEL_SIZE: 7
+  SEQ_LEN: 2048
+  GC_SCALE: 4
+  USE_FLASH_ATTN: 0
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
+
 resources:
   accelerators: A100-80GB:8
   disk_size: 1000
@@ -109,10 +116,3 @@ run: |
   gsutil -m rsync -r -x 'checkpoint-*' $LOCAL_CKPT_PATH/ $CKPT_PATH/
   exit $returncode
-envs:
-  MODEL_SIZE: 7
-  SEQ_LEN: 2048
-  GC_SCALE: 4
-  USE_FLASH_ATTN: 0
-  WANDB_API_KEY: ""
2 changes: 1 addition & 1 deletion llm/vllm/serve-openai-api.yaml
@@ -1,6 +1,6 @@
 envs:
   MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 resources:
   accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
2 changes: 1 addition & 1 deletion llm/vllm/service.yaml
@@ -9,7 +9,7 @@ service:
 
 # Fields below are the same with `serve-openai-api.yaml`.
 envs:
   MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 resources:
   accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
18 changes: 16 additions & 2 deletions sky/task.py
@@ -353,8 +353,13 @@ def from_yaml_config(
         # as int causing validate_schema() to fail.
         envs = config.get('envs')
         if envs is not None and isinstance(envs, dict):
-            config['envs'] = {str(k): str(v) for k, v in envs.items()}
-
+            new_envs: Dict[str, Optional[str]] = {}
+            for k, v in envs.items():
+                if v is not None:
+                    new_envs[str(k)] = str(v)
+                else:
+                    new_envs[str(k)] = None
+            config['envs'] = new_envs
         common_utils.validate_schema(config, schemas.get_task_schema(),
                                      'Invalid task YAML: ')
         if env_overrides is not None:
@@ -368,6 +373,15 @@ def from_yaml_config(
             new_envs.update(env_overrides)
             config['envs'] = new_envs
 
+        for k, v in config.get('envs', {}).items():
+            if v is None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Environment variable {k!r} is None. Please set a '
+                        'value for it in task YAML or with --env flag. '
+                        f'To set it to be empty, use an empty string ({k}: "" '
+                        f'in task YAML or --env {k}="" in CLI).')
+
         # Fill in any Task.envs into file_mounts (src/dst paths, storage
         # name/source).
         if config.get('file_mounts') is not None:
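The coercion step in the hunk above — casting env keys and values to `str` (YAML may parse values such as `MODEL_SIZE: 7` as `int`) while letting `None` pass through so the later null check can catch it — boils down to the following standalone sketch (hypothetical helper names, not the actual `sky/task.py` code):

```python
from typing import Any, Dict, Optional


def coerce_envs(envs: Dict[Any, Any]) -> Dict[str, Optional[str]]:
    """Stringify env keys/values, but keep None as-is so the
    post-override null check can reject it later."""
    return {str(k): (str(v) if v is not None else None)
            for k, v in envs.items()}


def check_no_null_envs(envs: Dict[str, Optional[str]]) -> None:
    """Mirror the error raised after --env overrides are merged."""
    for k, v in envs.items():
        if v is None:
            raise ValueError(
                f'Environment variable {k!r} is None. Please set a '
                'value for it in task YAML or with --env flag.')


# The int value is stringified; the null survives coercion untouched.
print(coerce_envs({'MODEL_SIZE': 7, 'HF_TOKEN': None}))
# → {'MODEL_SIZE': '7', 'HF_TOKEN': None}
```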
2 changes: 1 addition & 1 deletion sky/utils/schemas.py
@@ -402,7 +402,7 @@ def get_task_schema():
             'patternProperties': {
                 # Checks env keys are valid env var names.
                 '^[a-zA-Z_][a-zA-Z0-9_]*$': {
-                    'type': 'string'
+                    'type': ['string', 'null']
                 }
             },
             'additionalProperties': False,
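The one-line schema change above relaxes each env value's type from `'string'` to `['string', 'null']`. With only the standard library, the rule the JSON schema now enforces can be sketched like this (an illustrative re-implementation, not SkyPilot code):

```python
import re

# Same key pattern as the schema: valid environment-variable names.
ENV_KEY_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')


def validate_envs_schema(envs: dict) -> None:
    """Enforce the relaxed rule: keys must be valid env var names,
    values must be a string or None (a YAML null)."""
    for k, v in envs.items():
        if not isinstance(k, str) or not ENV_KEY_RE.match(k):
            raise ValueError(f'Invalid env var name: {k!r}')
        if v is not None and not isinstance(v, str):
            raise ValueError(f'Env var {k!r} must be a string or null, '
                             f'got {type(v).__name__}.')


# Before this commit a null value failed validation; now it is accepted
# (and rejected later, with a clearer error, if never overridden).
validate_envs_schema({'HF_TOKEN': None, 'MYVAR': 'val'})
```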
12 changes: 12 additions & 0 deletions tests/test_yaml_parser.py
@@ -134,3 +134,15 @@ def test_invalid_envs_type(tmp_path):
     with pytest.raises(ValueError) as e:
         Task.from_yaml(config_path)
     assert 'is not of type \'dict\'' in e.value.args[0]
+
+
+def test_invalid_empty_envs(tmp_path):
+    config_path = _create_config_file(
+        textwrap.dedent(f"""\
+            envs:
+              env_key1: abc
+              env_key2:
+            """), tmp_path)
+    with pytest.raises(ValueError) as e:
+        Task.from_yaml(config_path)
+    assert 'Environment variable \'env_key2\' is None.' in e.value.args[0]
