From eae8fc5740c0563b458fcb2cec6df5a7f0d0f9d1 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Thu, 16 May 2024 10:29:09 -0700
Subject: [PATCH] [UX] Error out for null env var (#3557)

* [UX] Error out for null env var
* format
* Fix examples for env, including HF_TOKEN and WANDB_API_KEY
* fix
* Add test
* format
* fix
* type
* fix
* remove print
* add doc
* fix comment
* minor fix
---
 .../running-jobs/environment-variables.rst     | 14 ++++++++++++++
 examples/serve/llama2/llama2.yaml              |  2 +-
 examples/spot_pipeline/bert_qa_train_eval.yaml |  4 ++--
 llm/axolotl/axolotl-spot.yaml                  |  4 ++--
 llm/axolotl/axolotl.yaml                       |  2 +-
 llm/dbrx/README.md                             |  2 +-
 llm/dbrx/dbrx.yaml                             |  2 +-
 llm/falcon/falcon.yaml                         |  4 ++--
 llm/gemma/serve.yaml                           |  2 +-
 llm/llama-2/README.md                          |  2 +-
 llm/llama-2/chatbot-hf.yaml                    |  2 +-
 llm/llama-2/chatbot-meta.yaml                  |  2 +-
 llm/llama-3/README.md                          |  2 +-
 llm/llama-3/llama3.yaml                        |  2 +-
 llm/sglang/llama2.yaml                         |  2 +-
 llm/vicuna-llama-2/README.md                   |  2 +-
 llm/vicuna-llama-2/train.yaml                  |  6 +++---
 llm/vicuna/train.yaml                          | 14 +++++++-------
 llm/vllm/serve-openai-api.yaml                 |  2 +-
 llm/vllm/service.yaml                          |  2 +-
 sky/task.py                                    | 18 ++++++++++++++++--
 sky/utils/schemas.py                           |  2 +-
 tests/test_yaml_parser.py                      | 12 ++++++++++++
 23 files changed, 73 insertions(+), 33 deletions(-)

diff --git a/docs/source/running-jobs/environment-variables.rst b/docs/source/running-jobs/environment-variables.rst
index 16502f70818..2f3427c1bf5 100644
--- a/docs/source/running-jobs/environment-variables.rst
+++ b/docs/source/running-jobs/environment-variables.rst
@@ -12,6 +12,20 @@ You can specify environment variables to be made available to a task in two ways
 
 - The ``envs`` field (dict) in a :ref:`task YAML `
 - The ``--env`` flag in the ``sky launch/exec`` :ref:`CLI ` (takes precedence over the above)
 
+.. tip::
+
+  If an environment variable must be specified with ``--env`` during
+  ``sky launch/exec``, set it to ``null`` in the task YAML so that an error
+  is raised whenever the variable is left unspecified. For example,
+  ``WANDB_API_KEY`` and ``HF_TOKEN`` in the following task YAML:
+
+  .. code-block:: yaml
+
+    envs:
+      WANDB_API_KEY:
+      HF_TOKEN: null
+      MYVAR: val
+
 The ``file_mounts``, ``setup``, and ``run`` sections of a task YAML can access the variables via the ``${MYVAR}`` syntax.
 
 Using in ``file_mounts``
diff --git a/examples/serve/llama2/llama2.yaml b/examples/serve/llama2/llama2.yaml
index 5eaaea449d0..42c82ea0cc9 100644
--- a/examples/serve/llama2/llama2.yaml
+++ b/examples/serve/llama2/llama2.yaml
@@ -25,7 +25,7 @@ resources:
 
 envs:
   MODEL_SIZE: 7
-  HF_TOKEN: # TODO: Replace with huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 setup: |
   conda activate chatbot
diff --git a/examples/spot_pipeline/bert_qa_train_eval.yaml b/examples/spot_pipeline/bert_qa_train_eval.yaml
index 32fb526ca91..62bd34c3b76 100644
--- a/examples/spot_pipeline/bert_qa_train_eval.yaml
+++ b/examples/spot_pipeline/bert_qa_train_eval.yaml
@@ -42,7 +42,7 @@ run: |
   echo Model saved to /checkpoint/bert_qa/$SKYPILOT_TASK_ID
 
 envs:
-  WANDB_API_KEY: # NOTE: Fill in your wandb key
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
 
 ---
 
@@ -84,4 +84,4 @@ run: |
     --save_steps 1000
 
 envs:
-  WANDB_API_KEY: # NOTE: Fill in your wandb key
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
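Before the remaining example hunks, it may help to see the behavior this patch introduces in isolation: any env var that is still null after CLI overrides are merged aborts the task load. A minimal standalone sketch of that check (not SkyPilot code; the file name `task.yaml` and the use of PyYAML here are illustrative assumptions):

```python
# Illustrative sketch only, not part of this patch: the null-env check
# that sky/task.py (below) performs at task-load time, applied here to a
# task YAML on disk. 'task.yaml' is a hypothetical path; PyYAML assumed.
import yaml

with open('task.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

for key, value in (config.get('envs') or {}).items():
    # Both `KEY:` and `KEY: null` parse to Python None.
    if value is None:
        raise ValueError(
            f'Environment variable {key!r} is None. Set a value in the '
            f'task YAML or pass --env {key}=<value>; use an empty string '
            f'({key}: "") if it should be intentionally empty.')
print('All environment variables are set.')
```

Note the deliberate distinction the patch draws: `KEY: ""` is a valid empty value, while `KEY:` (YAML null) means "must be provided".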
diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index b6c81b742c9..942f4ccc4ba 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -38,8 +38,8 @@ run: | accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml envs: - HF_TOKEN: # TODO: Replace with huggingface token - BUCKET: + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass. diff --git a/llm/axolotl/axolotl.yaml b/llm/axolotl/axolotl.yaml index d9cfd91aa6d..9cec1d1f331 100644 --- a/llm/axolotl/axolotl.yaml +++ b/llm/axolotl/axolotl.yaml @@ -26,7 +26,7 @@ run: | accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml envs: - HF_TOKEN: # TODO: Replace with huggingface token + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. diff --git a/llm/dbrx/README.md b/llm/dbrx/README.md index e0ad216e92c..3011af9d4e6 100644 --- a/llm/dbrx/README.md +++ b/llm/dbrx/README.md @@ -22,7 +22,7 @@ In this recipe, you will serve `databricks/dbrx-instruct` on your own infra -- ```yaml envs: MODEL_NAME: databricks/dbrx-instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. service: replicas: 2 diff --git a/llm/dbrx/dbrx.yaml b/llm/dbrx/dbrx.yaml index ffa777ab86d..0c9abd06d30 100644 --- a/llm/dbrx/dbrx.yaml +++ b/llm/dbrx/dbrx.yaml @@ -31,7 +31,7 @@ envs: MODEL_NAME: databricks/dbrx-instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. service: replicas: 2 diff --git a/llm/falcon/falcon.yaml b/llm/falcon/falcon.yaml index 256d936d61b..b752db5256b 100644 --- a/llm/falcon/falcon.yaml +++ b/llm/falcon/falcon.yaml @@ -7,7 +7,7 @@ workdir: . envs: MODEL_NAME: tiiuae/falcon-7b # [ybelkada/falcon-7b-sharded-bf16, tiiuae/falcon-7b, tiiuae/falcon-40b] - WANDB_API_KEY: $WANDB_KEY # Change to your own wandb key + WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass. OUTPUT_BUCKET_NAME: # Set a unique name for the bucket which will store model weights file_mounts: @@ -39,4 +39,4 @@ run: | --bnb_4bit_compute_dtype bfloat16 \ --max_steps 500 \ --dataset_name timdettmers/openassistant-guanaco \ - --output_dir /results \ No newline at end of file + --output_dir /results diff --git a/llm/gemma/serve.yaml b/llm/gemma/serve.yaml index 73f5b9c2b5d..4c5a2c984c5 100644 --- a/llm/gemma/serve.yaml +++ b/llm/gemma/serve.yaml @@ -17,7 +17,7 @@ service: envs: MODEL_NAME: google/gemma-7b-it - HF_TOKEN: # TODO: Replace with huggingface token + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} diff --git a/llm/llama-2/README.md b/llm/llama-2/README.md index 7b20ea4aed7..d8f8151572e 100644 --- a/llm/llama-2/README.md +++ b/llm/llama-2/README.md @@ -33,7 +33,7 @@ Fill the access token in the [chatbot-hf.yaml](https://github.com/skypilot-org/s ```yaml envs: MODEL_SIZE: 7 - HF_TOKEN: + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 
``` diff --git a/llm/llama-2/chatbot-hf.yaml b/llm/llama-2/chatbot-hf.yaml index 992c01346e6..ee9d0281296 100644 --- a/llm/llama-2/chatbot-hf.yaml +++ b/llm/llama-2/chatbot-hf.yaml @@ -6,7 +6,7 @@ resources: envs: MODEL_SIZE: 7 - HF_TOKEN: # TODO: Replace with huggingface token + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. setup: | conda activate chatbot diff --git a/llm/llama-2/chatbot-meta.yaml b/llm/llama-2/chatbot-meta.yaml index a0481fe760f..733a2a867d2 100644 --- a/llm/llama-2/chatbot-meta.yaml +++ b/llm/llama-2/chatbot-meta.yaml @@ -6,7 +6,7 @@ resources: envs: MODEL_SIZE: 7 - HF_TOKEN: # TODO: Replace with huggingface token + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. setup: | set -ex diff --git a/llm/llama-3/README.md b/llm/llama-3/README.md index 7b3b6cb56e5..decff6054bf 100644 --- a/llm/llama-3/README.md +++ b/llm/llama-3/README.md @@ -44,7 +44,7 @@ envs: MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct # MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. service: replicas: 2 diff --git a/llm/llama-3/llama3.yaml b/llm/llama-3/llama3.yaml index 0974d4db51b..1e9b236efd4 100644 --- a/llm/llama-3/llama3.yaml +++ b/llm/llama-3/llama3.yaml @@ -59,7 +59,7 @@ envs: MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct # MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. service: replicas: 2 diff --git a/llm/sglang/llama2.yaml b/llm/sglang/llama2.yaml index 08427ab2001..8b58c4365d6 100644 --- a/llm/sglang/llama2.yaml +++ b/llm/sglang/llama2.yaml @@ -6,7 +6,7 @@ service: envs: MODEL_NAME: meta-llama/Llama-2-7b-chat-hf - HF_TOKEN: # Change to your own huggingface token + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. resources: accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1} diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index 0fc5da6c4ba..899792c299d 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -31,7 +31,7 @@ cd skypilot/llm/vicuna-llama-2 Paste the access token into [train.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna-llama-2/train.yaml): ```yaml envs: - HF_TOKEN: # Change to your own huggingface token + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. ``` ## Train your own Vicuna on Llama-2 diff --git a/llm/vicuna-llama-2/train.yaml b/llm/vicuna-llama-2/train.yaml index e23d5797e76..8d35c2dff85 100644 --- a/llm/vicuna-llama-2/train.yaml +++ b/llm/vicuna-llama-2/train.yaml @@ -1,7 +1,7 @@ envs: - HF_TOKEN: # Change to your own huggingface token - ARTIFACT_BUCKET_NAME: YOUR_OWN_BUCKET_NAME # Change to your own bucket name - WANDB_API_KEY: "" # Change to your own wandb api key + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. + ARTIFACT_BUCKET_NAME: # TODO: Fill with your unique bucket name, or use --env to pass. + WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass. 
   MODEL_SIZE: 7
   USE_XFORMERS: 1
diff --git a/llm/vicuna/train.yaml b/llm/vicuna/train.yaml
index c577561e858..a2121aaf8fd 100644
--- a/llm/vicuna/train.yaml
+++ b/llm/vicuna/train.yaml
@@ -1,3 +1,10 @@
+envs:
+  MODEL_SIZE: 7
+  SEQ_LEN: 2048
+  GC_SCALE: 4
+  USE_FLASH_ATTN: 0
+  WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
+
 resources:
   accelerators: A100-80GB:8
   disk_size: 1000
@@ -109,10 +116,3 @@ run: |
     gsutil -m rsync -r -x 'checkpoint-*' $LOCAL_CKPT_PATH/ $CKPT_PATH/
 
   exit $returncode
-
-envs:
-  MODEL_SIZE: 7
-  SEQ_LEN: 2048
-  GC_SCALE: 4
-  USE_FLASH_ATTN: 0
-  WANDB_API_KEY: ""
diff --git a/llm/vllm/serve-openai-api.yaml b/llm/vllm/serve-openai-api.yaml
index 9ddf7b280ba..a68f476edc7 100644
--- a/llm/vllm/serve-openai-api.yaml
+++ b/llm/vllm/serve-openai-api.yaml
@@ -1,6 +1,6 @@
 envs:
   MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: # Change to your own huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 resources:
   accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
diff --git a/llm/vllm/service.yaml b/llm/vllm/service.yaml
index 335f8a50650..1e5d92a60e5 100644
--- a/llm/vllm/service.yaml
+++ b/llm/vllm/service.yaml
@@ -9,7 +9,7 @@ service:
 # Fields below are the same with `serve-openai-api.yaml`.
 envs:
   MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: # Change to your own huggingface token
+  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
 
 resources:
   accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
diff --git a/sky/task.py b/sky/task.py
index b6a71581a15..3dd254838f0 100644
--- a/sky/task.py
+++ b/sky/task.py
@@ -353,8 +353,13 @@ def from_yaml_config(
         # as int causing validate_schema() to fail.
         envs = config.get('envs')
         if envs is not None and isinstance(envs, dict):
-            config['envs'] = {str(k): str(v) for k, v in envs.items()}
-
+            new_envs: Dict[str, Optional[str]] = {}
+            for k, v in envs.items():
+                if v is not None:
+                    new_envs[str(k)] = str(v)
+                else:
+                    new_envs[str(k)] = None
+            config['envs'] = new_envs
         common_utils.validate_schema(config, schemas.get_task_schema(),
                                      'Invalid task YAML: ')
         if env_overrides is not None:
@@ -368,6 +373,15 @@ def from_yaml_config(
             new_envs.update(env_overrides)
             config['envs'] = new_envs
 
+        for k, v in config.get('envs', {}).items():
+            if v is None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Environment variable {k!r} is None. Please set a '
+                        'value for it in the task YAML or with the --env flag. '
+                        'To set it to an empty value, use an empty string '
+                        f'({k}: "" in the task YAML or --env {k}="" in the CLI).')
+
         # Fill in any Task.envs into file_mounts (src/dst paths, storage
         # name/source).
         if config.get('file_mounts') is not None:
diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py
index c50e15185a3..878fe67178e 100644
--- a/sky/utils/schemas.py
+++ b/sky/utils/schemas.py
@@ -402,7 +402,7 @@ def get_task_schema():
             'patternProperties': {
                 # Checks env keys are valid env var names.
                 '^[a-zA-Z_][a-zA-Z0-9_]*$': {
-                    'type': 'string'
+                    'type': ['string', 'null']
                 }
             },
             'additionalProperties': False,
diff --git a/tests/test_yaml_parser.py b/tests/test_yaml_parser.py
index 0338084925e..1453cfe1620 100644
--- a/tests/test_yaml_parser.py
+++ b/tests/test_yaml_parser.py
@@ -134,3 +134,15 @@ def test_invalid_envs_type(tmp_path):
     with pytest.raises(ValueError) as e:
         Task.from_yaml(config_path)
     assert 'is not of type \'dict\'' in e.value.args[0]
+
+
+def test_invalid_null_envs(tmp_path):
+    config_path = _create_config_file(
+        textwrap.dedent("""\
+            envs:
+              env_key1: abc
+              env_key2:
+            """), tmp_path)
+    with pytest.raises(ValueError) as e:
+        Task.from_yaml(config_path)
+    assert 'Environment variable \'env_key2\' is None.' in e.value.args[0]
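As a complement to the negative test above, a hedged sketch of the happy path: a null env var passes once an override (what `--env` supplies) is merged in, since overrides are applied before the None check runs. This assumes `Task.from_yaml_config` accepts `env_overrides` as key/value pairs, as the `sky/task.py` hunk suggests:

```python
# Hedged sketch, not part of the patch. A null env var fails validation on
# its own, but passes once an override (the CLI's --env) supplies a value,
# because overrides are merged before the None check in from_yaml_config.
# Assumes env_overrides takes (key, value) pairs, per the hunk above.
import pytest

from sky.task import Task

config = {'run': 'echo "$HF_TOKEN"', 'envs': {'HF_TOKEN': None}}

with pytest.raises(ValueError, match='HF_TOKEN'):
    Task.from_yaml_config(config)

task = Task.from_yaml_config(config, env_overrides=[('HF_TOKEN', 'hf_xxx')])
assert task.envs['HF_TOKEN'] == 'hf_xxx'
```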