
Commit

Merge branch 'master' of https://github.com/skypilot-org/skypilot into core_remotes

# Conflicts:
#	sky/check.py
romilbhardwaj committed May 16, 2024
2 parents c3bee42 + 4bf71d5 commit 983856e
Showing 28 changed files with 213 additions and 81 deletions.
4 changes: 4 additions & 0 deletions docs/source/getting-started/installation.rst
@@ -164,6 +164,10 @@ section :ref:`below <cloud-account-setup>`.
If your clouds show ``enabled`` --- |:tada:| |:tada:| **Congratulations!** |:tada:| |:tada:| You can now head over to
:ref:`Quickstart <quickstart>` to get started with SkyPilot.

.. tip::

To check credentials only for specific clouds, pass the clouds as arguments: :code:`sky check aws gcp`
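This commit adds a ``clouds`` argument to ``sky.check.check`` to support checking a subset of clouds. A rough standalone sketch of the selection logic, using a stand-in registry dict (hypothetical names, not SkyPilot's actual internals):

```python
# Stand-in for a cloud registry; SkyPilot's real registry maps names to
# cloud objects, but a dict is enough to illustrate the selection logic.
CLOUD_REGISTRY = {"aws": "AWS", "azure": "Azure", "gcp": "GCP"}


def clouds_to_check(clouds=None):
    """Return the clouds to credential-check; all clouds when none are given."""
    if clouds is None:
        return sorted(CLOUD_REGISTRY.values())
    selected = []
    for name in clouds:
        cloud_obj = CLOUD_REGISTRY.get(name.lower())
        assert cloud_obj is not None, f"Cloud {name!r} not found"
        selected.append(cloud_obj)
    return selected
```

With this shape, ``sky check aws gcp`` narrows the check to exactly the named clouds, while a bare ``sky check`` still covers everything.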

.. _cloud-account-setup:

Cloud account setup
14 changes: 14 additions & 0 deletions docs/source/running-jobs/environment-variables.rst
@@ -12,6 +12,20 @@ You can specify environment variables to be made available to a task in two ways
- The ``envs`` field (dict) in a :ref:`task YAML <yaml-spec>`
- The ``--env`` flag in the ``sky launch/exec`` :ref:`CLI <cli>` (takes precedence over the above)

.. tip::

If an environment variable must be supplied with ``--env`` at
``sky launch/exec`` time, set it to ``null`` in the task YAML so that an
error is raised whenever it is left unspecified. For example, the
``WANDB_API_KEY`` and ``HF_TOKEN`` in the following task YAML:

.. code-block:: yaml

   envs:
     WANDB_API_KEY:
     HF_TOKEN: null
     MYVAR: val

The ``file_mounts``, ``setup``, and ``run`` sections of a task YAML can access the variables via the ``${MYVAR}`` syntax.
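As a rough sketch (illustrative only, not SkyPilot's actual implementation), the required-variable behavior described above amounts to merging the YAML ``envs`` with the ``--env`` overrides and rejecting any value still ``null``:

```python
def merge_envs(yaml_envs, cli_envs):
    """Merge task-YAML envs with --env overrides; error on unfilled nulls.

    A YAML value of null (None here) marks a variable the user must
    supply via --env at launch time.
    """
    merged = {**yaml_envs, **cli_envs}  # --env takes precedence
    missing = sorted(k for k, v in merged.items() if v is None)
    if missing:
        raise ValueError(
            f"These variables must be set with --env: {', '.join(missing)}")
    return merged
```

For the YAML above, launching with ``--env WANDB_API_KEY=... --env HF_TOKEN=...`` would satisfy the check, while omitting either raises an error; ``MYVAR`` keeps its YAML default.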

Using in ``file_mounts``
2 changes: 1 addition & 1 deletion examples/serve/llama2/llama2.yaml
@@ -25,7 +25,7 @@ resources:

envs:
MODEL_SIZE: 7
HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

setup: |
conda activate chatbot
4 changes: 2 additions & 2 deletions examples/spot_pipeline/bert_qa_train_eval.yaml
@@ -42,7 +42,7 @@ run: |
echo Model saved to /checkpoint/bert_qa/$SKYPILOT_TASK_ID
envs:
WANDB_API_KEY: # NOTE: Fill in your wandb key
WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.

---

@@ -84,4 +84,4 @@ run: |
--save_steps 1000
envs:
WANDB_API_KEY: # NOTE: Fill in your wandb key
WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
4 changes: 2 additions & 2 deletions llm/axolotl/axolotl-spot.yaml
@@ -38,8 +38,8 @@ run: |
accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml
envs:
HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
BUCKET: <a-unique-bucket-name-to-use>
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass.



2 changes: 1 addition & 1 deletion llm/axolotl/axolotl.yaml
@@ -26,7 +26,7 @@ run: |
accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml
envs:
HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.



2 changes: 1 addition & 1 deletion llm/dbrx/README.md
@@ -22,7 +22,7 @@ In this recipe, you will serve `databricks/dbrx-instruct` on your own infra --
```yaml
envs:
MODEL_NAME: databricks/dbrx-instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

service:
replicas: 2
2 changes: 1 addition & 1 deletion llm/dbrx/dbrx.yaml
@@ -31,7 +31,7 @@

envs:
MODEL_NAME: databricks/dbrx-instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

service:
replicas: 2
4 changes: 2 additions & 2 deletions llm/falcon/falcon.yaml
@@ -7,7 +7,7 @@ workdir: .

envs:
MODEL_NAME: tiiuae/falcon-7b # [ybelkada/falcon-7b-sharded-bf16, tiiuae/falcon-7b, tiiuae/falcon-40b]
WANDB_API_KEY: $WANDB_KEY # Change to your own wandb key
WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
OUTPUT_BUCKET_NAME: # Set a unique name for the bucket which will store model weights

file_mounts:
@@ -39,4 +39,4 @@ run: |
--bnb_4bit_compute_dtype bfloat16 \
--max_steps 500 \
--dataset_name timdettmers/openassistant-guanaco \
--output_dir /results
--output_dir /results
2 changes: 1 addition & 1 deletion llm/gemma/serve.yaml
@@ -17,7 +17,7 @@ service:

envs:
MODEL_NAME: google/gemma-7b-it
HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
2 changes: 1 addition & 1 deletion llm/llama-2/README.md
@@ -33,7 +33,7 @@ Fill the access token in the [chatbot-hf.yaml](https://github.com/skypilot-org/s
```yaml
envs:
MODEL_SIZE: 7
HF_TOKEN: <your-huggingface-token>
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
```
2 changes: 1 addition & 1 deletion llm/llama-2/chatbot-hf.yaml
@@ -6,7 +6,7 @@ resources:

envs:
MODEL_SIZE: 7
HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

setup: |
conda activate chatbot
2 changes: 1 addition & 1 deletion llm/llama-2/chatbot-meta.yaml
@@ -6,7 +6,7 @@ resources:

envs:
MODEL_SIZE: 7
HF_TOKEN: <your-huggingface-token> # TODO: Replace with huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

setup: |
set -ex
2 changes: 1 addition & 1 deletion llm/llama-3/README.md
@@ -44,7 +44,7 @@
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
# MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

service:
replicas: 2
2 changes: 1 addition & 1 deletion llm/llama-3/llama3.yaml
@@ -59,7 +59,7 @@
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
# MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

service:
replicas: 2
2 changes: 1 addition & 1 deletion llm/sglang/llama2.yaml
@@ -6,7 +6,7 @@ service:

envs:
MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
2 changes: 1 addition & 1 deletion llm/vicuna-llama-2/README.md
@@ -31,7 +31,7 @@ cd skypilot/llm/vicuna-llama-2
Paste the access token into [train.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna-llama-2/train.yaml):
```yaml
envs:
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
```
## Train your own Vicuna on Llama-2
6 changes: 3 additions & 3 deletions llm/vicuna-llama-2/train.yaml
@@ -1,7 +1,7 @@
envs:
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
ARTIFACT_BUCKET_NAME: YOUR_OWN_BUCKET_NAME # Change to your own bucket name
WANDB_API_KEY: "" # Change to your own wandb api key
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
ARTIFACT_BUCKET_NAME: # TODO: Fill with your unique bucket name, or use --env to pass.
WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.
MODEL_SIZE: 7
USE_XFORMERS: 1

14 changes: 7 additions & 7 deletions llm/vicuna/train.yaml
@@ -1,3 +1,10 @@
envs:
MODEL_SIZE: 7
SEQ_LEN: 2048
GC_SCALE: 4
USE_FLASH_ATTN: 0
WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass.

resources:
accelerators: A100-80GB:8
disk_size: 1000
@@ -109,10 +116,3 @@ run: |
gsutil -m rsync -r -x 'checkpoint-*' $LOCAL_CKPT_PATH/ $CKPT_PATH/
exit $returncode
envs:
MODEL_SIZE: 7
SEQ_LEN: 2048
GC_SCALE: 4
USE_FLASH_ATTN: 0
WANDB_API_KEY: ""
2 changes: 1 addition & 1 deletion llm/vllm/serve-openai-api.yaml
@@ -1,6 +1,6 @@
envs:
MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
Expand Down
2 changes: 1 addition & 1 deletion llm/vllm/service.yaml
@@ -9,7 +9,7 @@ service:
# Fields below are the same with `serve-openai-api.yaml`.
envs:
MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
Expand Down
85 changes: 56 additions & 29 deletions sky/check.py
@@ -1,27 +1,33 @@
"""Credential checks: check cloud credentials and enable clouds."""
import traceback
from typing import Dict, Iterable, List, Optional, Tuple
from types import ModuleType
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import click
import colorama
import rich

from sky import clouds
from sky import clouds as sky_clouds
from sky import skypilot_config
from sky import exceptions
from sky import global_user_state
from sky.adaptors import cloudflare
from sky.utils import ux_utils


# TODO(zhwu): add check for a single cloud to improve performance
def check(quiet: bool = False, verbose: bool = False) -> None:
def check(
quiet: bool = False,
verbose: bool = False,
clouds: Optional[Tuple[str]] = None,
) -> None:
echo = (lambda *_args, **_kwargs: None) if quiet else click.echo
echo('Checking credentials to enable clouds for SkyPilot.')

enabled_clouds = []
disabled_clouds = []

def check_one_cloud(cloud_tuple: Tuple[str, clouds.Cloud]) -> None:
def check_one_cloud(
cloud_tuple: Tuple[str, Union[sky_clouds.Cloud,
ModuleType]]) -> None:
cloud_repr, cloud = cloud_tuple
echo(f' Checking {cloud_repr}...', nl=False)
try:
@@ -44,35 +50,53 @@ def check_one_cloud(cloud_tuple: Tuple[str, clouds.Cloud]) -> None:
if reason is not None:
echo(f' Hint: {reason}')
else:
disabled_clouds.append(cloud_repr)
echo(f' Reason: {reason}')

# Use candidate_clouds from config if it exists, otherwise check all clouds.
config_candidate_clouds = skypilot_config.get_nested(['candidate_clouds'],
None)
if config_candidate_clouds:
# TODO: Handle cloudflare here since it is not in CLOUD_REGISTRY.
clouds_to_check = [
(cloud_name, clouds.CLOUD_REGISTRY.from_str(cloud_name))
for cloud_name in config_candidate_clouds
]
# Validate config_candidate_clouds
config_candidate_clouds = [repr(sky_clouds.CLOUD_REGISTRY.from_str(c)) for c in config_candidate_clouds]

if clouds is not None:
clouds_to_check: List[Tuple[str, Any]] = []
for cloud in clouds:
if cloud.lower() == 'cloudflare':
clouds_to_check.append(
('Cloudflare, for R2 object store', cloudflare))
else:
cloud_obj = sky_clouds.CLOUD_REGISTRY.from_str(cloud)
assert cloud_obj is not None, f'Cloud {cloud!r} not found'
clouds_to_check.append((repr(cloud_obj), cloud_obj))
else:
clouds_to_check = [
(repr(cloud), cloud) for cloud in clouds.CLOUD_REGISTRY.values()
]
clouds_to_check = [(repr(cloud_obj), cloud_obj)
for cloud_obj in sky_clouds.CLOUD_REGISTRY.values()]
clouds_to_check.append(('Cloudflare, for R2 object store', cloudflare))

for cloud_tuple in sorted(clouds_to_check):
check_one_cloud(cloud_tuple)

# Cloudflare is not a real cloud in clouds.CLOUD_REGISTRY, and should not be
# inserted into the DB (otherwise `sky launch` and other code would error
# out when it's trying to look it up in the registry).
enabled_clouds = [
# Cloudflare is not a real cloud in sky_clouds.CLOUD_REGISTRY, and should
# not be inserted into the DB (otherwise `sky launch` and other code would
# error out when it's trying to look it up in the registry).
enabled_clouds_set = {
cloud for cloud in enabled_clouds if not cloud.startswith('Cloudflare')
]
global_user_state.set_enabled_clouds(enabled_clouds)

if len(enabled_clouds) == 0:
}
disabled_clouds_set = {
cloud for cloud in disabled_clouds if not cloud.startswith('Cloudflare')
}
previously_enabled_clouds_set = {
repr(cloud) for cloud in global_user_state.get_cached_enabled_clouds()
}

# Determine the set of enabled clouds: previously enabled clouds + newly
# enabled clouds - newly disabled clouds.
all_enabled_clouds = ((previously_enabled_clouds_set | enabled_clouds_set) -
disabled_clouds_set)
global_user_state.set_enabled_clouds(list(all_enabled_clouds))
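The enabled-cloud bookkeeping in this hunk reduces to plain set arithmetic; a standalone illustration with toy cloud names:

```python
# Enabled clouds = (previously enabled | newly enabled) - newly disabled.
previously_enabled = {"AWS", "GCP"}   # from the cached state
newly_enabled = {"Azure"}             # passed this run's credential check
newly_disabled = {"GCP"}              # failed this run's credential check

all_enabled = (previously_enabled | newly_enabled) - newly_disabled
print(sorted(all_enabled))  # ['AWS', 'Azure']
```

This is what lets `sky check gcp` demote only GCP while leaving the other cached clouds untouched.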

if len(all_enabled_clouds) == 0:
echo(
click.style(
'No cloud is enabled. SkyPilot will not be able to run any '
@@ -81,25 +105,27 @@ def check_one_cloud(cloud_tuple: Tuple[str, clouds.Cloud]) -> None:
bold=True))
raise SystemExit()
else:
clouds_arg = (' ' +
' '.join(disabled_clouds) if clouds is not None else '')
echo(
click.style(
'\nTo enable a cloud, follow the hints above and rerun: ',
dim=True) + click.style('sky check', bold=True) + '\n' +
click.style(
dim=True) + click.style(f'sky check{clouds_arg}', bold=True) +
'\n' + click.style(
'If any problems remain, refer to detailed docs at: '
'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html', # pylint: disable=line-too-long
dim=True))

# Pretty print for UX.
if not quiet:
enabled_clouds_str = '\n :heavy_check_mark: '.join(
[''] + sorted(enabled_clouds))
[''] + sorted(all_enabled_clouds))
rich.print('\n[green]:tada: Enabled clouds :tada:'
f'{enabled_clouds_str}[/green]')


def get_cached_enabled_clouds_or_refresh(
raise_if_no_cloud_access: bool = False) -> List[clouds.Cloud]:
raise_if_no_cloud_access: bool = False) -> List[sky_clouds.Cloud]:
"""Returns cached enabled clouds and if no cloud is enabled, refresh.
This function will perform a refresh if no public cloud is enabled.
@@ -131,7 +157,8 @@ def get_cached_enabled_clouds_or_refresh(


def get_cloud_credential_file_mounts(
excluded_clouds: Optional[Iterable[clouds.Cloud]]) -> Dict[str, str]:
excluded_clouds: Optional[Iterable[sky_clouds.Cloud]]
) -> Dict[str, str]:
"""Returns the files necessary to access all enabled clouds.
Returns a dictionary that will be added to a task's file mounts
@@ -141,7 +168,7 @@ def get_cloud_credential_file_mounts(
file_mounts = {}
for cloud in enabled_clouds:
if (excluded_clouds is not None and
clouds.cloud_in_iterable(cloud, excluded_clouds)):
sky_clouds.cloud_in_iterable(cloud, excluded_clouds)):
continue
cloud_file_mounts = cloud.get_credential_file_mounts()
file_mounts.update(cloud_file_mounts)
