Skip to content

Commit

Permalink
[Provisioner] Support docker in Lambda Cloud and TPU (#4115)
Browse files Browse the repository at this point in the history
* [Provisioner] Support docker in Lambda Cloud

* fix permission issue

* merge with check docker installed

* add tpu support & test

* patch lambda cloud

* add comment
  • Loading branch information
cblmemo authored Oct 20, 2024
1 parent c6ae536 commit 63e96f4
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 26 deletions.
1 change: 0 additions & 1 deletion sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ def make_deploy_resources_variables(
runcmd:
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
- usermod -aG docker skypilot:ssh_user
write_files:
- path: /etc/apt/apt.conf.d/20auto-upgrades
content: |
Expand Down
3 changes: 3 additions & 0 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,9 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
'runtime_version']
resources_vars['tpu_node_name'] = r.accelerator_args.get(
'tpu_name')
# TPU VMs require privileged mode for docker containers to
# access TPU devices.
resources_vars['docker_run_options'] = ['--privileged']
else:
# Convert to GCP names:
# https://cloud.google.com/compute/docs/gpus
Expand Down
14 changes: 9 additions & 5 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ class Lambda(clouds.Cloud):
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
f'Docker image is currently not supported on {_REPR}. '
'You can try running docker command inside the `run` section in task.yaml.'
),
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
Expand Down Expand Up @@ -173,12 +169,20 @@ def make_deploy_resources_variables(
else:
custom_resources = None

return {
resources_vars = {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
'region': region.name,
}

if acc_dict is not None:
# Lambda cloud's docker runtime information does not contain
# 'nvidia-container-runtime', so no GPU option would be added to
# the docker run command. We patch this by adding `--gpus all` here.
resources_vars['docker_run_options'] = ['--gpus all']

return resources_vars

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
Expand Down
19 changes: 12 additions & 7 deletions sky/provision/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,12 +253,13 @@ def initialize(self) -> str:
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self._run(
'[ -f /etc/docker/daemon.json ] || '
'{ which jq || sudo apt update && sudo apt install -y jq; } && '
'{ [ -f /etc/docker/daemon.json ] || '
'echo "{}" | sudo tee /etc/docker/daemon.json;'
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker')
'sudo systemctl restart docker; } || true')
user_docker_run_options = self.docker_config.get('run_options', [])
start_command = docker_start_cmds(
specific_image,
Expand Down Expand Up @@ -335,7 +336,11 @@ def initialize(self) -> str:

def _check_docker_installed(self):
no_exist = 'NoExist'
# SkyPilot: Add the current user to the docker group first (if needed),
# before checking if docker is installed to avoid permission issues.
cleaned_output = self._run(
'id -nG $USER | grep -qw docker || '
'sudo usermod -aG docker $USER > /dev/null 2>&1;'
f'command -v {self.docker_cmd} || echo {no_exist!r}')
if no_exist in cleaned_output or 'docker' not in cleaned_output:
logger.error(
Expand Down Expand Up @@ -424,8 +429,8 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
def _check_container_exited(self) -> bool:
if self.initialized:
return True
output = (self._run(check_docker_running_cmd(self.container_name,
self.docker_cmd),
wait_for_docker_daemon=True))
return 'false' in output.lower(
) and 'no such object' not in output.lower()
output = self._run(check_docker_running_cmd(self.container_name,
self.docker_cmd),
wait_for_docker_daemon=True)
return ('false' in output.lower() and
'no such object' not in output.lower())
2 changes: 2 additions & 0 deletions sky/provision/paperspace/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ def set_sky_key_script(self, public_key: str) -> None:
'apt-get update \n'
'apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \n' # pylint: disable=line-too-long
'fi \n'
# TODO(tian): Maybe remove this as well since we are now adding
# users to docker group in the DockerInitializer. Need to test.
'usermod -aG docker paperspace \n'
f'echo "{public_key}" >> /home/paperspace/.ssh/authorized_keys \n')
try:
Expand Down
23 changes: 11 additions & 12 deletions sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,12 +842,6 @@ def _try_validate_image_id(self) -> None:

if self.extract_docker_image() is not None:
# TODO(tian): validate the docker image exists / of reasonable size
if self.accelerators is not None:
for acc in self.accelerators.keys():
if acc.lower().startswith('tpu'):
with ux_utils.print_exception_no_traceback():
raise ValueError(
'Docker image is not supported for TPU VM.')
if self.cloud is not None:
self.cloud.check_features_are_supported(
self, {clouds.CloudImplementationFeatures.DOCKER_IMAGE})
Expand Down Expand Up @@ -1032,25 +1026,30 @@ def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
self.accelerators is not None):
initial_setup_commands = [constants.DISABLE_GPU_ECC_COMMAND]

docker_image = self.extract_docker_image()

# Cloud specific variables
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
self, cluster_name, region, zones, dryrun)

# Docker run options
docker_run_options = skypilot_config.get_nested(
('docker', 'run_options'),
default_value=[],
override_configs=self.cluster_config_overrides)
if isinstance(docker_run_options, str):
docker_run_options = [docker_run_options]
# Special accelerator runtimes might require additional docker run
# options, e.g., TPU VMs need --privileged.
if 'docker_run_options' in cloud_specific_variables:
docker_run_options.extend(
cloud_specific_variables['docker_run_options'])
if docker_run_options and isinstance(self.cloud, clouds.Kubernetes):
logger.warning(
f'{colorama.Style.DIM}Docker run options are specified, '
'but ignored for Kubernetes: '
f'{" ".join(docker_run_options)}'
f'{colorama.Style.RESET_ALL}')

docker_image = self.extract_docker_image()

# Cloud specific variables
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
self, cluster_name, region, zones, dryrun)
return dict(
cloud_specific_variables,
**{
Expand Down
20 changes: 20 additions & 0 deletions sky/templates/lambda-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@ max_workers: {{num_nodes - 1}}
upscaling_speed: {{num_nodes - 1}}
idle_timeout_minutes: 60

{%- if docker_image is not none %}
docker:
image: {{docker_image}}
container_name: {{docker_container_name}}
run_options:
- --ulimit nofile=1048576:1048576
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
{{docker_login_config.username}}
password: |-
{{docker_login_config.password}}
server: |-
{{docker_login_config.server}}
{%- endif %}
{%- endif %}

provider:
type: external
module: sky.provision.lambda
Expand Down
4 changes: 3 additions & 1 deletion sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,8 +502,10 @@ def close_cached_connection(self) -> None:
if self.ssh_control_name is not None:
control_path = _ssh_control_path(self.ssh_control_name)
if control_path is not None:
# Suppress the `Exit request sent.` output for this command
# which would interrupt the CLI spinner.
cmd = (f'ssh -O exit -S {control_path}/%C '
f'{self.ssh_user}@{self.ip}')
f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
logger.debug(f'Closing cached connection {control_path!r} with '
f'cmd: {cmd}')
log_lib.run_with_log(cmd,
Expand Down

0 comments on commit 63e96f4

Please sign in to comment.