Skip to content

Commit

Permalink
Merge remote-tracking branch 'skypilot/master' into serve-sync-log
Browse files Browse the repository at this point in the history
  • Loading branch information
root-hbx committed Oct 21, 2024
2 parents a4c143b + 3c3bcee commit e9a2979
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 41 deletions.
19 changes: 18 additions & 1 deletion examples/deepspeed-multinode/sky.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@
#
# This takes care of constructing a "hostfile" to pass to DeepSpeed.
#
# If running on Kubernetes, use the nvidia/cuda:12.1.1-devel-ubuntu20.04 image
# because DeepSpeed requires nvcc.
#
# Usage:
#
# $ sky launch sky.yaml -r --down -c ds
#
# If running on Kubernetes:
# $ sky launch sky.yaml -r --down -c ds --cloud kubernetes --image nvidia/cuda:12.1.1-devel-ubuntu20.04
#
# # Optional: After the job starts running, you can log into the two nodes and
# # check gpustat:
# $ ssh ds
Expand All @@ -18,6 +24,7 @@ resources:
# accelerators: A100-80GB:1 # Azure, GCP, SCP
# accelerators: A10G:1 # AWS. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh.
# accelerators: T4:1 # AWS, Azure, GCP. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh.
# image_id: docker:nvidia/cuda:12.1.1-devel-ubuntu20.04 # Use this image if running on Kubernetes

num_nodes: 2

Expand All @@ -28,6 +35,13 @@ envs:
DEEPSPEED_ENVS: "MY_VAR_1,MY_VAR_2,SKYPILOT_NODE_RANK"

setup: |
if ! command -v git &> /dev/null
then
echo "git is not installed. Installing git..."
sudo apt-get update
sudo apt-get install -y git
fi
git clone https://github.com/microsoft/DeepSpeedExamples.git || true
cd DeepSpeedExamples
git checkout d7c42b4f34df91035e7ed3e0c51500bb53d0bc71
Expand All @@ -39,16 +53,19 @@ setup: |
conda create -n deepspeed python=3.8 -y
conda activate deepspeed
pip install deepspeed
pip install deepspeed==0.14.4
cd applications/DeepSpeed-Chat
pip install -r requirements.txt
pip install transformers==4.44.0
# Required by DeepSpeed in multi-node settings.
#
# NOTE(skypilot): DeepSpeed uses `pdsh` to log into each node and calls
# `ninja --version`; so it has to be installed system-wide rather than in
# the above 'deepspeed' conda env.
sudo apt-get update
sudo apt-get -y install pdsh ninja-build
fi
Expand Down
73 changes: 36 additions & 37 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
make_deploy_resources_variables(): Bug fix for specify the image_id as
the ocid of the image in the task.yaml file, in this case the image_id
for the node config should be set to the ocid instead of a dict.
- Hysun He ([email protected]) @ Oct 13, 2024:
Support more OS types in addition to ubuntu for OCI resources.
"""
import json
import logging
Expand Down Expand Up @@ -295,10 +297,21 @@ def make_deploy_resources_variables(
cpus=None if cpus is None else float(cpus),
disk_tier=resources.disk_tier)

image_str = self._get_image_str(image_id=resources.image_id,
instance_type=resources.instance_type,
region=region.name)

# pylint: disable=import-outside-toplevel
from sky.clouds.service_catalog import oci_catalog
os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
region=region.name)
logger.debug(f'OS type for the image {image_str} is {os_type}')

return {
'instance_type': instance_type,
'custom_resources': custom_resources,
'region': region.name,
'os_type': os_type,
'cpus': str(cpus),
'memory': resources.memory,
'disk_size': resources.disk_size,
Expand Down Expand Up @@ -501,59 +514,45 @@ def _get_image_id(
region_name: str,
instance_type: str,
) -> str:
if image_id is None:
return self._get_default_image(region_name=region_name,
instance_type=instance_type)
if None in image_id:
image_id_str = image_id[None]
else:
assert region_name in image_id, image_id
image_id_str = image_id[region_name]
image_id_str = self._get_image_str(image_id=image_id,
instance_type=instance_type,
region=region_name)

if image_id_str.startswith('skypilot:'):
image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
region_name,
clouds='oci')
if image_id_str is None:
logger.critical(
'! Real image_id not found! - {region_name}:{image_id}')
# Raise ResourcesUnavailableError to make sure the failover
# in CloudVMRayBackend will be correctly triggered.
# TODO(zhwu): This is an information leakage to the cloud
# implementor; we need to find a better way to handle this.
raise exceptions.ResourcesUnavailableError(
'! ERR: No image found in catalog for region '
f'{region_name}. Try setting a valid image_id.')

# The image id should never be None here, except when the user specifies
# an image tag that does not exist in the image.csv catalog file, which is
# only possible in the "test" / "evaluation" phase.
# Therefore, we use assert here.
assert image_id_str is not None

logger.debug(f'Got real image_id {image_id_str}')
return image_id_str

def _get_default_image(self, region_name: str, instance_type: str) -> str:
def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]],
instance_type: str, region: str):
if image_id is None:
image_str = self._get_default_image_tag(instance_type)
elif None in image_id:
image_str = image_id[None]
else:
assert region in image_id, image_id
image_str = image_id[region]
return image_str

# Pick the default image tag for `instance_type`: the general default tag
# when get_accelerators_from_instance_type() returns None, the default GPU
# tag otherwise (exactly one accelerator entry is asserted in that case).
# NOTE(review): as shown here the body also contains catalog lookups that use
# `region_name`, which is not a parameter of this method, followed by a
# second `return`. This looks like removed and added diff lines interleaved;
# confirm against the real file before relying on this text.
def _get_default_image_tag(self, instance_type: str) -> str:
acc = self.get_accelerators_from_instance_type(instance_type)

if acc is None:
image_tag = oci_utils.oci_config.get_default_image_tag()
image_id_str = service_catalog.get_image_id_from_tag(image_tag,
region_name,
clouds='oci')
else:
assert len(acc) == 1, acc
image_tag = oci_utils.oci_config.get_default_gpu_image_tag()
image_id_str = service_catalog.get_image_id_from_tag(image_tag,
region_name,
clouds='oci')

if image_id_str is not None:
logger.debug(
f'Got default image_id {image_id_str} from tag {image_tag}')
return image_id_str

# Raise ResourcesUnavailableError to make sure the failover in
# CloudVMRayBackend will be correctly triggered.
# TODO(zhwu): This is an information leakage to the cloud implementor,
# we need to find a better way to handle this.
raise exceptions.ResourcesUnavailableError(
'ERR: No image found in catalog for region '
f'{region_name}. Try update your default image_id settings.')
return image_tag

def get_vpu_from_disktier(
self, cpus: Optional[float],
Expand Down
22 changes: 22 additions & 0 deletions sky/clouds/service_catalog/oci_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
- Hysun He ([email protected]) @ Apr, 2023: Initial implementation
- Hysun He ([email protected]) @ Jun, 2023: Reduce retry times by
excluding those unsubscribed regions.
- Hysun He ([email protected]) @ Oct 14, 2024: Bug fix for validation
of the Marketplace images
"""

import logging
Expand Down Expand Up @@ -206,4 +208,24 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:

def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
    """Returns whether the image tag is valid.

    Args:
        tag: Image tag to look up in the ``Tag`` column of the image catalog.
        region: Region to validate against; only consulted for images whose
            catalog row has no ``AppCatalogListingId``.

    Returns:
        False if no catalog row has exactly this tag; True if the first
        matching row carries an ``AppCatalogListingId``; otherwise the result
        of ``common.is_image_tag_valid_impl`` for the tag and region.
    """
    # pylint: disable=import-outside-toplevel
    import pandas as pd

    # Oct.14, 2024 by Hysun He: Marketplace images are region neutral, so don't
    # check with region for the Marketplace images.
    # Compare literally: `str.fullmatch` would interpret the tag as a regular
    # expression, so a '.' could match unrelated rows and a '(' would raise.
    df = _image_df[_image_df['Tag'] == tag]
    if df.empty:
        return False
    app_catalog_listing_id = df['AppCatalogListingId'].iloc[0]
    # A missing cell may surface as NaN, which is truthy; treat it the same
    # as an empty listing id so the region check below still runs.
    if not pd.isna(app_catalog_listing_id) and app_catalog_listing_id:
        return True
    return common.is_image_tag_valid_impl(_image_df, tag, region)


def get_image_os_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
    """Returns the OS type recorded in the image catalog for the image tag.

    Args:
        tag: Image tag to look up in the ``Tag`` column of the image catalog.
        region: Unused; accepted only to keep the signature.

    Returns:
        The ``OS`` value of the first catalog row whose tag equals ``tag``.
        When there is no such row, or its ``OS`` cell is missing, the value
        of ``oci_utils.oci_config.get_default_image_os()`` is returned.
    """
    # pylint: disable=import-outside-toplevel
    import pandas as pd

    del region  # Unused.
    # Compare literally: `str.fullmatch` would interpret the tag as a regular
    # expression and could select the row of a different image.
    df = _image_df[_image_df['Tag'] == tag]
    os_type = None if df.empty else df['OS'].iloc[0]
    # Fall back to the default both for unknown tags and for rows whose OS
    # cell is missing (which may surface as NaN).
    if os_type is None or pd.isna(os_type):
        os_type = oci_utils.oci_config.get_default_image_os()

    logger.debug(f'Operation system for the image {tag} is {os_type}')
    return os_type
12 changes: 11 additions & 1 deletion sky/clouds/utils/oci_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""OCI Configuration.
History:
- Zhanghao Wu @ Oct 2023: Formatting and refactoring
- Hysun He ([email protected]) @ Apr, 2023: Initial implementation
- Zhanghao Wu @ Oct 2023: Formatting and refactoring
- Hysun He ([email protected]) @ Oct, 2024: Add default image OS
configuration.
"""
import logging
import os
Expand Down Expand Up @@ -121,5 +123,13 @@ def get_profile(cls) -> str:
return skypilot_config.get_nested(
('oci', 'default', 'oci_config_profile'), 'DEFAULT')

@classmethod
def get_default_image_os(cls) -> str:
    """Return the default image OS type for OCI.

    The value is looked up under ``oci.default.image_os_type`` through
    ``skypilot_config.get_nested`` so that it can be set in the user config
    instead of being hardcoded; ``'ubuntu'`` is the fallback passed to that
    lookup.
    """
    config_keys = ('oci', 'default', 'image_os_type')
    fallback_os = 'ubuntu'
    return skypilot_config.get_nested(config_keys, fallback_os)


oci_config = OCIConfig()
14 changes: 12 additions & 2 deletions sky/templates/oci-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ provider:
disable_launch_config_check: true

auth:
{% if os_type == "ubuntu" %}
ssh_user: ubuntu
{% else %}
ssh_user: opc
{% endif %}
ssh_private_key: {{ssh_private_key}}

available_node_types:
Expand Down Expand Up @@ -85,14 +89,20 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit getting stuck when the number of running ray jobs increases.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo systemctl stop unattended-upgrades || true;
- echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true;
{%- if os_type == "ubuntu" %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
sudo pkill -9 apt-get;
sudo pkill -9 dpkg;
sudo dpkg --configure -a;
([ `sudo lshw -class display | grep "NVIDIA Corporation" | wc -l` -gt 0 ]) && (sudo which nvidia-smi > /dev/null || ( sudo apt-get install nvidia-driver-530-open -y && sudo apt-get install nvidia-driver-525-server -y ) || true);
{%- else %}
sudo /usr/libexec/oci-growfs -y || true;
sudo systemctl stop firewalld || true;
sudo systemctl disable firewalld || true;
{%- endif %}
mkdir -p ~/.ssh; touch ~/.ssh/config;
{{ conda_installation_commands }}
{{ ray_skypilot_installation_commands }}
Expand Down

0 comments on commit e9a2979

Please sign in to comment.