From 067a0a35dfcb4789b1f72a083a7309814a807e80 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 20 Oct 2024 18:26:46 -0700 Subject: [PATCH 1/2] [examples] Deepspeed fixes + k8s support (#4124) deepspeed kubernetes fixes --- examples/deepspeed-multinode/sky.yaml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/examples/deepspeed-multinode/sky.yaml b/examples/deepspeed-multinode/sky.yaml index 37d7445a2a1..07bd3746894 100644 --- a/examples/deepspeed-multinode/sky.yaml +++ b/examples/deepspeed-multinode/sky.yaml @@ -2,10 +2,16 @@ # # This takes care constructing a "hostfile" to pass to DeepSpeed. # +# If running on Kubernetes, use the nvidia/cuda:12.1.1-devel-ubuntu20.04 image +# because DeepSpeed requires nvcc. +# # Usage: # # $ sky launch sky.yaml -r --down -c ds # +# If running on Kubernetes: +# $ sky launch sky.yaml -r --down -c ds --cloud kubernetes --image nvidia/cuda:12.1.1-devel-ubuntu20.04 +# # # Optional: After the job starts running, you can log into the two nodes and # # check gpustat: # $ ssh ds @@ -18,6 +24,7 @@ resources: # accelerators: A100-80GB:1 # Azure, GCP, SCP # accelerators: A10G:1 # AWS. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh. # accelerators: T4:1 # AWS, Azure, GCP. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh. + # image_id: docker:nvidia/cuda:12.1.1-devel-ubuntu20.04 # Use this image if running on Kubernetes num_nodes: 2 @@ -28,6 +35,13 @@ envs: DEEPSPEED_ENVS: "MY_VAR_1,MY_VAR_2,SKYPILOT_NODE_RANK" setup: | + if ! command -v git &> /dev/null + then + echo "git is not installed. Installing git..." 
+ sudo apt-get update + sudo apt-get install -y git + fi + git clone https://github.com/microsoft/DeepSpeedExamples.git || true cd DeepSpeedExamples git checkout d7c42b4f34df91035e7ed3e0c51500bb53d0bc71 @@ -39,16 +53,19 @@ setup: | conda create -n deepspeed python=3.8 -y conda activate deepspeed - pip install deepspeed + pip install deepspeed==0.14.4 cd applications/DeepSpeed-Chat pip install -r requirements.txt + + pip install transformers==4.44.0 # Required by DeepSpeed in multi-node settings. # # NOTE(skypilot): DeepSpeed uses `pdsh` to log into each node and calls # `ninja --version`; so it has to be installed system-wide rather than in # the above 'deepspeed' conda env. + sudo apt-get update sudo apt-get -y install pdsh ninja-build fi From 3c3bcee5cfe720a96ab67f4049a557a79e7f077f Mon Sep 17 00:00:00 2001 From: Hysun He Date: Mon, 21 Oct 2024 12:13:51 +0800 Subject: [PATCH 2/2] [OCI] Support more OS types in addition to ubuntu (#4080) * Bug fix for sky config file path resolution. * format * [OCI] Bug fix for image_id in Task YAML * [OCI]: Support more OS types (esp. oraclelinux) in addition to ubuntu. * format * Disable system firewall * Bug fix for validation of the Marketplace images * Update sky/clouds/oci.py Co-authored-by: Zhanghao Wu * Update sky/clouds/oci.py Co-authored-by: Zhanghao Wu * variable/function naming * address review comments: not to change the service_catalog api. call oci_catalog directly for get os type for a image. 
* Update sky/clouds/oci.py Co-authored-by: Zhanghao Wu * Update sky/clouds/oci.py Co-authored-by: Zhanghao Wu * Update sky/clouds/oci.py Co-authored-by: Zhanghao Wu * address review comments --------- Co-authored-by: Zhanghao Wu --- sky/clouds/oci.py | 73 +++++++++++------------ sky/clouds/service_catalog/oci_catalog.py | 22 +++++++ sky/clouds/utils/oci_utils.py | 12 +++- sky/templates/oci-ray.yml.j2 | 14 ++++- 4 files changed, 81 insertions(+), 40 deletions(-) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index f4ac4d577e3..810e43fe3b5 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -17,6 +17,8 @@ make_deploy_resources_variables(): Bug fix for specify the image_id as the ocid of the image in the task.yaml file, in this case the image_id for the node config should be set to the ocid instead of a dict. + - Hysun He (hysun.he@oracle.com) @ Oct 13, 2024: + Support more OS types additional to ubuntu for OCI resources. """ import json import logging @@ -295,10 +297,21 @@ def make_deploy_resources_variables( cpus=None if cpus is None else float(cpus), disk_tier=resources.disk_tier) + image_str = self._get_image_str(image_id=resources.image_id, + instance_type=resources.instance_type, + region=region.name) + + # pylint: disable=import-outside-toplevel + from sky.clouds.service_catalog import oci_catalog + os_type = oci_catalog.get_image_os_from_tag(tag=image_str, + region=region.name) + logger.debug(f'OS type for the image {image_str} is {os_type}') + return { 'instance_type': instance_type, 'custom_resources': custom_resources, 'region': region.name, + 'os_type': os_type, 'cpus': str(cpus), 'memory': resources.memory, 'disk_size': resources.disk_size, @@ -501,59 +514,45 @@ def _get_image_id( region_name: str, instance_type: str, ) -> str: - if image_id is None: - return self._get_default_image(region_name=region_name, - instance_type=instance_type) - if None in image_id: - image_id_str = image_id[None] - else: - assert region_name in image_id, image_id - 
image_id_str = image_id[region_name] + image_id_str = self._get_image_str(image_id=image_id, + instance_type=instance_type, + region=region_name) + if image_id_str.startswith('skypilot:'): image_id_str = service_catalog.get_image_id_from_tag(image_id_str, region_name, clouds='oci') - if image_id_str is None: - logger.critical( - '! Real image_id not found! - {region_name}:{image_id}') - # Raise ResourcesUnavailableError to make sure the failover - # in CloudVMRayBackend will be correctly triggered. - # TODO(zhwu): This is a information leakage to the cloud - # implementor, we need to find a better way to handle this. - raise exceptions.ResourcesUnavailableError( - '! ERR: No image found in catalog for region ' - f'{region_name}. Try setting a valid image_id.') + + # image_id_str can only be None here when the user specifies an + # image tag that does not exist in the image.csv catalog file, + # which is only possible in the "test" / "evaluation" phase. + # Therefore, we use assert here. 
+ assert image_id_str is not None logger.debug(f'Got real image_id {image_id_str}') return image_id_str - def _get_default_image(self, region_name: str, instance_type: str) -> str: + def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]], + instance_type: str, region: str): + if image_id is None: + image_str = self._get_default_image_tag(instance_type) + elif None in image_id: + image_str = image_id[None] + else: + assert region in image_id, image_id + image_str = image_id[region] + return image_str + + def _get_default_image_tag(self, instance_type: str) -> str: acc = self.get_accelerators_from_instance_type(instance_type) if acc is None: image_tag = oci_utils.oci_config.get_default_image_tag() - image_id_str = service_catalog.get_image_id_from_tag(image_tag, - region_name, - clouds='oci') else: assert len(acc) == 1, acc image_tag = oci_utils.oci_config.get_default_gpu_image_tag() - image_id_str = service_catalog.get_image_id_from_tag(image_tag, - region_name, - clouds='oci') - if image_id_str is not None: - logger.debug( - f'Got default image_id {image_id_str} from tag {image_tag}') - return image_id_str - - # Raise ResourcesUnavailableError to make sure the failover in - # CloudVMRayBackend will be correctly triggered. - # TODO(zhwu): This is a information leakage to the cloud implementor, - # we need to find a better way to handle this. - raise exceptions.ResourcesUnavailableError( - 'ERR: No image found in catalog for region ' - f'{region_name}. 
Try update your default image_id settings.') + return image_tag def get_vpu_from_disktier( self, cpus: Optional[float], diff --git a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index a18dee79be5..47d0489f6ab 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -7,6 +7,8 @@ - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation - Hysun He (hysun.he@oracle.com) @ Jun, 2023: Reduce retry times by excluding those unsubscribed regions. + - Hysun He (hysun.he@oracle.com) @ Oct 14, 2024: Bug fix for validation + of the Marketplace images """ import logging @@ -206,4 +208,24 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]: def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: """Returns whether the image tag is valid.""" + # Oct.14, 2024 by Hysun He: Marketplace images are region neutral, so don't + # check with region for the Marketplace images. + df = _image_df[_image_df['Tag'].str.fullmatch(tag)] + if df.empty: + return False + app_catalog_listing_id = df['AppCatalogListingId'].iloc[0] + if app_catalog_listing_id: + return True return common.is_image_tag_valid_impl(_image_df, tag, region) + + +def get_image_os_from_tag(tag: str, region: Optional[str]) -> Optional[str]: + del region + df = _image_df[_image_df['Tag'].str.fullmatch(tag)] + if df.empty: + os_type = oci_utils.oci_config.get_default_image_os() + else: + os_type = df['OS'].iloc[0] + + logger.debug(f'Operation system for the image {tag} is {os_type}') + return os_type diff --git a/sky/clouds/utils/oci_utils.py b/sky/clouds/utils/oci_utils.py index 3d11bab24da..86647071f3e 100644 --- a/sky/clouds/utils/oci_utils.py +++ b/sky/clouds/utils/oci_utils.py @@ -1,7 +1,9 @@ """OCI Configuration. 
History: - - Zhanghao Wu @ Oct 2023: Formatting and refactoring - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation + - Zhanghao Wu @ Oct 2023: Formatting and refactoring + - Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS + configuration. """ import logging import os @@ -121,5 +123,13 @@ def get_profile(cls) -> str: return skypilot_config.get_nested( ('oci', 'default', 'oci_config_profile'), 'DEFAULT') + @classmethod + def get_default_image_os(cls) -> str: + # Return the default image OS type. Rather than hardcoding it, the + # value can be set in the user's sky config file; if it is not set + # there, fall back to the hardcoded default ('ubuntu'). + return skypilot_config.get_nested(('oci', 'default', 'image_os_type'), + 'ubuntu') + oci_config = OCIConfig() diff --git a/sky/templates/oci-ray.yml.j2 b/sky/templates/oci-ray.yml.j2 index 32bd6326ee2..64fa4e745c7 100644 --- a/sky/templates/oci-ray.yml.j2 +++ b/sky/templates/oci-ray.yml.j2 @@ -16,7 +16,11 @@ provider: disable_launch_config_check: true auth: +{% if os_type == "ubuntu" %} ssh_user: ubuntu +{% else %} + ssh_user: opc +{% endif %} ssh_private_key: {{ssh_private_key}} available_node_types: @@ -85,14 +89,20 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - echo "setup commands runs at $(date)" > /tmp/provision.tmp.out || true; + {%- if os_type == "ubuntu" %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; sudo pkill -9 apt-get; sudo pkill -9 dpkg; sudo dpkg --configure -a; - ([ `sudo lshw -class display | grep "NVIDIA Corporation" | wc -l` -gt 0 ]) && (sudo which nvidia-smi > /dev/null || ( sudo apt-get install nvidia-driver-530-open -y && sudo apt-get install nvidia-driver-525-server -y ) || true); + {%- else %} + sudo /usr/libexec/oci-growfs -y || true; + sudo systemctl stop firewalld || true; + sudo systemctl disable firewalld || true; + {%- endif %} mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} {{ ray_skypilot_installation_commands }}