From 9c70fce3721eeb739d017d505bf3de88390b3800 Mon Sep 17 00:00:00 2001
From: Sean
Date: Mon, 1 Jul 2024 21:24:33 +0100
Subject: [PATCH] [Cudo] Update and bugfixes (#3256)

* bug fixes and improvements

* moved shared function to helper, added error message

* moved catalog helper to utils

* small fixes

* fetch cudo fix

* id fix for vms.csv file

* format fix
---
 .../data_fetchers/fetch_cudo.py    | 125 ++----------------
 sky/provision/cudo/cudo_utils.py   | 112 ++++++++++++++++
 sky/provision/cudo/cudo_wrapper.py |  53 +++++---
 sky/provision/cudo/instance.py     |  23 ++--
 4 files changed, 172 insertions(+), 141 deletions(-)
 create mode 100644 sky/provision/cudo/cudo_utils.py

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
index b15570ddcbc..617751d865a 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
@@ -9,98 +9,9 @@

 import cudo_compute

-VMS_CSV = 'cudo/vms.csv'
+import sky.provision.cudo.cudo_utils as utils

-cudo_gpu_model = {
-    'NVIDIA V100': 'V100',
-    'NVIDIA A40': 'A40',
-    'RTX 3080': 'RTX3080',
-    'RTX A4000': 'RTXA4000',
-    'RTX A4500': 'RTXA4500',
-    'RTX A5000': 'RTXA5000',
-    'RTX A6000': 'RTXA6000',
-}
-
-cudo_gpu_mem = {
-    'RTX3080': 12,
-    'A40': 48,
-    'RTXA4000': 16,
-    'RTXA4500': 20,
-    'RTXA5000': 24,
-    'RTXA6000': 48,
-    'V100': 16,
-}
-
-machine_specs = [
-    # Low
-    {
-        'vcpu': 2,
-        'mem': 4,
-        'gpu': 1,
-    },
-    {
-        'vcpu': 4,
-        'mem': 8,
-        'gpu': 1,
-    },
-    {
-        'vcpu': 8,
-        'mem': 16,
-        'gpu': 2,
-    },
-    {
-        'vcpu': 16,
-        'mem': 32,
-        'gpu': 2,
-    },
-    {
-        'vcpu': 32,
-        'mem': 64,
-        'gpu': 4,
-    },
-    {
-        'vcpu': 64,
-        'mem': 128,
-        'gpu': 8,
-    },
-    # Mid
-    {
-        'vcpu': 96,
-        'mem': 192,
-        'gpu': 8
-    },
-    {
-        'vcpu': 48,
-        'mem': 96,
-        'gpu': 4
-    },
-    {
-        'vcpu': 24,
-        'mem': 48,
-        'gpu': 2
-    },
-    {
-        'vcpu': 12,
-        'mem': 24,
-        'gpu': 1
-    },
-    # Hi
-    {
-        'vcpu': 96,
-        'mem': 192,
-        'gpu': 4
-    },
-    {
-        'vcpu': 48,
-        'mem': 96,
-        'gpu': 2
-    },
-    {
-        'vcpu': 24,
-        'mem': 48,
-        'gpu': 1
-    },
-]
+VMS_CSV = 'cudo/vms.csv'


 def cudo_api():
@@ -110,28 +21,8 @@ def cudo_api():
     return cudo_compute.VirtualMachinesApi(client)


-def cudo_gpu_to_skypilot_gpu(model):
-    if model in cudo_gpu_model:
-        return cudo_gpu_model[model]
-    else:
-        return model
-
-
-def skypilot_gpu_to_cudo_gpu(model):
-    for key, value in cudo_gpu_model.items():
-        if value == model:
-            return key
-    return model
-
-
-def gpu_exists(model):
-    if model in cudo_gpu_model:
-        return True
-    return False
-
-
 def get_gpu_info(count, model):
-    mem = cudo_gpu_mem[model]
+    mem = utils.cudo_gpu_mem[model]
     # pylint: disable=line-too-long
     # {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
     info = {
@@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):

 def update_prices():
     rows = []
-    for spec in machine_specs:
+    for spec in utils.machine_specs:
         mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
         for hc in mts['host_configs']:
-            if not gpu_exists(hc['gpu_model']):
+            if not utils.gpu_exists(hc['gpu_model']):
                 continue
-            accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
+            accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
             row = {
                 'instance_type': get_instance_type(hc['machine_type'],
-                                                   spec['gpu'], spec['vcpu'],
-                                                   spec['mem']),
+                                                   spec['vcpu'], spec['mem'],
+                                                   spec['gpu']),
                 'accelerator_name': accelerator_name,
                 'accelerator_count': str(spec['gpu']) + '.0',
                 'vcpus': str(spec['vcpu']),
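For context, here is a minimal sketch (not part of the patch) of how the fetcher is expected to use the GPU-name helpers that this change relocates into sky/provision/cudo/cudo_utils.py. All names come from the patch itself; only the 'H100' lookup is an illustrative miss chosen for the example.

import sky.provision.cudo.cudo_utils as utils

# Cudo's catalog model name maps to the SkyPilot accelerator name and back.
assert utils.cudo_gpu_to_skypilot_gpu('RTX A4000') == 'RTXA4000'
assert utils.skypilot_gpu_to_cudo_gpu('RTXA4000') == 'RTX A4000'

# Unknown models fall through unchanged, and gpu_exists() is what lets
# update_prices() skip host configs the catalog does not cover.
assert utils.cudo_gpu_to_skypilot_gpu('H100') == 'H100'
assert not utils.gpu_exists('H100')
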
diff --git a/sky/provision/cudo/cudo_utils.py b/sky/provision/cudo/cudo_utils.py
new file mode 100644
index 00000000000..d4ef7f9e415
--- /dev/null
+++ b/sky/provision/cudo/cudo_utils.py
@@ -0,0 +1,112 @@
+"""Cudo catalog helper."""
+
+cudo_gpu_model = {
+    'NVIDIA V100': 'V100',
+    'NVIDIA A40': 'A40',
+    'RTX 3080': 'RTX3080',
+    'RTX A4000': 'RTXA4000',
+    'RTX A4500': 'RTXA4500',
+    'RTX A5000': 'RTXA5000',
+    'RTX A6000': 'RTXA6000',
+}
+
+cudo_gpu_mem = {
+    'RTX3080': 12,
+    'A40': 48,
+    'RTXA4000': 16,
+    'RTXA4500': 20,
+    'RTXA5000': 24,
+    'RTXA6000': 48,
+    'V100': 16,
+}
+
+machine_specs = [
+    # Low
+    {
+        'vcpu': 2,
+        'mem': 4,
+        'gpu': 1,
+    },
+    {
+        'vcpu': 4,
+        'mem': 8,
+        'gpu': 1,
+    },
+    {
+        'vcpu': 8,
+        'mem': 16,
+        'gpu': 2,
+    },
+    {
+        'vcpu': 16,
+        'mem': 32,
+        'gpu': 2,
+    },
+    {
+        'vcpu': 32,
+        'mem': 64,
+        'gpu': 4,
+    },
+    {
+        'vcpu': 64,
+        'mem': 128,
+        'gpu': 8,
+    },
+    # Mid
+    {
+        'vcpu': 96,
+        'mem': 192,
+        'gpu': 8
+    },
+    {
+        'vcpu': 48,
+        'mem': 96,
+        'gpu': 4
+    },
+    {
+        'vcpu': 24,
+        'mem': 48,
+        'gpu': 2
+    },
+    {
+        'vcpu': 12,
+        'mem': 24,
+        'gpu': 1
+    },
+    # Hi
+    {
+        'vcpu': 96,
+        'mem': 192,
+        'gpu': 4
+    },
+    {
+        'vcpu': 48,
+        'mem': 96,
+        'gpu': 2
+    },
+    {
+        'vcpu': 24,
+        'mem': 48,
+        'gpu': 1
+    },
+]
+
+
+def cudo_gpu_to_skypilot_gpu(model):
+    if model in cudo_gpu_model:
+        return cudo_gpu_model[model]
+    else:
+        return model
+
+
+def skypilot_gpu_to_cudo_gpu(model):
+    for key, value in cudo_gpu_model.items():
+        if value == model:
+            return key
+    return model
+
+
+def gpu_exists(model):
+    if model in cudo_gpu_model:
+        return True
+    return False

diff --git a/sky/provision/cudo/cudo_wrapper.py b/sky/provision/cudo/cudo_wrapper.py
index 691c69bda8c..eac39d9faed 100644
--- a/sky/provision/cudo/cudo_wrapper.py
+++ b/sky/provision/cudo/cudo_wrapper.py
@@ -4,29 +4,29 @@

 from sky import sky_logging
 from sky.adaptors import cudo
+import sky.provision.cudo.cudo_utils as utils

 logger = sky_logging.init_logger(__name__)


 def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
-           memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
+           memory_gib: int, vcpu_count: int, gpu_count: int,
            tags: Dict[str, str], disk_size: int):
     """Launches an instance with the given parameters."""
-    disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
-                          size_gib=disk_size)
-
-    request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
-                                     custom_ssh_keys=[ssh_key],
-                                     vm_id=name,
-                                     machine_type=machine_type,
-                                     data_center_id=data_center_id,
-                                     boot_disk_image_id='ubuntu-nvidia-docker',
-                                     memory_gib=memory_gib,
-                                     vcpus=vcpu_count,
-                                     gpus=gpu_count,
-                                     gpu_model=gpu_model,
-                                     boot_disk=disk,
-                                     metadata=tags)
+
+    request = cudo.cudo.CreateVMBody(
+        ssh_key_source='SSH_KEY_SOURCE_NONE',
+        custom_ssh_keys=[ssh_key],
+        vm_id=name,
+        machine_type=machine_type,
+        data_center_id=data_center_id,
+        boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
+        memory_gib=memory_gib,
+        vcpus=vcpu_count,
+        gpus=gpu_count,
+        boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
+                                 size_gib=disk_size),
+        metadata=tags)

     try:
         api = cudo.cudo.cudo_api.virtual_machines()
@@ -121,3 +121,24 @@ def list_instances():
         return instances
     except cudo.cudo.rest.ApiException as e:
         raise e
+
+
+def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
+                 cpus):
+    try:
+        gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
+        api = cudo.cudo.cudo_api.virtual_machines()
+        types = api.list_vm_machine_types(mem,
+                                          cpus,
+                                          gpu=gpu_count,
+                                          gpu_model=gpu_model,
+                                          data_center_id=data_center_id)
+        types_dict = types.to_dict()
+        hc = types_dict['host_configs']
+        total_count = sum(item['count_vm_available'] for item in hc)
+        if total_count < to_start_count:
+            raise Exception(
+                'Too many VMs requested, try another gpu type or region')
+        return total_count
+    except cudo.cudo.rest.ApiException as e:
+        raise e
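A hedged, caller-side sketch of the new vm_available() capacity check added above. It is illustrative only: running it requires a configured Cudo account, and the data center id and shape values below are made-up placeholders, not values from the patch.

from sky.provision.cudo import cudo_wrapper

try:
    # Raises if fewer than the requested number of VMs can be provisioned
    # with this GPU model in the chosen data center.
    cudo_wrapper.vm_available(
        to_start_count=2,              # nodes we want to start
        gpu_count=1,                   # GPUs per node
        gpu_model='RTXA4000',          # SkyPilot name; converted internally
        data_center_id='gb-region-1',  # hypothetical data center id
        mem=16,
        cpus=4)
except Exception as e:
    print(f'Capacity check failed: {e}')
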
diff --git a/sky/provision/cudo/instance.py b/sky/provision/cudo/instance.py
index 105f82a3a37..db306ddaa19 100644
--- a/sky/provision/cudo/instance.py
+++ b/sky/provision/cudo/instance.py
@@ -16,7 +16,6 @@

 def _filter_instances(cluster_name_on_cloud: str,
                       status_filters: Optional[List[str]]) -> Dict[str, Any]:
-
     instances = cudo_wrapper.list_instances()
     possible_names = [
         f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
@@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     created_instance_ids = []
     public_key = config.node_config['AuthorizedKey']
-
+    instance_type = config.node_config['InstanceType']
+    spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
+    gpu_count = int(float(spec['gpu_count']))
+    vcpu_count = int(spec['vcpu_count'])
+    memory_gib = int(spec['mem_gb'])
+    gpu_model = spec['gpu_model']
+    try:
+        cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
+                                  memory_gib, vcpu_count)
+    except Exception as e:
+        logger.warning(f'run_instances: {e}')
+        raise
     for _ in range(to_start_count):
-        instance_type = config.node_config['InstanceType']
-        spec = cudo_machine_type.get_spec_from_instance(instance_type, region)

         node_type = 'head' if head_instance_id is None else 'worker'

         try:
@@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 ssh_key=public_key,
                 data_center_id=region,
                 machine_type=spec['machine_type'],
-                memory_gib=int(spec['mem_gb']),
-                vcpu_count=int(spec['vcpu_count']),
-                gpu_count=int(float(spec['gpu_count'])),
-                gpu_model=spec['gpu_model'],
+                memory_gib=memory_gib,
+                vcpu_count=vcpu_count,
+                gpu_count=gpu_count,
                 tags={},
                 disk_size=config.node_config['DiskSize'])
         except Exception as e:  # pylint: disable=broad-except
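Finally, a small sketch of the one-time spec conversion that run_instances now performs before the launch loop. The dictionary keys mirror what the patch reads from cudo_machine_type.get_spec_from_instance(); the values and the machine type string are illustrative, not real catalog data.

# Hypothetical spec dict; values are made up for illustration.
spec = {
    'machine_type': 'epyc-rome-rtx-a4000',  # hypothetical machine type
    'gpu_count': '1.0',   # the catalog stores accelerator counts as '<n>.0'
    'vcpu_count': '4',
    'mem_gb': '16',
    'gpu_model': 'RTXA4000',
}

gpu_count = int(float(spec['gpu_count']))  # '1.0' -> 1
vcpu_count = int(spec['vcpu_count'])
memory_gib = int(spec['mem_gb'])
gpu_model = spec['gpu_model']

Converting once up front lets the same integers be passed both to the vm_available() capacity check and to every cudo_wrapper.launch() call in the loop, instead of re-parsing the spec per node as the old code did.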