Skip to content

Commit

Permalink
[Cudo] Update and bugfixes (#3256)
Browse files Browse the repository at this point in the history
* bug fixes and improvements

* moved shared function to helper, added error message

* moved catalog helper to utils

* small fixes

* fetch cudo fix

* id fix for vms.csv file

* format fix
  • Loading branch information
JungleCatSW authored and Michaelvll committed Aug 23, 2024
1 parent a0336b3 commit 9c70fce
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 141 deletions.
125 changes: 8 additions & 117 deletions sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,98 +9,9 @@

import cudo_compute

VMS_CSV = 'cudo/vms.csv'
import sky.provision.cudo.cudo_utils as utils

# Lookup table from the GPU model name used on the Cudo side (keys) to the
# accelerator name used on the SkyPilot side (values). Models that are not
# listed here are treated as unknown by gpu_exists().
cudo_gpu_model = {
'NVIDIA V100': 'V100',
'NVIDIA A40': 'A40',
'RTX 3080': 'RTX3080',
'RTX A4000': 'RTXA4000',
'RTX A4500': 'RTXA4500',
'RTX A5000': 'RTXA5000',
'RTX A6000': 'RTXA6000',
}

# Per-GPU memory size, keyed by the SkyPilot-side accelerator name (i.e. the
# values of cudo_gpu_model above).
# NOTE(review): the numbers look like GiB (e.g. RTXA4000 -> 16) -- confirm.
cudo_gpu_mem = {
'RTX3080': 12,
'A40': 48,
'RTXA4000': 16,
'RTXA4500': 20,
'RTXA5000': 24,
'RTXA6000': 48,
'V100': 16,
}

# VM shapes that are priced for the catalog. Each entry is a dict with the
# keys 'vcpu', 'mem' and 'gpu'; update_prices() passes 'mem' to
# machine_types() as its mem_gib argument.
machine_specs = [
    {
        'vcpu': vcpu,
        'mem': mem,
        'gpu': gpu,
    } for vcpu, mem, gpu in (
        # Low
        (2, 4, 1),
        (4, 8, 1),
        (8, 16, 2),
        (16, 32, 2),
        (32, 64, 4),
        (64, 128, 8),
        # Mid
        (96, 192, 8),
        (48, 96, 4),
        (24, 48, 2),
        (12, 24, 1),
        # Hi
        (96, 192, 4),
        (48, 96, 2),
        (24, 48, 1),
    )
]
VMS_CSV = 'cudo/vms.csv'


def cudo_api():
Expand All @@ -110,28 +21,8 @@ def cudo_api():
return cudo_compute.VirtualMachinesApi(client)


def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name into the SkyPilot accelerator name.

    Names that have no entry in ``cudo_gpu_model`` are returned unchanged.
    """
    return cudo_gpu_model.get(model, model)


def skypilot_gpu_to_cudo_gpu(model):
    """Translate a SkyPilot accelerator name back into the Cudo model name.

    Performs a reverse lookup in ``cudo_gpu_model`` and returns the first key
    whose value equals ``model``; if there is none, ``model`` is returned
    unchanged.
    """
    matching_keys = (cudo_name
                     for cudo_name, sky_name in cudo_gpu_model.items()
                     if sky_name == model)
    return next(matching_keys, model)


def gpu_exists(model):
    """Return True if ``model`` is a Cudo GPU name listed in ``cudo_gpu_model``."""
    return model in cudo_gpu_model


def get_gpu_info(count, model):
mem = cudo_gpu_mem[model]
mem = utils.cudo_gpu_mem[model]
# pylint: disable=line-too-long
# {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
info = {
Expand Down Expand Up @@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):

def update_prices():
rows = []
for spec in machine_specs:
for spec in utils.machine_specs:
mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
for hc in mts['host_configs']:
if not gpu_exists(hc['gpu_model']):
if not utils.gpu_exists(hc['gpu_model']):
continue
accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
row = {
'instance_type': get_instance_type(hc['machine_type'],
spec['gpu'], spec['vcpu'],
spec['mem']),
spec['vcpu'], spec['mem'],
spec['gpu']),
'accelerator_name': accelerator_name,
'accelerator_count': str(spec['gpu']) + '.0',
'vcpus': str(spec['vcpu']),
Expand Down
112 changes: 112 additions & 0 deletions sky/provision/cudo/cudo_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Cudo catalog helper."""

# Lookup table from the GPU model name used on the Cudo side (keys) to the
# accelerator name used on the SkyPilot side (values). Models that are not
# listed here are treated as unknown by gpu_exists().
cudo_gpu_model = {
'NVIDIA V100': 'V100',
'NVIDIA A40': 'A40',
'RTX 3080': 'RTX3080',
'RTX A4000': 'RTXA4000',
'RTX A4500': 'RTXA4500',
'RTX A5000': 'RTXA5000',
'RTX A6000': 'RTXA6000',
}

# Per-GPU memory size, keyed by the SkyPilot-side accelerator name (i.e. the
# values of cudo_gpu_model above).
# NOTE(review): the numbers look like GiB (e.g. RTXA4000 -> 16) -- confirm.
cudo_gpu_mem = {
'RTX3080': 12,
'A40': 48,
'RTXA4000': 16,
'RTXA4500': 20,
'RTXA5000': 24,
'RTXA6000': 48,
'V100': 16,
}

# VM shapes offered through the catalog. Each entry is a dict with the keys
# 'vcpu', 'mem' and 'gpu'.
# NOTE(review): 'mem' is presumably GiB (the catalog fetcher passes it on as
# mem_gib) -- confirm.
machine_specs = [
    {
        'vcpu': vcpu,
        'mem': mem,
        'gpu': gpu,
    } for vcpu, mem, gpu in (
        # Low
        (2, 4, 1),
        (4, 8, 1),
        (8, 16, 2),
        (16, 32, 2),
        (32, 64, 4),
        (64, 128, 8),
        # Mid
        (96, 192, 8),
        (48, 96, 4),
        (24, 48, 2),
        (12, 24, 1),
        # Hi
        (96, 192, 4),
        (48, 96, 2),
        (24, 48, 1),
    )
]


def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name into the SkyPilot accelerator name.

    Names that have no entry in ``cudo_gpu_model`` are returned unchanged.
    """
    return cudo_gpu_model.get(model, model)


def skypilot_gpu_to_cudo_gpu(model):
    """Translate a SkyPilot accelerator name back into the Cudo model name.

    Performs a reverse lookup in ``cudo_gpu_model`` and returns the first key
    whose value equals ``model``; if there is none, ``model`` is returned
    unchanged.
    """
    matching_keys = (cudo_name
                     for cudo_name, sky_name in cudo_gpu_model.items()
                     if sky_name == model)
    return next(matching_keys, model)


def gpu_exists(model):
    """Return True if ``model`` is a Cudo GPU name listed in ``cudo_gpu_model``."""
    return model in cudo_gpu_model
53 changes: 37 additions & 16 deletions sky/provision/cudo/cudo_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,29 @@

from sky import sky_logging
from sky.adaptors import cudo
import sky.provision.cudo.cudo_utils as utils

logger = sky_logging.init_logger(__name__)


def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
memory_gib: int, vcpu_count: int, gpu_count: int,
tags: Dict[str, str], disk_size: int):
"""Launches an instance with the given parameters."""
disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size)

request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-nvidia-docker',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
gpu_model=gpu_model,
boot_disk=disk,
metadata=tags)

request = cudo.cudo.CreateVMBody(
ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size),
metadata=tags)

try:
api = cudo.cudo.cudo_api.virtual_machines()
Expand Down Expand Up @@ -121,3 +121,24 @@ def list_instances():
return instances
except cudo.cudo.rest.ApiException as e:
raise e


def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
                 cpus):
    """Check that enough VMs of the requested shape can be started.

    Lists the machine types matching the requested memory, vCPU count, GPU
    count, GPU model and data center, then sums ``count_vm_available`` over
    the returned host configs.

    Args:
        to_start_count: Number of VMs the caller intends to start.
        gpu_count: Number of GPUs per VM.
        gpu_model: SkyPilot accelerator name; it is converted to the Cudo
            model name before the API is queried.
        data_center_id: Cudo data center to query.
        mem: Memory per VM, forwarded to the API unchanged.
        cpus: vCPUs per VM, forwarded to the API unchanged.

    Returns:
        The total number of available VMs across all matching host configs.

    Raises:
        Exception: If fewer than ``to_start_count`` VMs are available.
        cudo.cudo.rest.ApiException: Propagated unchanged when the API call
            fails.
    """
    cudo_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
    api = cudo.cudo.cudo_api.virtual_machines()
    machine_types = api.list_vm_machine_types(mem,
                                              cpus,
                                              gpu=gpu_count,
                                              gpu_model=cudo_model,
                                              data_center_id=data_center_id)
    # A missing/None host config list or per-host count is counted as "no
    # capacity" so that the caller gets the descriptive error below instead of
    # a KeyError/TypeError.
    host_configs = machine_types.to_dict().get('host_configs') or []
    total_count = sum(
        item.get('count_vm_available') or 0 for item in host_configs)
    if total_count < to_start_count:
        raise Exception(
            'Too many VMs requested, try another gpu type or region')
    return total_count
23 changes: 15 additions & 8 deletions sky/provision/cudo/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

def _filter_instances(cluster_name_on_cloud: str,
status_filters: Optional[List[str]]) -> Dict[str, Any]:

instances = cudo_wrapper.list_instances()
possible_names = [
f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
Expand Down Expand Up @@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,

created_instance_ids = []
public_key = config.node_config['AuthorizedKey']

instance_type = config.node_config['InstanceType']
spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
gpu_count = int(float(spec['gpu_count']))
vcpu_count = int(spec['vcpu_count'])
memory_gib = int(spec['mem_gb'])
gpu_model = spec['gpu_model']
try:
cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
memory_gib, vcpu_count)
except Exception as e:
logger.warning(f'run_instances: {e}')
raise
for _ in range(to_start_count):
instance_type = config.node_config['InstanceType']
spec = cudo_machine_type.get_spec_from_instance(instance_type, region)

node_type = 'head' if head_instance_id is None else 'worker'
try:
Expand All @@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
ssh_key=public_key,
data_center_id=region,
machine_type=spec['machine_type'],
memory_gib=int(spec['mem_gb']),
vcpu_count=int(spec['vcpu_count']),
gpu_count=int(float(spec['gpu_count'])),
gpu_model=spec['gpu_model'],
memory_gib=memory_gib,
vcpu_count=vcpu_count,
gpu_count=gpu_count,
tags={},
disk_size=config.node_config['DiskSize'])
except Exception as e: # pylint: disable=broad-except
Expand Down

0 comments on commit 9c70fce

Please sign in to comment.