Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Cudo] Update and bugfixes #3256

Merged
merged 9 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 8 additions & 117 deletions sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,98 +9,9 @@

import cudo_compute

VMS_CSV = 'cudo/vms.csv'
import sky.provision.cudo.cudo_utils as utils

# Maps GPU model names as reported by the Cudo Compute API to the short
# accelerator names used in the SkyPilot catalog.
cudo_gpu_model = {
    'NVIDIA V100': 'V100',
    'NVIDIA A40': 'A40',
    'RTX 3080': 'RTX3080',
    'RTX A4000': 'RTXA4000',
    'RTX A4500': 'RTXA4500',
    'RTX A5000': 'RTXA5000',
    'RTX A6000': 'RTXA6000',
}

# GPU memory size, keyed by SkyPilot accelerator name.
# Units appear to be GiB (e.g. V100: 16 matches the 16384 MiB example in
# get_gpu_info) — TODO confirm against Cudo's official specs.
cudo_gpu_mem = {
    'RTX3080': 12,
    'A40': 48,
    'RTXA4000': 16,
    'RTXA4500': 20,
    'RTXA5000': 24,
    'RTXA6000': 48,
    'V100': 16,
}

# Candidate VM shapes queried when building the catalog: vCPU count,
# memory (GiB), and GPU count per VM. Grouped into Low/Mid/Hi tiers of
# vCPU-and-memory per GPU.
machine_specs = [
    # Low
    {
        'vcpu': 2,
        'mem': 4,
        'gpu': 1,
    },
    {
        'vcpu': 4,
        'mem': 8,
        'gpu': 1,
    },
    {
        'vcpu': 8,
        'mem': 16,
        'gpu': 2,
    },
    {
        'vcpu': 16,
        'mem': 32,
        'gpu': 2,
    },
    {
        'vcpu': 32,
        'mem': 64,
        'gpu': 4,
    },
    {
        'vcpu': 64,
        'mem': 128,
        'gpu': 8,
    },
    # Mid
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 8
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 4
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 2
    },
    {
        'vcpu': 12,
        'mem': 24,
        'gpu': 1
    },
    # Hi
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 4
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 2
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 1
    },
]
# Relative path of the Cudo VMs catalog CSV produced by this fetcher.
VMS_CSV = 'cudo/vms.csv'


def cudo_api():
Expand All @@ -110,28 +21,8 @@ def cudo_api():
return cudo_compute.VirtualMachinesApi(client)


def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name to its SkyPilot accelerator name.

    Returns ``model`` unchanged when there is no known mapping, so
    unrecognized models pass through untouched.
    """
    # dict.get with a default replaces the membership-test-then-index idiom.
    return cudo_gpu_model.get(model, model)


def skypilot_gpu_to_cudo_gpu(model):
    """Translate a SkyPilot accelerator name back to Cudo's GPU model name.

    Returns ``model`` unchanged if no reverse mapping exists.
    """
    # Reverse lookup over the forward mapping; first match wins.
    return next((cudo_name for cudo_name, sky_name in cudo_gpu_model.items()
                 if sky_name == model), model)


def gpu_exists(model):
    """Return True if ``model`` is a Cudo GPU model known to the catalog."""
    # The membership test already yields a bool; no if/else needed.
    return model in cudo_gpu_model


def get_gpu_info(count, model):
mem = cudo_gpu_mem[model]
mem = utils.cudo_gpu_mem[model]
# pylint: disable=line-too-long
# {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
info = {
Expand Down Expand Up @@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):

def update_prices():
rows = []
for spec in machine_specs:
for spec in utils.machine_specs:
mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
for hc in mts['host_configs']:
if not gpu_exists(hc['gpu_model']):
if not utils.gpu_exists(hc['gpu_model']):
continue
accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
row = {
'instance_type': get_instance_type(hc['machine_type'],
spec['gpu'], spec['vcpu'],
spec['mem']),
spec['vcpu'], spec['mem'],
spec['gpu']),
'accelerator_name': accelerator_name,
'accelerator_count': str(spec['gpu']) + '.0',
'vcpus': str(spec['vcpu']),
Expand Down
112 changes: 112 additions & 0 deletions sky/provision/cudo/cudo_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Cudo catalog helper."""

# Maps GPU model names as reported by the Cudo Compute API to the short
# accelerator names used in the SkyPilot catalog.
cudo_gpu_model = {
    'NVIDIA V100': 'V100',
    'NVIDIA A40': 'A40',
    'RTX 3080': 'RTX3080',
    'RTX A4000': 'RTXA4000',
    'RTX A4500': 'RTXA4500',
    'RTX A5000': 'RTXA5000',
    'RTX A6000': 'RTXA6000',
}

# GPU memory size, keyed by SkyPilot accelerator name.
# Units appear to be GiB — TODO confirm against Cudo's official specs.
cudo_gpu_mem = {
    'RTX3080': 12,
    'A40': 48,
    'RTXA4000': 16,
    'RTXA4500': 20,
    'RTXA5000': 24,
    'RTXA6000': 48,
    'V100': 16,
}

# Candidate VM shapes queried when building the catalog: vCPU count,
# memory (GiB), and GPU count per VM. Grouped into Low/Mid/Hi tiers of
# vCPU-and-memory per GPU.
machine_specs = [
    # Low
    {
        'vcpu': 2,
        'mem': 4,
        'gpu': 1,
    },
    {
        'vcpu': 4,
        'mem': 8,
        'gpu': 1,
    },
    {
        'vcpu': 8,
        'mem': 16,
        'gpu': 2,
    },
    {
        'vcpu': 16,
        'mem': 32,
        'gpu': 2,
    },
    {
        'vcpu': 32,
        'mem': 64,
        'gpu': 4,
    },
    {
        'vcpu': 64,
        'mem': 128,
        'gpu': 8,
    },
    # Mid
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 8
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 4
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 2
    },
    {
        'vcpu': 12,
        'mem': 24,
        'gpu': 1
    },
    # Hi
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 4
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 2
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 1
    },
]


def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name to its SkyPilot accelerator name.

    Returns ``model`` unchanged when there is no known mapping, so
    unrecognized models pass through untouched.
    """
    # dict.get with a default replaces the membership-test-then-index idiom.
    return cudo_gpu_model.get(model, model)


def skypilot_gpu_to_cudo_gpu(model):
    """Translate a SkyPilot accelerator name back to Cudo's GPU model name.

    Returns ``model`` unchanged if no reverse mapping exists.
    """
    # Reverse lookup over the forward mapping; first match wins.
    return next((cudo_name for cudo_name, sky_name in cudo_gpu_model.items()
                 if sky_name == model), model)


def gpu_exists(model):
    """Return True if ``model`` is a Cudo GPU model known to the catalog."""
    # The membership test already yields a bool; no if/else needed.
    return model in cudo_gpu_model
53 changes: 37 additions & 16 deletions sky/provision/cudo/cudo_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,29 @@

from sky import sky_logging
from sky.adaptors import cudo
import sky.provision.cudo.cudo_utils as utils

logger = sky_logging.init_logger(__name__)


def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
memory_gib: int, vcpu_count: int, gpu_count: int,
tags: Dict[str, str], disk_size: int):
"""Launches an instance with the given parameters."""
disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size)

request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-nvidia-docker',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
gpu_model=gpu_model,
boot_disk=disk,
metadata=tags)

request = cudo.cudo.CreateVMBody(
ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size),
metadata=tags)

try:
api = cudo.cudo.cudo_api.virtual_machines()
Expand Down Expand Up @@ -121,3 +121,24 @@ def list_instances():
return instances
except cudo.cudo.rest.ApiException as e:
raise e


def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
                 cpus):
    """Check that Cudo can supply enough VMs of the requested shape.

    Args:
        to_start_count: Number of VMs that need to be launched.
        gpu_count: GPUs per VM.
        gpu_model: Accelerator name in SkyPilot terms; translated to
            Cudo's model name before querying the API.
        data_center_id: Cudo data center (region) to query.
        mem: Memory per VM (GiB — presumably; confirm against the Cudo API).
        cpus: vCPUs per VM.

    Returns:
        Total number of matching VMs currently available across all
        host configs.

    Raises:
        Exception: If fewer than ``to_start_count`` VMs are available.
        cudo.cudo.rest.ApiException: If the Cudo API call fails.
    """
    # The original wrapped this in `except ApiException as e: raise e`,
    # which is a no-op re-raise; letting the exception propagate directly
    # preserves the full traceback.
    gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
    api = cudo.cudo.cudo_api.virtual_machines()
    types = api.list_vm_machine_types(mem,
                                      cpus,
                                      gpu=gpu_count,
                                      gpu_model=gpu_model,
                                      data_center_id=data_center_id)
    host_configs = types.to_dict()['host_configs']
    total_count = sum(item['count_vm_available'] for item in host_configs)
    if total_count < to_start_count:
        raise Exception(
            'Too many VMs requested, try another gpu type or region')
    return total_count
23 changes: 15 additions & 8 deletions sky/provision/cudo/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

def _filter_instances(cluster_name_on_cloud: str,
status_filters: Optional[List[str]]) -> Dict[str, Any]:

instances = cudo_wrapper.list_instances()
possible_names = [
f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
Expand Down Expand Up @@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,

created_instance_ids = []
public_key = config.node_config['AuthorizedKey']

instance_type = config.node_config['InstanceType']
spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
gpu_count = int(float(spec['gpu_count']))
vcpu_count = int(spec['vcpu_count'])
memory_gib = int(spec['mem_gb'])
gpu_model = spec['gpu_model']
try:
cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
memory_gib, vcpu_count)
except Exception as e:
logger.warning(f'run_instances: {e}')
raise
for _ in range(to_start_count):
instance_type = config.node_config['InstanceType']
spec = cudo_machine_type.get_spec_from_instance(instance_type, region)

node_type = 'head' if head_instance_id is None else 'worker'
try:
Expand All @@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
ssh_key=public_key,
data_center_id=region,
machine_type=spec['machine_type'],
memory_gib=int(spec['mem_gb']),
vcpu_count=int(spec['vcpu_count']),
gpu_count=int(float(spec['gpu_count'])),
gpu_model=spec['gpu_model'],
memory_gib=memory_gib,
vcpu_count=vcpu_count,
gpu_count=gpu_count,
tags={},
disk_size=config.node_config['DiskSize'])
except Exception as e: # pylint: disable=broad-except
Expand Down
Loading