[Cudo] Update and bugfixes (#3256)

* bug fixes and improvements
* moved shared function to helper, added error message
* moved catalog helper to utils
* small fixes
* fetch cudo fix
* id fix for vms.csv file
* format fix
skypilot-org · Aug 23, 2024 · 9c70fce · 9c70fce
1 parent a0336b3
commit 9c70fce
Show file tree

Hide file tree

Showing 4 changed files with 172 additions and 141 deletions.
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
@@ -9,98 +9,9 @@
 
 import cudo_compute
 
-VMS_CSV = 'cudo/vms.csv'
+import sky.provision.cudo.cudo_utils as utils
 
-cudo_gpu_model = {
- 'NVIDIA V100': 'V100',
- 'NVIDIA A40': 'A40',
- 'RTX 3080': 'RTX3080',
- 'RTX A4000': 'RTXA4000',
- 'RTX A4500': 'RTXA4500',
- 'RTX A5000': 'RTXA5000',
- 'RTX A6000': 'RTXA6000',
-}
-
-cudo_gpu_mem = {
- 'RTX3080': 12,
- 'A40': 48,
- 'RTXA4000': 16,
- 'RTXA4500': 20,
- 'RTXA5000': 24,
- 'RTXA6000': 48,
- 'V100': 16,
-}
-
-machine_specs = [
- # Low
- {
- 'vcpu': 2,
- 'mem': 4,
- 'gpu': 1,
- },
- {
- 'vcpu': 4,
- 'mem': 8,
- 'gpu': 1,
- },
- {
- 'vcpu': 8,
- 'mem': 16,
- 'gpu': 2,
- },
- {
- 'vcpu': 16,
- 'mem': 32,
- 'gpu': 2,
- },
- {
- 'vcpu': 32,
- 'mem': 64,
- 'gpu': 4,
- },
- {
- 'vcpu': 64,
- 'mem': 128,
- 'gpu': 8,
- },
- # Mid
- {
- 'vcpu': 96,
- 'mem': 192,
- 'gpu': 8
- },
- {
- 'vcpu': 48,
- 'mem': 96,
- 'gpu': 4
- },
- {
- 'vcpu': 24,
- 'mem': 48,
- 'gpu': 2
- },
- {
- 'vcpu': 12,
- 'mem': 24,
- 'gpu': 1
- },
- # Hi
- {
- 'vcpu': 96,
- 'mem': 192,
- 'gpu': 4
- },
- {
- 'vcpu': 48,
- 'mem': 96,
- 'gpu': 2
- },
- {
- 'vcpu': 24,
- 'mem': 48,
- 'gpu': 1
- },
-]
+VMS_CSV = 'cudo/vms.csv'
 
 
 def cudo_api():
@@ -110,28 +21,8 @@ def cudo_api():
  return cudo_compute.VirtualMachinesApi(client)
 
 
-def cudo_gpu_to_skypilot_gpu(model):
- if model in cudo_gpu_model:
- return cudo_gpu_model[model]
- else:
- return model
-
-
-def skypilot_gpu_to_cudo_gpu(model):
- for key, value in cudo_gpu_model.items():
- if value == model:
- return key
- return model
-
-
-def gpu_exists(model):
- if model in cudo_gpu_model:
- return True
- return False
-
-
 def get_gpu_info(count, model):
- mem = cudo_gpu_mem[model]
+ mem = utils.cudo_gpu_mem[model]
  # pylint: disable=line-too-long
  # {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
  info = {
@@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):
 
 def update_prices():
  rows = []
- for spec in machine_specs:
+ for spec in utils.machine_specs:
  mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
  for hc in mts['host_configs']:
- if not gpu_exists(hc['gpu_model']):
+ if not utils.gpu_exists(hc['gpu_model']):
  continue
- accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
+ accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
  row = {
  'instance_type': get_instance_type(hc['machine_type'],
- spec['gpu'], spec['vcpu'],
- spec['mem']),
+ spec['vcpu'], spec['mem'],
+ spec['gpu']),
  'accelerator_name': accelerator_name,
  'accelerator_count': str(spec['gpu']) + '.0',
  'vcpus': str(spec['vcpu']),

diff --git a/sky/provision/cudo/cudo_utils.py b/sky/provision/cudo/cudo_utils.py
@@ -0,0 +1,112 @@
+"""Cudo catalog helper."""
+
+cudo_gpu_model = {
+ 'NVIDIA V100': 'V100',
+ 'NVIDIA A40': 'A40',
+ 'RTX 3080': 'RTX3080',
+ 'RTX A4000': 'RTXA4000',
+ 'RTX A4500': 'RTXA4500',
+ 'RTX A5000': 'RTXA5000',
+ 'RTX A6000': 'RTXA6000',
+}
+
+cudo_gpu_mem = {
+ 'RTX3080': 12,
+ 'A40': 48,
+ 'RTXA4000': 16,
+ 'RTXA4500': 20,
+ 'RTXA5000': 24,
+ 'RTXA6000': 48,
+ 'V100': 16,
+}
+
+machine_specs = [
+ # Low
+ {
+ 'vcpu': 2,
+ 'mem': 4,
+ 'gpu': 1,
+ },
+ {
+ 'vcpu': 4,
+ 'mem': 8,
+ 'gpu': 1,
+ },
+ {
+ 'vcpu': 8,
+ 'mem': 16,
+ 'gpu': 2,
+ },
+ {
+ 'vcpu': 16,
+ 'mem': 32,
+ 'gpu': 2,
+ },
+ {
+ 'vcpu': 32,
+ 'mem': 64,
+ 'gpu': 4,
+ },
+ {
+ 'vcpu': 64,
+ 'mem': 128,
+ 'gpu': 8,
+ },
+ # Mid
+ {
+ 'vcpu': 96,
+ 'mem': 192,
+ 'gpu': 8
+ },
+ {
+ 'vcpu': 48,
+ 'mem': 96,
+ 'gpu': 4
+ },
+ {
+ 'vcpu': 24,
+ 'mem': 48,
+ 'gpu': 2
+ },
+ {
+ 'vcpu': 12,
+ 'mem': 24,
+ 'gpu': 1
+ },
+ # Hi
+ {
+ 'vcpu': 96,
+ 'mem': 192,
+ 'gpu': 4
+ },
+ {
+ 'vcpu': 48,
+ 'mem': 96,
+ 'gpu': 2
+ },
+ {
+ 'vcpu': 24,
+ 'mem': 48,
+ 'gpu': 1
+ },
+]
+
+
+def cudo_gpu_to_skypilot_gpu(model):
+ if model in cudo_gpu_model:
+ return cudo_gpu_model[model]
+ else:
+ return model
+
+
+def skypilot_gpu_to_cudo_gpu(model):
+ for key, value in cudo_gpu_model.items():
+ if value == model:
+ return key
+ return model
+
+
+def gpu_exists(model):
+ if model in cudo_gpu_model:
+ return True
+ return False
diff --git a/sky/provision/cudo/cudo_wrapper.py b/sky/provision/cudo/cudo_wrapper.py
@@ -4,29 +4,29 @@
 
 from sky import sky_logging
 from sky.adaptors import cudo
+import sky.provision.cudo.cudo_utils as utils
 
 logger = sky_logging.init_logger(__name__)
 
 
 def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
- memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
+ memory_gib: int, vcpu_count: int, gpu_count: int,
  tags: Dict[str, str], disk_size: int):
  """Launches an instance with the given parameters."""
- disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
- size_gib=disk_size)
-
- request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
- custom_ssh_keys=[ssh_key],
- vm_id=name,
- machine_type=machine_type,
- data_center_id=data_center_id,
- boot_disk_image_id='ubuntu-nvidia-docker',
- memory_gib=memory_gib,
- vcpus=vcpu_count,
- gpus=gpu_count,
- gpu_model=gpu_model,
- boot_disk=disk,
- metadata=tags)
+
+ request = cudo.cudo.CreateVMBody(
+ ssh_key_source='SSH_KEY_SOURCE_NONE',
+ custom_ssh_keys=[ssh_key],
+ vm_id=name,
+ machine_type=machine_type,
+ data_center_id=data_center_id,
+ boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
+ memory_gib=memory_gib,
+ vcpus=vcpu_count,
+ gpus=gpu_count,
+ boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
+ size_gib=disk_size),
+ metadata=tags)
 
  try:
  api = cudo.cudo.cudo_api.virtual_machines()
@@ -121,3 +121,24 @@ def list_instances():
  return instances
  except cudo.cudo.rest.ApiException as e:
  raise e
+
+
+def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
+ cpus):
+ try:
+ gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
+ api = cudo.cudo.cudo_api.virtual_machines()
+ types = api.list_vm_machine_types(mem,
+ cpus,
+ gpu=gpu_count,
+ gpu_model=gpu_model,
+ data_center_id=data_center_id)
+ types_dict = types.to_dict()
+ hc = types_dict['host_configs']
+ total_count = sum(item['count_vm_available'] for item in hc)
+ if total_count < to_start_count:
+ raise Exception(
+ 'Too many VMs requested, try another gpu type or region')
+ return total_count
+ except cudo.cudo.rest.ApiException as e:
+ raise e
diff --git a/sky/provision/cudo/instance.py b/sky/provision/cudo/instance.py
@@ -16,7 +16,6 @@
 
 def _filter_instances(cluster_name_on_cloud: str,
  status_filters: Optional[List[str]]) -> Dict[str, Any]:
-
  instances = cudo_wrapper.list_instances()
  possible_names = [
  f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
@@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
 
  created_instance_ids = []
  public_key = config.node_config['AuthorizedKey']
-
+ instance_type = config.node_config['InstanceType']
+ spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
+ gpu_count = int(float(spec['gpu_count']))
+ vcpu_count = int(spec['vcpu_count'])
+ memory_gib = int(spec['mem_gb'])
+ gpu_model = spec['gpu_model']
+ try:
+ cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
+ memory_gib, vcpu_count)
+ except Exception as e:
+ logger.warning(f'run_instances: {e}')
+ raise
  for _ in range(to_start_count):
- instance_type = config.node_config['InstanceType']
- spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
 
  node_type = 'head' if head_instance_id is None else 'worker'
  try:
@@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
  ssh_key=public_key,
  data_center_id=region,
  machine_type=spec['machine_type'],
- memory_gib=int(spec['mem_gb']),
- vcpu_count=int(spec['vcpu_count']),
- gpu_count=int(float(spec['gpu_count'])),
- gpu_model=spec['gpu_model'],
+ memory_gib=memory_gib,
+ vcpu_count=vcpu_count,
+ gpu_count=gpu_count,
  tags={},
  disk_size=config.node_config['DiskSize'])
  except Exception as e: # pylint: disable=broad-except