From 9c70fce3721eeb739d017d505bf3de88390b3800 Mon Sep 17 00:00:00 2001
From: Sean
Date: Mon, 1 Jul 2024 21:24:33 +0100
Subject: [PATCH] [Cudo] Update and bugfixes (#3256)

* bug fixes and improvements

* moved shared function to helper, added error message

* moved catalog helper to utils

* small fixes

* fetch cudo fix

* id fix for vms.csv file

* format fix
---
 .../data_fetchers/fetch_cudo.py    | 125 ++----------------
 sky/provision/cudo/cudo_utils.py   | 112 ++++++++++++++++
 sky/provision/cudo/cudo_wrapper.py |  53 +++++---
 sky/provision/cudo/instance.py     |  23 ++--
 4 files changed, 172 insertions(+), 141 deletions(-)
 create mode 100644 sky/provision/cudo/cudo_utils.py

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
index b15570ddcbc..617751d865a 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
@@ -9,98 +9,9 @@

 import cudo_compute

-VMS_CSV = 'cudo/vms.csv'
+import sky.provision.cudo.cudo_utils as utils

-cudo_gpu_model = {
-    'NVIDIA V100': 'V100',
-    'NVIDIA A40': 'A40',
-    'RTX 3080': 'RTX3080',
-    'RTX A4000': 'RTXA4000',
-    'RTX A4500': 'RTXA4500',
-    'RTX A5000': 'RTXA5000',
-    'RTX A6000': 'RTXA6000',
-}
-
-cudo_gpu_mem = {
-    'RTX3080': 12,
-    'A40': 48,
-    'RTXA4000': 16,
-    'RTXA4500': 20,
-    'RTXA5000': 24,
-    'RTXA6000': 48,
-    'V100': 16,
-}
-
-machine_specs = [
-    # Low
-    {
-        'vcpu': 2,
-        'mem': 4,
-        'gpu': 1,
-    },
-    {
-        'vcpu': 4,
-        'mem': 8,
-        'gpu': 1,
-    },
-    {
-        'vcpu': 8,
-        'mem': 16,
-        'gpu': 2,
-    },
-    {
-        'vcpu': 16,
-        'mem': 32,
-        'gpu': 2,
-    },
-    {
-        'vcpu': 32,
-        'mem': 64,
-        'gpu': 4,
-    },
-    {
-        'vcpu': 64,
-        'mem': 128,
-        'gpu': 8,
-    },
-    # Mid
-    {
-        'vcpu': 96,
-        'mem': 192,
-        'gpu': 8
-    },
-    {
-        'vcpu': 48,
-        'mem': 96,
-        'gpu': 4
-    },
-    {
-        'vcpu': 24,
-        'mem': 48,
-        'gpu': 2
-    },
-    {
-        'vcpu': 12,
-        'mem': 24,
-        'gpu': 1
-    },
-    # Hi
-    {
-        'vcpu': 96,
-        'mem': 192,
-        'gpu': 4
-    },
-    {
-        'vcpu': 48,
-        'mem': 96,
-        'gpu': 2
-    },
-    {
-        'vcpu': 24,
-        'mem': 48,
-        'gpu': 1
-    },
-]
+VMS_CSV = 'cudo/vms.csv'


 def cudo_api():
@@ -110,28 +21,8 @@ def cudo_api():
     return cudo_compute.VirtualMachinesApi(client)


-def cudo_gpu_to_skypilot_gpu(model):
-    if model in cudo_gpu_model:
-        return cudo_gpu_model[model]
-    else:
-        return model
-
-
-def skypilot_gpu_to_cudo_gpu(model):
-    for key, value in cudo_gpu_model.items():
-        if value == model:
-            return key
-    return model
-
-
-def gpu_exists(model):
-    if model in cudo_gpu_model:
-        return True
-    return False
-
-
 def get_gpu_info(count, model):
-    mem = cudo_gpu_mem[model]
+    mem = utils.cudo_gpu_mem[model]
     # pylint: disable=line-too-long
     # {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
     info = {
@@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):

 def update_prices():
     rows = []
-    for spec in machine_specs:
+    for spec in utils.machine_specs:
         mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
         for hc in mts['host_configs']:
-            if not gpu_exists(hc['gpu_model']):
+            if not utils.gpu_exists(hc['gpu_model']):
                 continue
-            accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
+            accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
             row = {
                 'instance_type': get_instance_type(hc['machine_type'],
-                                                   spec['gpu'], spec['vcpu'],
-                                                   spec['mem']),
+                                                   spec['vcpu'], spec['mem'],
+                                                   spec['gpu']),
                 'accelerator_name': accelerator_name,
                 'accelerator_count': str(spec['gpu']) + '.0',
                 'vcpus': str(spec['vcpu']),
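For context, here is a minimal sketch (not part of the patch) of how the fetcher is expected to use the GPU-name helpers that this change relocates into sky/provision/cudo/cudo_utils.py. All names come from the patch itself; only the 'H100' lookup is an illustrative miss chosen for the example.

import sky.provision.cudo.cudo_utils as utils

# Cudo's catalog model name maps to the SkyPilot accelerator name and back.
assert utils.cudo_gpu_to_skypilot_gpu('RTX A4000') == 'RTXA4000'
assert utils.skypilot_gpu_to_cudo_gpu('RTXA4000') == 'RTX A4000'

# Unknown models fall through unchanged, and gpu_exists() is what lets
# update_prices() skip host configs the catalog does not cover.
assert utils.cudo_gpu_to_skypilot_gpu('H100') == 'H100'
assert not utils.gpu_exists('H100')
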
diff --git a/sky/provision/cudo/cudo_utils.py b/sky/provision/cudo/cudo_utils.py
new file mode 100644
index 00000000000..d4ef7f9e415
--- /dev/null
+++ b/sky/provision/cudo/cudo_utils.py
@@ -0,0 +1,112 @@
+"""Cudo catalog helper."""
+
+cudo_gpu_model = {
+    'NVIDIA V100': 'V100',
+    'NVIDIA A40': 'A40',
+    'RTX 3080': 'RTX3080',
+    'RTX A4000': 'RTXA4000',
+    'RTX A4500': 'RTXA4500',
+    'RTX A5000': 'RTXA5000',
+    'RTX A6000': 'RTXA6000',
+}
+
+cudo_gpu_mem = {
+    'RTX3080': 12,
+    'A40': 48,
+    'RTXA4000': 16,
+    'RTXA4500': 20,
+    'RTXA5000': 24,
+    'RTXA6000': 48,
+    'V100': 16,
+}
+
+machine_specs = [
+    # Low
+    {
+        'vcpu': 2,
+        'mem': 4,
+        'gpu': 1,
+    },
+    {
+        'vcpu': 4,
+        'mem': 8,
+        'gpu': 1,
+    },
+    {
+        'vcpu': 8,
+        'mem': 16,
+        'gpu': 2,
+    },
+    {
+        'vcpu': 16,
+        'mem': 32,
+        'gpu': 2,
+    },
+    {
+        'vcpu': 32,
+        'mem': 64,
+        'gpu': 4,
+    },
+    {
+        'vcpu': 64,
+        'mem': 128,
+        'gpu': 8,
+    },
+    # Mid
+    {
+        'vcpu': 96,
+        'mem': 192,
+        'gpu': 8
+    },
+    {
+        'vcpu': 48,
+        'mem': 96,
+        'gpu': 4
+    },
+    {
+        'vcpu': 24,
+        'mem': 48,
+        'gpu': 2
+    },
+    {
+        'vcpu': 12,
+        'mem': 24,
+        'gpu': 1
+    },
+    # Hi
+    {
+        'vcpu': 96,
+        'mem': 192,
+        'gpu': 4
+    },
+    {
+        'vcpu': 48,
+        'mem': 96,
+        'gpu': 2
+    },
+    {
+        'vcpu': 24,
+        'mem': 48,
+        'gpu': 1
+    },
+]
+
+
+def cudo_gpu_to_skypilot_gpu(model):
+    if model in cudo_gpu_model:
+        return cudo_gpu_model[model]
+    else:
+        return model
+
+
+def skypilot_gpu_to_cudo_gpu(model):
+    for key, value in cudo_gpu_model.items():
+        if value == model:
+            return key
+    return model
+
+
+def gpu_exists(model):
+    if model in cudo_gpu_model:
+        return True
+    return False

diff --git a/sky/provision/cudo/cudo_wrapper.py b/sky/provision/cudo/cudo_wrapper.py
index 691c69bda8c..eac39d9faed 100644
--- a/sky/provision/cudo/cudo_wrapper.py
+++ b/sky/provision/cudo/cudo_wrapper.py
@@ -4,29 +4,29 @@

 from sky import sky_logging
 from sky.adaptors import cudo
+import sky.provision.cudo.cudo_utils as utils

 logger = sky_logging.init_logger(__name__)


 def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
-           memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
+           memory_gib: int, vcpu_count: int, gpu_count: int,
            tags: Dict[str, str], disk_size: int):
     """Launches an instance with the given parameters."""
-    disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
-                          size_gib=disk_size)
-
-    request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
-                                     custom_ssh_keys=[ssh_key],
-                                     vm_id=name,
-                                     machine_type=machine_type,
-                                     data_center_id=data_center_id,
-                                     boot_disk_image_id='ubuntu-nvidia-docker',
-                                     memory_gib=memory_gib,
-                                     vcpus=vcpu_count,
-                                     gpus=gpu_count,
-                                     gpu_model=gpu_model,
-                                     boot_disk=disk,
-                                     metadata=tags)
+
+    request = cudo.cudo.CreateVMBody(
+        ssh_key_source='SSH_KEY_SOURCE_NONE',
+        custom_ssh_keys=[ssh_key],
+        vm_id=name,
+        machine_type=machine_type,
+        data_center_id=data_center_id,
+        boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
+        memory_gib=memory_gib,
+        vcpus=vcpu_count,
+        gpus=gpu_count,
+        boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
+                                 size_gib=disk_size),
+        metadata=tags)

     try:
         api = cudo.cudo.cudo_api.virtual_machines()
@@ -121,3 +121,24 @@ def list_instances():
         return instances
     except cudo.cudo.rest.ApiException as e:
         raise e
+
+
+def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
+                 cpus):
+    try:
+        gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
+        api = cudo.cudo.cudo_api.virtual_machines()
+        types = api.list_vm_machine_types(mem,
+                                          cpus,
+                                          gpu=gpu_count,
+                                          gpu_model=gpu_model,
+                                          data_center_id=data_center_id)
+        types_dict = types.to_dict()
+        hc = types_dict['host_configs']
+        total_count = sum(item['count_vm_available'] for item in hc)
+        if total_count < to_start_count:
+            raise Exception(
+                'Too many VMs requested, try another gpu type or region')
+        return total_count
+    except cudo.cudo.rest.ApiException as e:
+        raise e
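A hedged, caller-side sketch of the new vm_available() capacity check added above. It is illustrative only: running it requires a configured Cudo account, and the data center id and shape values below are made-up placeholders, not values from the patch.

from sky.provision.cudo import cudo_wrapper

try:
    # Raises if fewer than the requested number of VMs can be provisioned
    # with this GPU model in the chosen data center.
    cudo_wrapper.vm_available(
        to_start_count=2,              # nodes we want to start
        gpu_count=1,                   # GPUs per node
        gpu_model='RTXA4000',          # SkyPilot name; converted internally
        data_center_id='gb-region-1',  # hypothetical data center id
        mem=16,
        cpus=4)
except Exception as e:
    print(f'Capacity check failed: {e}')
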
diff --git a/sky/provision/cudo/instance.py b/sky/provision/cudo/instance.py
index 105f82a3a37..db306ddaa19 100644
--- a/sky/provision/cudo/instance.py
+++ b/sky/provision/cudo/instance.py
@@ -16,7 +16,6 @@

 def _filter_instances(cluster_name_on_cloud: str,
                       status_filters: Optional[List[str]]) -> Dict[str, Any]:
-
     instances = cudo_wrapper.list_instances()
     possible_names = [
         f'{cluster_name_on_cloud}-head', f'{cluster_name_on_cloud}-worker'
@@ -77,10 +76,19 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     created_instance_ids = []
     public_key = config.node_config['AuthorizedKey']
-
+    instance_type = config.node_config['InstanceType']
+    spec = cudo_machine_type.get_spec_from_instance(instance_type, region)
+    gpu_count = int(float(spec['gpu_count']))
+    vcpu_count = int(spec['vcpu_count'])
+    memory_gib = int(spec['mem_gb'])
+    gpu_model = spec['gpu_model']
+    try:
+        cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
+                                  memory_gib, vcpu_count)
+    except Exception as e:
+        logger.warning(f'run_instances: {e}')
+        raise
     for _ in range(to_start_count):
-        instance_type = config.node_config['InstanceType']
-        spec = cudo_machine_type.get_spec_from_instance(instance_type, region)

         node_type = 'head' if head_instance_id is None else 'worker'

         try:
@@ -89,10 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 ssh_key=public_key,
                 data_center_id=region,
                 machine_type=spec['machine_type'],
-                memory_gib=int(spec['mem_gb']),
-                vcpu_count=int(spec['vcpu_count']),
-                gpu_count=int(float(spec['gpu_count'])),
-                gpu_model=spec['gpu_model'],
+                memory_gib=memory_gib,
+                vcpu_count=vcpu_count,
+                gpu_count=gpu_count,
                 tags={},
                 disk_size=config.node_config['DiskSize'])
         except Exception as e:  # pylint: disable=broad-except
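Finally, a small sketch of the one-time spec conversion that run_instances now performs before the launch loop. The dictionary keys mirror what the patch reads from cudo_machine_type.get_spec_from_instance(); the values and the machine type string are illustrative, not real catalog data.

# Hypothetical spec dict; values are made up for illustration.
spec = {
    'machine_type': 'epyc-rome-rtx-a4000',  # hypothetical machine type
    'gpu_count': '1.0',   # the catalog stores accelerator counts as '<n>.0'
    'vcpu_count': '4',
    'mem_gb': '16',
    'gpu_model': 'RTXA4000',
}

gpu_count = int(float(spec['gpu_count']))  # '1.0' -> 1
vcpu_count = int(spec['vcpu_count'])
memory_gib = int(spec['mem_gb'])
gpu_model = spec['gpu_model']

Converting once up front lets the same integers be passed both to the vm_available() capacity check and to every cudo_wrapper.launch() call in the loop, instead of re-parsing the spec per node as the old code did.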