From f19128918f4640e50e6ada51c6f9d5ea2bd4386a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 5 Mar 2024 13:44:43 -0800 Subject: [PATCH] [GCP] Support H100 for GCP (#3279) * Support h100 * Fix H100 from sku * Fix H100 --- sky/clouds/gcp.py | 4 +++- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py | 8 ++++++++ sky/clouds/service_catalog/gcp_catalog.py | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 9c6fffcc435..b786a9f8d2f 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -445,7 +445,9 @@ def make_deploy_resources_variables( # https://cloud.google.com/compute/docs/gpus if acc in ('A100-80GB', 'L4'): # A100-80GB and L4 have a different name pattern. - resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower()) + resources_vars['gpu'] = f'nvidia-{acc.lower()}' + elif acc == 'H100': + resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb' else: resources_vars['gpu'] = 'nvidia-tesla-{}'.format( acc.lower()) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 2376e785531..8dca5bd407b 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -62,6 +62,7 @@ # Unsupported Series: 'f1', 'm2' SERIES_TO_DISCRIPTION = { 'a2': 'A2 Instance', + 'a3': 'A3 Instance', 'c2': 'Compute optimized', 'c2d': 'C2D AMD Instance', 'c3': 'C3 Instance', @@ -298,6 +299,11 @@ def _get_gpus_for_zone(zone: str) -> pd.DataFrame: gpu_name = gpu_name.replace('nvidia-', '') gpu_name = gpu_name.replace('tesla-', '') gpu_name = gpu_name.upper() + if 'H100-80GB' in gpu_name: + gpu_name = 'H100' + if count != 8: + # H100 only has 8 cards. + continue if 'VWS' in gpu_name: continue if gpu_name.startswith('TPU-'): @@ -344,6 +350,8 @@ def get_gpu_price(row: pd.Series, spot: bool) -> Optional[float]: gpu_name = row['AcceleratorName'] if gpu_name == 'A100-80GB': gpu_name = 'A100 80GB' + if gpu_name == 'H100': + gpu_name = 'H100 80GB' if f'{gpu_name} GPU' not in sku['description']: continue diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index b4ea3c1dc25..24b48972166 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -92,6 +92,9 @@ 4: ['g2-standard-48'], 8: ['g2-standard-96'], }, + 'H100': { + 8: ['a3-highgpu-8g'], + } } # Number of CPU cores per GPU based on the AWS setting.