Skip to content

Commit

Permalink
[k8s] Add label validation during sky check (#2653)
Browse files Browse the repository at this point in the history
* Add label validation

* early return

* lint

* update docs

* fix test

* update monkey patching

* fixes
  • Loading branch information
romilbhardwaj authored Nov 19, 2023
1 parent 3a7c858 commit feacc9f
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 22 deletions.
11 changes: 7 additions & 4 deletions docs/source/reference/kubernetes/kubernetes-setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,10 @@ Please follow their respective guides to deploy your Kubernetes cluster.

Setting up GPU support
~~~~~~~~~~~~~~~~~~~~~~
If your Kubernetes cluster has Nvidia GPUs, make sure you have the Nvidia
device plugin installed (i.e., ``nvidia.com/gpu`` resource is available on each node).
Additionally, you will need to label each node in your cluster with the GPU type.
For example, a node with v100 GPUs must have a label :code:`skypilot.co/accelerators: v100`.
If your Kubernetes cluster has Nvidia GPUs, ensure that:

1. The Nvidia device plugin is installed (i.e., ``nvidia.com/gpu`` resource is available on each node).
2. Each node in your cluster is labeled with its GPU type. To do so, add a label of the format ``skypilot.co/accelerators: <gpu_name>``, where ``<gpu_name>`` is the lowercase name of the GPU. For example, a node with V100 GPUs must have the label :code:`skypilot.co/accelerators: v100`.

We provide a convenience script that automatically detects GPU types and labels each node. You can run it with:

Expand All @@ -185,6 +185,9 @@ We provide a convenience script that automatically detects GPU types and labels
You can check if nodes have been labeled by running ``kubectl describe nodes`` and looking for labels of the format ``skypilot.co/accelerators: <gpu_name>``.
.. note::
GPU labels are case-sensitive. Ensure that the GPU name is lowercase if you are using the ``skypilot.co/accelerators`` label.

.. note::
GPU labeling is not required on GKE clusters; SkyPilot will automatically use the GKE-provided labels. However, you will still need to install `drivers <https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers>`_.

Expand Down
73 changes: 56 additions & 17 deletions sky/utils/kubernetes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,20 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
"""Given a label value, returns the GPU type"""
raise NotImplementedError

@classmethod
def validate_label_value(cls, value: str) -> Tuple[bool, str]:
"""Validates if the specified label value is correct.
Used to check if the labelling on the cluster is correct and
preemptively raise an error if it is not.
This implementation accepts every value.
Returns:
bool: True if the label value is valid, False otherwise.
str: Error message if the label value is invalid, empty string
otherwise.
"""
# The value is not inspected here; drop the unused name.
del value
return True, ''


def get_gke_accelerator_name(accelerator: str) -> str:
"""Returns the accelerator name for GKE clusters
Expand Down Expand Up @@ -97,6 +111,14 @@ def get_label_value(cls, accelerator: str) -> str:
def get_accelerator_from_label_value(cls, value: str) -> str:
return value.upper()

@classmethod
def validate_label_value(cls, value: str) -> Tuple[bool, str]:
"""Values must be all lowercase for the SkyPilot formatter.
Returns:
bool: True if ``value`` equals its lowercased form, False otherwise.
str: Error message naming the offending value and the label key if
the value is invalid, empty string otherwise.
"""
# Note: unlike str.islower(), this comparison also accepts values that
# contain no cased characters (e.g. digits only).
is_valid = value == value.lower()
return is_valid, (f'Label value {value!r} must be lowercase if using '
f'the {cls.get_label_key()} label.'
if not is_valid else '')


class CoreWeaveLabelFormatter(GPULabelFormatter):
"""CoreWeave label formatter
Expand Down Expand Up @@ -157,31 +179,32 @@ def get_accelerator_from_label_value(cls, value: str) -> str:


def detect_gpu_label_formatter(
) -> Tuple[Optional[GPULabelFormatter], List[Tuple[str, str]]]:
) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]:
"""Detects the GPU label formatter for the Kubernetes cluster
Returns:
GPULabelFormatter: The GPU label formatter for the cluster, if found.
List[Tuple[str, str]]: The set of labels and values across all nodes.
Dict[str, List[Tuple[str, str]]]: A mapping of nodes and the list of
labels on each node. E.g., {'node1': [('label1', 'value1')]}
"""
# Get all labels across all nodes
node_labels: List[Tuple[str, str]] = []
node_labels: Dict[str, List[Tuple[str, str]]] = {}
nodes = get_kubernetes_nodes()
for node in nodes:
node_labels.extend(node.metadata.labels.items())
node_labels[node.metadata.name] = []
for label, value in node.metadata.labels.items():
node_labels[node.metadata.name].append((label, value))

label_formatter = None

# Check if the node labels contain any of the GPU label prefixes
for lf in LABEL_FORMATTER_REGISTRY:
label_key = lf.get_label_key()
for label, _ in node_labels:
if label.startswith(label_key):
label_formatter = lf()
break

if label_formatter is not None:
break
for _, label_list in node_labels.items():
for label, _ in label_list:
if label.startswith(label_key):
label_formatter = lf()
return label_formatter, node_labels

return label_formatter, node_labels

Expand Down Expand Up @@ -343,6 +366,17 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
'the documentation on how to set up node labels.'
f'{suffix}')
if label_formatter is not None:
# Validate the GPU label values on all nodes to ensure they are
# correctly set up and will behave as expected.
for node_name, label_list in node_labels.items():
for label, value in label_list:
if label == label_formatter.get_label_key():
is_valid, reason = label_formatter.validate_label_value(
value)
if not is_valid:
raise exceptions.ResourcesUnavailableError(
f'Node {node_name!r} in Kubernetes cluster has '
f'invalid GPU label: {label}={value}. {reason}')
if check_mode:
# If check mode is enabled and we reached so far, we can
# conclude that the cluster is setup correctly and return.
Expand All @@ -355,17 +389,22 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]:
# node. It does not (and should not) check if the resource
# quantity is available since that is dynamic and can change
# during scheduling.
for label, value in node_labels:
if label == k8s_acc_label_key and value == k8s_acc_label_value:
# If a node is found, we can break out of the loop
# and proceed to deploy.
return k8s_acc_label_key, k8s_acc_label_value
for node_name, label_list in node_labels.items():
for label, value in label_list:
if (label == k8s_acc_label_key and
value == k8s_acc_label_value):
# If a node is found, we can break out of the loop
# and proceed to deploy.
return k8s_acc_label_key, k8s_acc_label_value
# If no node is found with the requested acc_type, raise error
with ux_utils.print_exception_no_traceback():
suffix = ''
if env_options.Options.SHOW_DEBUG_INFO.get():
all_labels = []
for node_name, label_list in node_labels.items():
all_labels.extend(label_list)
gpus_available = set(
v for k, v in node_labels if k == k8s_acc_label_key)
v for k, v in all_labels if k == k8s_acc_label_key)
suffix = f' Available GPUs on the cluster: {gpus_available}'
raise exceptions.ResourcesUnavailableError(
'Could not find any node in the Kubernetes cluster '
Expand Down
2 changes: 1 addition & 1 deletion tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _get_az_mappings(_):
# the cluster to detect available cluster resources.
monkeypatch.setattr(
'sky.utils.kubernetes_utils.detect_gpu_label_formatter',
lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, []])
lambda *_args, **_kwargs: [kubernetes_utils.SkyPilotLabelFormatter, {}])
monkeypatch.setattr('sky.utils.kubernetes_utils.detect_gpu_resource',
lambda *_args, **_kwargs: [True, []])
monkeypatch.setattr('sky.utils.kubernetes_utils.check_instance_fits',
Expand Down

0 comments on commit feacc9f

Please sign in to comment.