
Add e2e tests using GPU to execute current test scenarios
sutaakar authored and openshift-merge-bot[bot] committed Jul 11, 2024
1 parent 685b0b6 commit c308c0f
Showing 8 changed files with 83 additions and 44 deletions.
30 changes: 8 additions & 22 deletions .github/workflows/e2e_tests.yaml
@@ -30,26 +30,9 @@ env:
jobs:
kubernetes:

runs-on: ubuntu-20.04
runs-on: ubuntu-20.04-4core-gpu

steps:
- name: Cleanup
run: |
ls -lart
echo "Initial status:"
df -h
echo "Cleaning up resources:"
sudo swapoff -a
sudo rm -f /swapfile
sudo apt clean
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
docker rmi $(docker image ls -aq)
echo "Final status:"
df -h
- name: Checkout code
uses: actions/checkout@v4
with:
@@ -85,9 +68,15 @@ jobs:
python-version: '3.9'
cache: 'pip' # caching pip dependencies

- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup

- name: Setup and start KinD cluster
uses: ./common/github-actions/kind

- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator

- name: Deploy CodeFlare stack
id: deploy
run: |
@@ -104,9 +93,6 @@ jobs:
with:
user-name: sdk-user

- name: Add kueue resources
run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"

- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
@@ -136,7 +122,7 @@ jobs:
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"

20 changes: 18 additions & 2 deletions docs/e2e.md
@@ -5,6 +5,9 @@
## On KinD clusters
Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127.0.0.1 kind`. This will map your localhost IP address to the KinD cluster's hostname. This is already performed on [GitHub Actions](https://github.com/project-codeflare/codeflare-common/blob/1edd775e2d4088a5a0bfddafb06ff3a773231c08/github-actions/kind/action.yml#L70-L72)

If the system you run on contains an NVIDIA GPU, you can enable GPU support in KinD, which also allows you to run the GPU tests.
To enable GPU support in KinD, follow [these instructions](https://www.substratus.ai/blog/kind-with-gpus).

- Setup Phase:
- Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets:
```
@@ -64,9 +67,13 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
- Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository:
```
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py
```
- If the cluster doesn't have NVIDIA GPU support, disable the NVIDIA GPU tests by providing the proper marker:
```
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py -m 'kind and not nvidia_gpu'
```
## On OpenShift clusters
@@ -83,6 +90,10 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev"
```

If the system you run on contains an NVIDIA GPU, you can enable GPU support on OpenShift, which also allows you to run the GPU tests.
To enable GPU support on OpenShift, follow [these instructions](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/introduction.html).
Currently the SDK doesn't support tolerations, so e2e tests can't be executed on nodes with a taint (i.e. a GPU taint).

- Test Phase:
- Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository:
```
@@ -97,3 +108,8 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127
```
poetry run pytest -v -s ./tests/e2e -m openshift --timeout=1200
```
- If the cluster doesn't have NVIDIA GPU support, or the GPU nodes have a taint, disable the NVIDIA GPU tests by providing the proper marker:
```
poetry install --with test,docs
poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py -m 'not nvidia_gpu'
```
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -52,6 +52,7 @@ filterwarnings = [
]
markers = [
"kind",
"openshift"
"openshift",
"nvidia_gpu"
]
addopts = "--timeout=900"
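
The new `nvidia_gpu` entry registers the marker so pytest can validate `-m` expressions such as `kind and nvidia_gpu` or `kind and not nvidia_gpu`. A minimal sketch of how such a marker is attached and inspected (the test name here is hypothetical; only the marker names come from the diff):

```python
import pytest

@pytest.mark.kind
@pytest.mark.nvidia_gpu
def test_gpu_on_kind():
    """Selected by `-m 'kind and nvidia_gpu'`; deselected by `-m 'not nvidia_gpu'`."""
    pass

# pytest stores marks on the function itself, so they can be inspected directly.
marks = {m.name for m in test_gpu_on_kind.pytestmark}
print(marks)
```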
16 changes: 13 additions & 3 deletions tests/e2e/local_interactive_sdk_kind_test.py
@@ -27,7 +27,16 @@ def test_local_interactives(self):
create_kueue_resources(self)
self.run_local_interactives()

def run_local_interactives(self):
@pytest.mark.nvidia_gpu
def test_local_interactives_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_local_interactives(number_of_gpus=1)

def run_local_interactives(
self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster_name = "test-ray-cluster-li"
@@ -43,6 +52,7 @@ def run_local_interactives(self):
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
@@ -59,7 +69,7 @@ def run_local_interactives(self):
ray.shutdown()
ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

@ray.remote
@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation_part(num_iterations):
result = 0.0
for i in range(num_iterations):
@@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
result += math.sin(i) * math.cos(j) * math.tan(k)
return result

@ray.remote
@ray.remote(num_gpus=number_of_gpus / 2)
def heavy_calculation(num_iterations):
results = ray.get(
[heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
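
Each of the two remote task types above now requests `number_of_gpus / 2`, so with a single GPU both can be scheduled concurrently through Ray's fractional GPU support, and a CPU-only run (`number_of_gpus=0`) requests no GPU at all. A small sketch of that arithmetic, assuming the allocation is split evenly across two task types:

```python
def per_task_gpu_request(number_of_gpus: int, task_types: int = 2) -> float:
    # Mirrors the @ray.remote(num_gpus=number_of_gpus / 2) decorators in the
    # test: the available GPUs are divided evenly between the task types.
    return number_of_gpus / task_types

print(per_task_gpu_request(1))  # 0.5 -> both task types fit on one GPU
print(per_task_gpu_request(0))  # 0.0 -> CPU-only run requests no GPU
```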
5 changes: 4 additions & 1 deletion tests/e2e/mnist.py
@@ -32,6 +32,9 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")


class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):

# Initialize a trainer
trainer = Trainer(
accelerator="auto",
accelerator=ACCELERATOR,
# devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
max_epochs=3,
callbacks=[TQDMProgressBar(refresh_rate=20)],
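
The trainer now takes its accelerator from the `ACCELERATOR` environment variable that the tests inject through `runtime_env`. A sketch of a defensive version of that lookup (the fallback default is an assumption; the diff reads the variable without one and relies on the harness always setting it):

```python
import os

def resolve_accelerator(default: str = "auto") -> str:
    # Read the value injected by the test harness via
    # runtime_env["env_vars"]["ACCELERATOR"]; fall back to a default
    # so the script can still run outside the harness.
    value = os.getenv("ACCELERATOR")
    return value if value else default

os.environ["ACCELERATOR"] = "gpu"
print(resolve_accelerator())  # gpu
```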
29 changes: 20 additions & 9 deletions tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
@@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

def run_mnist_raycluster_sdk_kind(self):
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

def run_mnist_raycluster_sdk_kind(
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster = Cluster(
@@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=4,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
@@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

cluster.details()

self.assert_jobsubmit_withoutlogin_kind(cluster)
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

@@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": accelerator},
},
entrypoint_num_gpus=number_of_gpus,
)
print(f"Submitted job with ID: {submission_id}")
done = False
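
The job submission now threads the accelerator through `runtime_env["env_vars"]` and reserves GPUs for the driver process via `entrypoint_num_gpus`. A sketch of the resulting submit arguments as a plain dict (the entrypoint string is a placeholder not shown in the diff; the other keys and values follow it):

```python
def build_submit_kwargs(accelerator: str, number_of_gpus: int) -> dict:
    return {
        "entrypoint": "python mnist.py",  # placeholder; not shown in the diff
        "runtime_env": {
            "working_dir": "./tests/e2e/",
            "pip": "./tests/e2e/mnist_pip_requirements.txt",
            # Read back by mnist.py as os.getenv("ACCELERATOR")
            "env_vars": {"ACCELERATOR": accelerator},
        },
        # Reserves GPU capacity for the entrypoint process itself
        "entrypoint_num_gpus": number_of_gpus,
    }

kwargs = build_submit_kwargs("gpu", 1)
print(kwargs["runtime_env"]["env_vars"])
```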
22 changes: 17 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind()
self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

def run_mnist_raycluster_sdk_kind(self):
@pytest.mark.nvidia_gpu
def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
self.setup_method()
create_namespace(self)
create_kueue_resources(self)
self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)

def run_mnist_raycluster_sdk_kind(
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
):
ray_image = get_ray_image()

cluster = Cluster(
@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
worker_memory_limits=4,
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
image=ray_image,
write_to_file=True,
verify_tls=False,
@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

cluster.details()

self.assert_jobsubmit_withoutlogin_kind(cluster)
self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

# Assertions

def assert_jobsubmit_withoutlogin_kind(self, cluster):
def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

@@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": {"ACCELERATOR": accelerator},
},
entrypoint_num_gpus=number_of_gpus,
)
print(f"Submitted job with ID: {submission_id}")
done = False
2 changes: 1 addition & 1 deletion tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
"resources": [
{"name": "cpu", "nominalQuota": 9},
{"name": "memory", "nominalQuota": "36Gi"},
{"name": "nvidia.com/gpu", "nominalQuota": 0},
{"name": "nvidia.com/gpu", "nominalQuota": 1},
],
}
],
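
Raising the `nvidia.com/gpu` quota from 0 to 1 lets Kueue admit workloads that request a single GPU. A sketch of the resource-group fragment the helper builds (the `coveredResources` list and flavor name are assumptions about surrounding context not shown in the diff; the quotas match it):

```python
def cluster_queue_resource_group(gpu_quota: int = 1) -> dict:
    # Shape follows a Kueue ClusterQueue spec.resourceGroups entry.
    return {
        "coveredResources": ["cpu", "memory", "nvidia.com/gpu"],  # assumed
        "flavors": [
            {
                "name": "default-flavor",  # hypothetical flavor name
                "resources": [
                    {"name": "cpu", "nominalQuota": 9},
                    {"name": "memory", "nominalQuota": "36Gi"},
                    {"name": "nvidia.com/gpu", "nominalQuota": gpu_quota},
                ],
            }
        ],
    }

group = cluster_queue_resource_group()
print(group["flavors"][0]["resources"][2])
```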
