From a37fa69b6105a7f8bd67031b50e9b9d732b24b93 Mon Sep 17 00:00:00 2001
From: yangchenjun <yang.chenjun@99cloud.net>
Date: Wed, 3 Jul 2024 10:59:56 +0800
Subject: [PATCH] device can now boot with different gpu resource names

---
 deploy/install-open-hydra-keystone.yaml | 15 +++++++--
 deploy/install-open-hydra.yaml          | 15 +++++++--
 pkg/open-hydra/apis/api.go              | 22 ++++++-------
 pkg/open-hydra/device-handler.go        | 36 +++++++++++++++++++--
 pkg/open-hydra/k8s/faker.go             | 20 +++++++++---
 pkg/open-hydra/open-hydra_test.go       | 43 ++++++++++++++++++-------
 6 files changed, 117 insertions(+), 34 deletions(-)

diff --git a/deploy/install-open-hydra-keystone.yaml b/deploy/install-open-hydra-keystone.yaml
index 855d624..6846ddf 100644
--- a/deploy/install-open-hydra-keystone.yaml
+++ b/deploy/install-open-hydra-keystone.yaml
@@ -191,7 +191,10 @@ data:
         "sandboxes": {
             "xedu": {
                 "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
-                "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
+                "gpuImageSet": {
+                    "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
+                    "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan"
+                },
                 "icon_name": "jupyter-lab.png",
                 "command": [],
                 "args": [],
@@ -260,7 +263,10 @@ data:
             },
             "keras": {
                 "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:2.15.0-jupyter-cpu",
-                "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
+                "gpuImageSet": {
+                    "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
+                    "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab"
+                },
                 "icon_name": "keras.png",
                 "command": [],
                 "args": [],
@@ -316,7 +322,10 @@ data:
             },
             "PaddlePaddle": {
                 "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-jlab",
-                "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
+                "gpuImageSet": {
+                    "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
+                    "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab"
+                },
                 "icon_name": "PaddlePaddle.png",
                 "command": [],
                 "args": [],
diff --git a/deploy/install-open-hydra.yaml b/deploy/install-open-hydra.yaml
index 29e5691..e5cb8dc 100644
--- a/deploy/install-open-hydra.yaml
+++ b/deploy/install-open-hydra.yaml
@@ -178,7 +178,10 @@ data:
         "sandboxes": {
             "xedu": {
                 "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
-                "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
+                "gpuImageSet": {
+                    "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
+                    "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan"
+                },
                 "icon_name": "jupyter-lab.png",
                 "command": [],
                 "args": [],
@@ -247,7 +250,10 @@ data:
             },
             "keras": {
                 "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:2.15.0-jupyter-cpu",
-                "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
+                "gpuImageSet": {
+                    "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
+                    "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab"
+                },
                 "icon_name": "keras.png",
                 "command": [],
                 "args": [],
@@ -303,7 +309,10 @@ data:
             },
             "PaddlePaddle": {
                 "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-jlab",
-                "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
+                "gpuImageSet": {
+                    "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
+                    "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab"
+                },
                 "icon_name": "PaddlePaddle.png",
                 "command": [],
                 "args": [],
diff --git a/pkg/open-hydra/apis/api.go b/pkg/open-hydra/apis/api.go
index 6b08912..ee83bf4 100644
--- a/pkg/open-hydra/apis/api.go
+++ b/pkg/open-hydra/apis/api.go
@@ -35,17 +35,17 @@ type GpuSet struct {
 
 // +k8s:openapi-gen=true
 type Sandbox struct {
-	CPUImageName    string        `json:"cpuImageName,omitempty"`
-	GPUImageName    string        `json:"gpuImageName,omitempty"`
-	Command         []string      `json:"command,omitempty"`
-	Args            []string      `json:"args,omitempty"`
-	Description     string        `json:"description,omitempty"`
-	DevelopmentInfo []string      `json:"developmentInfo,omitempty"`
-	Status          string        `json:"status,omitempty"`
-	Ports           []uint16      `json:"ports,omitempty"`
-	VolumeMounts    []VolumeMount `json:"volume_mounts,omitempty"`
-	Volumes         []Volume      `json:"volumes,omitempty"`
-	IconName        string        `json:"icon_name,omitempty"`
+	CPUImageName    string            `json:"cpuImageName,omitempty"`
+	GPUImageSet     map[string]string `json:"gpuImageSet,omitempty"`
+	Command         []string          `json:"command,omitempty"`
+	Args            []string          `json:"args,omitempty"`
+	Description     string            `json:"description,omitempty"`
+	DevelopmentInfo []string          `json:"developmentInfo,omitempty"`
+	Status          string            `json:"status,omitempty"`
+	Ports           []uint16          `json:"ports,omitempty"`
+	VolumeMounts    []VolumeMount     `json:"volume_mounts,omitempty"`
+	Volumes         []Volume          `json:"volumes,omitempty"`
+	IconName        string            `json:"icon_name,omitempty"`
 }
 
 // +k8s:openapi-gen=true
diff --git a/pkg/open-hydra/device-handler.go b/pkg/open-hydra/device-handler.go
index bd73fa7..7eb2ecb 100644
--- a/pkg/open-hydra/device-handler.go
+++ b/pkg/open-hydra/device-handler.go
@@ -243,8 +243,40 @@ func (builder *OpenHydraRouteBuilder) DeviceCreateRouteHandler(request *restful.
 		// set image with different hardware type if match
 		if gpuSet.Gpu > 0 {
 			// go with gpu image
-			image = plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageName
-			slog.Debug(fmt.Sprintf("set image to gpu image '%s'", image))
+			if reqDevice.Spec.GpuDriver == "" {
+				writeHttpResponseAndLogError(response, http.StatusBadRequest, "gpu driver is empty")
+				return
+			}
+
+			// ensure the requested gpu driver is allowed:
+			// it must be one of the gpu resource keys listed in the config
+			gpuIsAllowed := false
+			for _, gpuAllowed := range builder.Config.GpuResourceKeys {
+				if gpuAllowed == reqDevice.Spec.GpuDriver {
+					gpuIsAllowed = true
+					break
+				}
+			}
+			if !gpuIsAllowed {
+				writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu driver %s is not allowed", reqDevice.Spec.GpuDriver))
+				return
+			}
+
+			// ensure the requested gpu driver has an entry in GPUImageSet
+			// there is intentionally no default fallback option here,
+			// because different gpus must use different images, especially gpus that are not CUDA compatible
+			if _, found := plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver]; !found {
+				writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu image %s not found in sandbox %s", reqDevice.Spec.GpuDriver, reqDevice.Spec.SandboxName))
+				return
+			}
+
+			if plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver] == "" {
+				writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu image %s is empty in sandbox %s", reqDevice.Spec.GpuDriver, reqDevice.Spec.SandboxName))
+				return
+			}
+
+			image = plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver]
+			slog.Debug(fmt.Sprintf("set image to gpu image '%s' with driver name '%s'", image, reqDevice.Spec.GpuDriver))
 		} else {
 			// go with cpu image
 			image = plugins.Sandboxes[reqDevice.Spec.SandboxName].CPUImageName
diff --git a/pkg/open-hydra/k8s/faker.go b/pkg/open-hydra/k8s/faker.go
index 70ddf5a..01ef012 100644
--- a/pkg/open-hydra/k8s/faker.go
+++ b/pkg/open-hydra/k8s/faker.go
@@ -120,7 +120,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
 			"plugins": `{"sandboxes":{
 				"test": {
 					"cpuImageName": "test",
-					"gpuImageName": "test",
+					"gpuImageSet": {
+						"nvidia.com/gpu": "nvidia-gpu-image",
+						"amd.com/gpu": ""
+					},
 					"icon_name": "test1.png",
 					"command": ["test"],
 					"description": "test",
@@ -149,7 +152,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
 				},
 				"jupyter-lab": {
 					"cpuImageName": "jupyter-lab-test",
-					"gpuImageName": "jupyter-lab-test",
+					"gpuImageSet": {
+						"nvidia.com/gpu": "nvidia-gpu-image",
+						"amd.com/gpu": ""
+					},
 					"icon_name": "test2.png",
 					"command": ["jupyter-lab-test"],
 					"description": "jupyter-lab-test",
@@ -178,7 +184,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
 				},
 				"jupyter-lab-lot-ports": {
 					"cpuImageName": "jupyter-lab-test",
-					"gpuImageName": "jupyter-lab-test",
+					"gpuImageSet": {
+						"nvidia.com/gpu": "nvidia-gpu-image",
+						"amd.com/gpu": ""
+					},
 					"icon_name": "test3.png",
 					"command": ["jupyter-lab-test"],
 					"description": "jupyter-lab-test",
@@ -210,7 +219,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
 				},
 				"jupyter-lab-not-ports": {
 					"cpuImageName": "jupyter-lab-test",
-					"gpuImageName": "jupyter-lab-test",
+					"gpuImageSet": {
+						"nvidia.com/gpu": "nvidia-gpu-image",
+						"amd.com/gpu": ""
+					},
 					"icon_name": "test4.png",
 					"command": ["jupyter-lab-test"],
 					"description": "jupyter-lab-test",
diff --git a/pkg/open-hydra/open-hydra_test.go b/pkg/open-hydra/open-hydra_test.go
index c78a9bb..a45c274 100644
--- a/pkg/open-hydra/open-hydra_test.go
+++ b/pkg/open-hydra/open-hydra_test.go
@@ -407,7 +407,9 @@ var _ = Describe("open-hydra-server combineDeviceList test", func() {
 		pluginList = apis.PluginList{
 			Sandboxes: map[string]apis.Sandbox{
 				"jupyter-lab": apis.Sandbox{
-					GPUImageName: "test",
+					GPUImageSet: map[string]string{
+						"nvidia.com/gpu": "nvidia-gpu-image",
+					},
 					CPUImageName: "test",
 					Command:      []string{"test"},
 					Description:  "test",
@@ -607,7 +609,7 @@ var _ = Describe("open-hydra-server authorization test", func() {
 	var fakeDb *database.Faker
 	var container *restful.Container
 	var req *restful.Request
-	var device1, device2, device3, device4, device5, device6 *xDeviceV1.Device
+	var device1, device2, device3, device4, device5, device6, device7, device8 *xDeviceV1.Device
 	var setting *xSetting.Setting
 	var openHydraUsersURL = fmt.Sprintf("http://localhost/apis/%s/v1/%s", option.GroupVersion.Group, OpenHydraUserPath)
 	var openHydraDevicesURL = fmt.Sprintf("http://localhost/apis/%s/v1/%s", option.GroupVersion.Group, DevicePath)
@@ -663,7 +665,7 @@ var _ = Describe("open-hydra-server authorization test", func() {
 
 		return defaultHeader
 	}
-	var createDevice = func(name, sandboxName string, gpu uint8) *xDeviceV1.Device {
+	var createDevice = func(name, sandboxName, gpuDriver string, gpu uint8) *xDeviceV1.Device {
 		return &xDeviceV1.Device{
 			ObjectMeta: metaV1.ObjectMeta{
 				Name: name,
@@ -672,6 +674,7 @@ var _ = Describe("open-hydra-server authorization test", func() {
 				OpenHydraUsername: name,
 				DeviceGpu:         gpu,
 				SandboxName:       sandboxName,
+				GpuDriver:         gpuDriver,
 			},
 		}
 	}
@@ -792,12 +795,14 @@ var _ = Describe("open-hydra-server authorization test", func() {
 		student = createFakeUser("student", "student", 2)
 		newTeacher = createFakeUser("newTeacher", "newTeacher", 1)
 		newStudent = createFakeUser("newStudent", "newStudent", 2)
-		device1 = createDevice("teacher", "jupyter-lab", 1)
-		device2 = createDevice("student", "jupyter-lab", 0)
-		device3 = createDevice("student", "jupyter-lab", 1)
-		device4 = createDevice("student", "", 1)
-		device5 = createDevice("student", "jupyter-lab-lot-ports", 0)
-		device6 = createDevice("student", "jupyter-lab-no-ports", 0)
+		device1 = createDevice("teacher", "jupyter-lab", "nvidia.com/gpu", 1)
+		device2 = createDevice("student", "jupyter-lab", "", 0)
+		device3 = createDevice("student", "jupyter-lab", "nvidia.com/gpu", 1)
+		device4 = createDevice("student", "", "nvidia.com/gpu", 1)
+		device5 = createDevice("student", "jupyter-lab-lot-ports", "", 0)
+		device6 = createDevice("student", "jupyter-lab-no-ports", "", 0)
+		device7 = createDevice("student", "jupyter-lab", "amd.com/gpu", 1)
+		device8 = createDevice("student", "jupyter-lab", "huawei", 1)
 		setting = &xSetting.Setting{
 			ObjectMeta: metaV1.ObjectMeta{
 				Name: "default",
@@ -981,6 +986,22 @@ var _ = Describe("open-hydra-server authorization test", func() {
 
 		})
 
+		It("open-hydra device should be rejected due to image not set in plugin config", func() {
+			// device7 requests amd.com/gpu, whose image is empty in the fake plugin config, so it should be denied
+			body3, err := json.Marshal(device7)
+			Expect(err).To(BeNil())
+			_, r2 := callApi(http.MethodPost, openHydraDevicesURL, createTokenValue(teacher, nil), bytes.NewReader(body3))
+			Expect(r2.Code).To(Equal(http.StatusBadRequest))
+		})
+
+		It("open-hydra device should be rejected due to gpu huawei is not pre-config so it't not allowed to use", func() {
+			// device8 requests the gpu driver "huawei", which is not pre-configured, so it should be denied
+			body3, err := json.Marshal(device8)
+			Expect(err).To(BeNil())
+			_, r2 := callApi(http.MethodPost, openHydraDevicesURL, createTokenValue(teacher, nil), bytes.NewReader(body3))
+			Expect(r2.Code).To(Equal(http.StatusBadRequest))
+		})
+
 		It("open-hydra device create should be rejected due to no sandbox name is set", func() {
 			body1, err := json.Marshal(device4)
 			Expect(err).To(BeNil())
@@ -1053,9 +1074,9 @@ var _ = Describe("open-hydra-server authorization test", func() {
 			Expect(target.Spec.PluginList.Sandboxes["test"].IconName).To(Equal("test1.png"))
 			Expect(target.Spec.PluginList.Sandboxes["test"].VolumeMounts[0].Name).To(Equal("jupyter-lab"))
 			Expect(target.Spec.PluginList.Sandboxes["test"].VolumeMounts[0].MountPath).To(Equal("/root/notebook"))
-			Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].GPUImageName).To(Equal("jupyter-lab-test"))
+			Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].GPUImageSet["nvidia.com/gpu"]).To(Equal("nvidia-gpu-image"))
 			Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].IconName).To(Equal("test2.png"))
-			Expect(target.Spec.PluginList.Sandboxes["jupyter-lab-lot-ports"].GPUImageName).To(Equal("jupyter-lab-test"))
+			Expect(target.Spec.PluginList.Sandboxes["jupyter-lab-lot-ports"].GPUImageSet["nvidia.com/gpu"]).To(Equal("nvidia-gpu-image"))
 		})
 
 		It("open-hydra update setting by teacher should be ok", func() {