From a37fa69b6105a7f8bd67031b50e9b9d732b24b93 Mon Sep 17 00:00:00 2001 From: yangchenjun Date: Wed, 3 Jul 2024 10:59:56 +0800 Subject: [PATCH] device now can boot with different gpu resource names --- deploy/install-open-hydra-keystone.yaml | 15 +++++++-- deploy/install-open-hydra.yaml | 15 +++++++-- pkg/open-hydra/apis/api.go | 22 ++++++------- pkg/open-hydra/device-handler.go | 36 +++++++++++++++++++-- pkg/open-hydra/k8s/faker.go | 20 +++++++++--- pkg/open-hydra/open-hydra_test.go | 43 ++++++++++++++++++------- 6 files changed, 117 insertions(+), 34 deletions(-) diff --git a/deploy/install-open-hydra-keystone.yaml b/deploy/install-open-hydra-keystone.yaml index 855d624..6846ddf 100644 --- a/deploy/install-open-hydra-keystone.yaml +++ b/deploy/install-open-hydra-keystone.yaml @@ -191,7 +191,10 @@ data: "sandboxes": { "xedu": { "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan", - "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan", + "gpuImageSet": { + "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan", + "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan" + }, "icon_name": "jupyter-lab.png", "command": [], "args": [], @@ -260,7 +263,10 @@ data: }, "keras": { "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:2.15.0-jupyter-cpu", - "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab", + "gpuImageSet": { + "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab", + "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab" + }, "icon_name": "keras.png", "command": [], "args": [], @@ -316,7 +322,10 @@ data: }, "PaddlePaddle": { "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-jlab", - "gpuImageName": 
"registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab", + "gpuImageSet": { + "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab", + "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab" + }, "icon_name": "PaddlePaddle.png", "command": [], "args": [], diff --git a/deploy/install-open-hydra.yaml b/deploy/install-open-hydra.yaml index 29e5691..e5cb8dc 100644 --- a/deploy/install-open-hydra.yaml +++ b/deploy/install-open-hydra.yaml @@ -178,7 +178,10 @@ data: "sandboxes": { "xedu": { "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan", - "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan", + "gpuImageSet": { + "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan", + "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan" + }, "icon_name": "jupyter-lab.png", "command": [], "args": [], @@ -247,7 +250,10 @@ data: }, "keras": { "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:2.15.0-jupyter-cpu", - "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab", + "gpuImageSet": { + "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab", + "amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab" + }, "icon_name": "keras.png", "command": [], "args": [], @@ -303,7 +309,10 @@ data: }, "PaddlePaddle": { "cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-jlab", - "gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab", + "gpuImageSet": { + "nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab", + "amd.com/gpu": 
"registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab" + }, "icon_name": "PaddlePaddle.png", "command": [], "args": [], diff --git a/pkg/open-hydra/apis/api.go b/pkg/open-hydra/apis/api.go index 6b08912..ee83bf4 100644 --- a/pkg/open-hydra/apis/api.go +++ b/pkg/open-hydra/apis/api.go @@ -35,17 +35,17 @@ type GpuSet struct { // +k8s:openapi-gen=true type Sandbox struct { - CPUImageName string `json:"cpuImageName,omitempty"` - GPUImageName string `json:"gpuImageName,omitempty"` - Command []string `json:"command,omitempty"` - Args []string `json:"args,omitempty"` - Description string `json:"description,omitempty"` - DevelopmentInfo []string `json:"developmentInfo,omitempty"` - Status string `json:"status,omitempty"` - Ports []uint16 `json:"ports,omitempty"` - VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"` - Volumes []Volume `json:"volumes,omitempty"` - IconName string `json:"icon_name,omitempty"` + CPUImageName string `json:"cpuImageName,omitempty"` + GPUImageSet map[string]string `json:"gpuImageSet,omitempty"` + Command []string `json:"command,omitempty"` + Args []string `json:"args,omitempty"` + Description string `json:"description,omitempty"` + DevelopmentInfo []string `json:"developmentInfo,omitempty"` + Status string `json:"status,omitempty"` + Ports []uint16 `json:"ports,omitempty"` + VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"` + Volumes []Volume `json:"volumes,omitempty"` + IconName string `json:"icon_name,omitempty"` } // +k8s:openapi-gen=true diff --git a/pkg/open-hydra/device-handler.go b/pkg/open-hydra/device-handler.go index bd73fa7..7eb2ecb 100644 --- a/pkg/open-hydra/device-handler.go +++ b/pkg/open-hydra/device-handler.go @@ -243,8 +243,40 @@ func (builder *OpenHydraRouteBuilder) DeviceCreateRouteHandler(request *restful. 
// set image with different hardware type if match if gpuSet.Gpu > 0 { // go with gpu image - image = plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageName - slog.Debug(fmt.Sprintf("set image to gpu image '%s'", image)) + if reqDevice.Spec.GpuDriver == "" { + writeHttpResponseAndLogError(response, http.StatusBadRequest, "gpu driver is empty") + return + } + + // ensure gpu is allowed + // should be in config + gpuIsAllowed := false + for _, gpuAllowed := range builder.Config.GpuResourceKeys { + if gpuAllowed == reqDevice.Spec.GpuDriver { + gpuIsAllowed = true + break + } + } + if !gpuIsAllowed { + writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu driver %s is not allowed", reqDevice.Spec.GpuDriver)) + return + } + + // ensure key is found in GPUImageSet + // we deliberately do not provide any default fallback option here + // because different gpus must go with different images, especially for non-CUDA-compatible gpus + if _, found := plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver]; !found { + writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu image %s not found in sandbox %s", reqDevice.Spec.GpuDriver, reqDevice.Spec.SandboxName)) + return + } + + if plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver] == "" { + writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu image %s is empty in sandbox %s", reqDevice.Spec.GpuDriver, reqDevice.Spec.SandboxName)) + return + } + + image = plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver] + slog.Debug(fmt.Sprintf("set image to gpu image '%s' with driver name '%s'", image, reqDevice.Spec.GpuDriver)) } else { // go with cpu image image = plugins.Sandboxes[reqDevice.Spec.SandboxName].CPUImageName diff --git a/pkg/open-hydra/k8s/faker.go b/pkg/open-hydra/k8s/faker.go index 70ddf5a..01ef012 100644 --- a/pkg/open-hydra/k8s/faker.go +++ 
b/pkg/open-hydra/k8s/faker.go @@ -120,7 +120,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co "plugins": `{"sandboxes":{ "test": { "cpuImageName": "test", - "gpuImageName": "test", + "gpuImageSet": { + "nvidia.com/gpu": "nvidia-gpu-image", + "amd.com/gpu": "" + }, "icon_name": "test1.png", "command": ["test"], "description": "test", @@ -149,7 +152,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co }, "jupyter-lab": { "cpuImageName": "jupyter-lab-test", - "gpuImageName": "jupyter-lab-test", + "gpuImageSet": { + "nvidia.com/gpu": "nvidia-gpu-image", + "amd.com/gpu": "" + }, "icon_name": "test2.png", "command": ["jupyter-lab-test"], "description": "jupyter-lab-test", @@ -178,7 +184,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co }, "jupyter-lab-lot-ports": { "cpuImageName": "jupyter-lab-test", - "gpuImageName": "jupyter-lab-test", + "gpuImageSet": { + "nvidia.com/gpu": "nvidia-gpu-image", + "amd.com/gpu": "" + }, "icon_name": "test3.png", "command": ["jupyter-lab-test"], "description": "jupyter-lab-test", @@ -210,7 +219,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co }, "jupyter-lab-not-ports": { "cpuImageName": "jupyter-lab-test", - "gpuImageName": "jupyter-lab-test", + "gpuImageSet": { + "nvidia.com/gpu": "nvidia-gpu-image", + "amd.com/gpu": "" + }, "icon_name": "test4.png", "command": ["jupyter-lab-test"], "description": "jupyter-lab-test", diff --git a/pkg/open-hydra/open-hydra_test.go b/pkg/open-hydra/open-hydra_test.go index c78a9bb..a45c274 100644 --- a/pkg/open-hydra/open-hydra_test.go +++ b/pkg/open-hydra/open-hydra_test.go @@ -407,7 +407,9 @@ var _ = Describe("open-hydra-server combineDeviceList test", func() { pluginList = apis.PluginList{ Sandboxes: map[string]apis.Sandbox{ "jupyter-lab": apis.Sandbox{ - GPUImageName: "test", + GPUImageSet: map[string]string{ + "nvidia.com/gpu": "nvidia-gpu-image", + }, 
CPUImageName: "test", Command: []string{"test"}, Description: "test", @@ -607,7 +609,7 @@ var _ = Describe("open-hydra-server authorization test", func() { var fakeDb *database.Faker var container *restful.Container var req *restful.Request - var device1, device2, device3, device4, device5, device6 *xDeviceV1.Device + var device1, device2, device3, device4, device5, device6, device7, device8 *xDeviceV1.Device var setting *xSetting.Setting var openHydraUsersURL = fmt.Sprintf("http://localhost/apis/%s/v1/%s", option.GroupVersion.Group, OpenHydraUserPath) var openHydraDevicesURL = fmt.Sprintf("http://localhost/apis/%s/v1/%s", option.GroupVersion.Group, DevicePath) @@ -663,7 +665,7 @@ var _ = Describe("open-hydra-server authorization test", func() { return defaultHeader } - var createDevice = func(name, sandboxName string, gpu uint8) *xDeviceV1.Device { + var createDevice = func(name, sandboxName, gpuDriver string, gpu uint8) *xDeviceV1.Device { return &xDeviceV1.Device{ ObjectMeta: metaV1.ObjectMeta{ Name: name, @@ -672,6 +674,7 @@ var _ = Describe("open-hydra-server authorization test", func() { OpenHydraUsername: name, DeviceGpu: gpu, SandboxName: sandboxName, + GpuDriver: gpuDriver, }, } } @@ -792,12 +795,14 @@ var _ = Describe("open-hydra-server authorization test", func() { student = createFakeUser("student", "student", 2) newTeacher = createFakeUser("newTeacher", "newTeacher", 1) newStudent = createFakeUser("newStudent", "newStudent", 2) - device1 = createDevice("teacher", "jupyter-lab", 1) - device2 = createDevice("student", "jupyter-lab", 0) - device3 = createDevice("student", "jupyter-lab", 1) - device4 = createDevice("student", "", 1) - device5 = createDevice("student", "jupyter-lab-lot-ports", 0) - device6 = createDevice("student", "jupyter-lab-no-ports", 0) + device1 = createDevice("teacher", "jupyter-lab", "nvidia.com/gpu", 1) + device2 = createDevice("student", "jupyter-lab", "", 0) + device3 = createDevice("student", "jupyter-lab", "nvidia.com/gpu", 1) 
+ device4 = createDevice("student", "", "nvidia.com/gpu", 1) + device5 = createDevice("student", "jupyter-lab-lot-ports", "", 0) + device6 = createDevice("student", "jupyter-lab-no-ports", "", 0) + device7 = createDevice("student", "jupyter-lab", "amd.com/gpu", 1) + device8 = createDevice("student", "jupyter-lab", "huawei", 1) setting = &xSetting.Setting{ ObjectMeta: metaV1.ObjectMeta{ Name: "default", @@ -981,6 +986,22 @@ var _ = Describe("open-hydra-server authorization test", func() { }) + It("open-hydra device should be rejected due to image not set in plugin config", func() { + // device7 requests gpu driver "amd.com/gpu", whose image is empty in the fake plugin config + body3, err := json.Marshal(device7) + Expect(err).To(BeNil()) + _, r2 := callApi(http.MethodPost, openHydraDevicesURL, createTokenValue(teacher, nil), bytes.NewReader(body3)) + Expect(r2.Code).To(Equal(http.StatusBadRequest)) + }) + + It("open-hydra device should be rejected due to gpu huawei is not pre-config so it't not allowed to use", func() { + // device8 requests gpu driver "huawei", which is expected to be rejected as not allowed + body3, err := json.Marshal(device8) + Expect(err).To(BeNil()) + _, r2 := callApi(http.MethodPost, openHydraDevicesURL, createTokenValue(teacher, nil), bytes.NewReader(body3)) + Expect(r2.Code).To(Equal(http.StatusBadRequest)) + }) + It("open-hydra device create should be rejected due to no sandbox name is set", func() { body1, err := json.Marshal(device4) Expect(err).To(BeNil()) @@ -1053,9 +1074,9 @@ var _ = Describe("open-hydra-server authorization test", func() { Expect(target.Spec.PluginList.Sandboxes["test"].IconName).To(Equal("test1.png")) Expect(target.Spec.PluginList.Sandboxes["test"].VolumeMounts[0].Name).To(Equal("jupyter-lab")) Expect(target.Spec.PluginList.Sandboxes["test"].VolumeMounts[0].MountPath).To(Equal("/root/notebook")) - Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].GPUImageName).To(Equal("jupyter-lab-test")) + Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].GPUImageSet["nvidia.com/gpu"]).To(Equal("nvidia-gpu-image")) 
Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].IconName).To(Equal("test2.png")) - Expect(target.Spec.PluginList.Sandboxes["jupyter-lab-lot-ports"].GPUImageName).To(Equal("jupyter-lab-test")) + Expect(target.Spec.PluginList.Sandboxes["jupyter-lab-lot-ports"].GPUImageSet["nvidia.com/gpu"]).To(Equal("nvidia-gpu-image")) }) It("open-hydra update setting by teacher should be ok", func() {