Skip to content

Commit

Permalink
device can now boot with different gpu resource names
Browse files Browse the repository at this point in the history
  • Loading branch information
simonycj committed Jul 3, 2024
1 parent c7cc9a1 commit a37fa69
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 34 deletions.
15 changes: 12 additions & 3 deletions deploy/install-open-hydra-keystone.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,10 @@ data:
"sandboxes": {
"xedu": {
"cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
"gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
"gpuImageSet": {
"nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
"amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan"
},
"icon_name": "jupyter-lab.png",
"command": [],
"args": [],
Expand Down Expand Up @@ -260,7 +263,10 @@ data:
},
"keras": {
"cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:2.15.0-jupyter-cpu",
"gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
"gpuImageSet": {
"nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
"amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab"
},
"icon_name": "keras.png",
"command": [],
"args": [],
Expand Down Expand Up @@ -316,7 +322,10 @@ data:
},
"PaddlePaddle": {
"cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-jlab",
"gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
"gpuImageSet": {
"nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
"amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab"
},
"icon_name": "PaddlePaddle.png",
"command": [],
"args": [],
Expand Down
15 changes: 12 additions & 3 deletions deploy/install-open-hydra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,10 @@ data:
"sandboxes": {
"xedu": {
"cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
"gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
"gpuImageSet": {
"nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan",
"amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/jupyter:Python-3.8.18-dual-lan"
},
"icon_name": "jupyter-lab.png",
"command": [],
"args": [],
Expand Down Expand Up @@ -247,7 +250,10 @@ data:
},
"keras": {
"cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:2.15.0-jupyter-cpu",
"gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
"gpuImageSet": {
"nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab",
"amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/tensorflow:latest-gpu-with-jlab"
},
"icon_name": "keras.png",
"command": [],
"args": [],
Expand Down Expand Up @@ -303,7 +309,10 @@ data:
},
"PaddlePaddle": {
"cpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-jlab",
"gpuImageName": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
"gpuImageSet": {
"nvidia.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab",
"amd.com/gpu": "registry.cn-shanghai.aliyuncs.com/openhydra/paddle:2.6.0-gpu-cuda12.0-cudnn8.9-trt8.6-jlab"
},
"icon_name": "PaddlePaddle.png",
"command": [],
"args": [],
Expand Down
22 changes: 11 additions & 11 deletions pkg/open-hydra/apis/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,17 @@ type GpuSet struct {

// +k8s:openapi-gen=true
type Sandbox struct {
CPUImageName string `json:"cpuImageName,omitempty"`
GPUImageName string `json:"gpuImageName,omitempty"`
Command []string `json:"command,omitempty"`
Args []string `json:"args,omitempty"`
Description string `json:"description,omitempty"`
DevelopmentInfo []string `json:"developmentInfo,omitempty"`
Status string `json:"status,omitempty"`
Ports []uint16 `json:"ports,omitempty"`
VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"`
Volumes []Volume `json:"volumes,omitempty"`
IconName string `json:"icon_name,omitempty"`
CPUImageName string `json:"cpuImageName,omitempty"`
GPUImageSet map[string]string `json:"gpuImageSet,omitempty"`
Command []string `json:"command,omitempty"`
Args []string `json:"args,omitempty"`
Description string `json:"description,omitempty"`
DevelopmentInfo []string `json:"developmentInfo,omitempty"`
Status string `json:"status,omitempty"`
Ports []uint16 `json:"ports,omitempty"`
VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"`
Volumes []Volume `json:"volumes,omitempty"`
IconName string `json:"icon_name,omitempty"`
}

// +k8s:openapi-gen=true
Expand Down
36 changes: 34 additions & 2 deletions pkg/open-hydra/device-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,40 @@ func (builder *OpenHydraRouteBuilder) DeviceCreateRouteHandler(request *restful.
// set image with different hardware type if match
if gpuSet.Gpu > 0 {
// go with gpu image
image = plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageName
slog.Debug(fmt.Sprintf("set image to gpu image '%s'", image))
if reqDevice.Spec.GpuDriver == "" {
writeHttpResponseAndLogError(response, http.StatusBadRequest, "gpu driver is empty")
return
}

// ensure gpu is allowed
// should be in config
gpuIsAllowed := false
for _, gpuAllowed := range builder.Config.GpuResourceKeys {
if gpuAllowed == reqDevice.Spec.GpuDriver {
gpuIsAllowed = true
break
}
}
if !gpuIsAllowed {
writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu driver %s is not allowed", reqDevice.Spec.GpuDriver))
return
}

// ensure key is found in GPUImageSet
// we deliberately do not provide any default fallback option here,
// because each gpu type must use its own image, especially gpus that are not CUDA-compatible
if _, found := plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver]; !found {
writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu image %s not found in sandbox %s", reqDevice.Spec.GpuDriver, reqDevice.Spec.SandboxName))
return
}

if plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver] == "" {
writeHttpResponseAndLogError(response, http.StatusBadRequest, fmt.Sprintf("gpu image %s is empty in sandbox %s", reqDevice.Spec.GpuDriver, reqDevice.Spec.SandboxName))
return
}

image = plugins.Sandboxes[reqDevice.Spec.SandboxName].GPUImageSet[reqDevice.Spec.GpuDriver]
slog.Debug(fmt.Sprintf("set image to gpu image '%s' with driver name '%s'", image, reqDevice.Spec.GpuDriver))
} else {
// go with cpu image
image = plugins.Sandboxes[reqDevice.Spec.SandboxName].CPUImageName
Expand Down
20 changes: 16 additions & 4 deletions pkg/open-hydra/k8s/faker.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
"plugins": `{"sandboxes":{
"test": {
"cpuImageName": "test",
"gpuImageName": "test",
"gpuImageSet": {
"nvidia.com/gpu": "nvidia-gpu-image",
"amd.com/gpu": ""
},
"icon_name": "test1.png",
"command": ["test"],
"description": "test",
Expand Down Expand Up @@ -149,7 +152,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
},
"jupyter-lab": {
"cpuImageName": "jupyter-lab-test",
"gpuImageName": "jupyter-lab-test",
"gpuImageSet": {
"nvidia.com/gpu": "nvidia-gpu-image",
"amd.com/gpu": ""
},
"icon_name": "test2.png",
"command": ["jupyter-lab-test"],
"description": "jupyter-lab-test",
Expand Down Expand Up @@ -178,7 +184,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
},
"jupyter-lab-lot-ports": {
"cpuImageName": "jupyter-lab-test",
"gpuImageName": "jupyter-lab-test",
"gpuImageSet": {
"nvidia.com/gpu": "nvidia-gpu-image",
"amd.com/gpu": ""
},
"icon_name": "test3.png",
"command": ["jupyter-lab-test"],
"description": "jupyter-lab-test",
Expand Down Expand Up @@ -210,7 +219,10 @@ func (f *Fake) GetMap(name, namespace string, client *kubernetes.Clientset) (*co
},
"jupyter-lab-not-ports": {
"cpuImageName": "jupyter-lab-test",
"gpuImageName": "jupyter-lab-test",
"gpuImageSet": {
"nvidia.com/gpu": "nvidia-gpu-image",
"amd.com/gpu": ""
},
"icon_name": "test4.png",
"command": ["jupyter-lab-test"],
"description": "jupyter-lab-test",
Expand Down
43 changes: 32 additions & 11 deletions pkg/open-hydra/open-hydra_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,9 @@ var _ = Describe("open-hydra-server combineDeviceList test", func() {
pluginList = apis.PluginList{
Sandboxes: map[string]apis.Sandbox{
"jupyter-lab": apis.Sandbox{
GPUImageName: "test",
GPUImageSet: map[string]string{
"nvidia.com/gpu": "nvidia-gpu-image",
},
CPUImageName: "test",
Command: []string{"test"},
Description: "test",
Expand Down Expand Up @@ -607,7 +609,7 @@ var _ = Describe("open-hydra-server authorization test", func() {
var fakeDb *database.Faker
var container *restful.Container
var req *restful.Request
var device1, device2, device3, device4, device5, device6 *xDeviceV1.Device
var device1, device2, device3, device4, device5, device6, device7, device8 *xDeviceV1.Device
var setting *xSetting.Setting
var openHydraUsersURL = fmt.Sprintf("http://localhost/apis/%s/v1/%s", option.GroupVersion.Group, OpenHydraUserPath)
var openHydraDevicesURL = fmt.Sprintf("http://localhost/apis/%s/v1/%s", option.GroupVersion.Group, DevicePath)
Expand Down Expand Up @@ -663,7 +665,7 @@ var _ = Describe("open-hydra-server authorization test", func() {

return defaultHeader
}
var createDevice = func(name, sandboxName string, gpu uint8) *xDeviceV1.Device {
var createDevice = func(name, sandboxName, gpuDriver string, gpu uint8) *xDeviceV1.Device {
return &xDeviceV1.Device{
ObjectMeta: metaV1.ObjectMeta{
Name: name,
Expand All @@ -672,6 +674,7 @@ var _ = Describe("open-hydra-server authorization test", func() {
OpenHydraUsername: name,
DeviceGpu: gpu,
SandboxName: sandboxName,
GpuDriver: gpuDriver,
},
}
}
Expand Down Expand Up @@ -792,12 +795,14 @@ var _ = Describe("open-hydra-server authorization test", func() {
student = createFakeUser("student", "student", 2)
newTeacher = createFakeUser("newTeacher", "newTeacher", 1)
newStudent = createFakeUser("newStudent", "newStudent", 2)
device1 = createDevice("teacher", "jupyter-lab", 1)
device2 = createDevice("student", "jupyter-lab", 0)
device3 = createDevice("student", "jupyter-lab", 1)
device4 = createDevice("student", "", 1)
device5 = createDevice("student", "jupyter-lab-lot-ports", 0)
device6 = createDevice("student", "jupyter-lab-no-ports", 0)
device1 = createDevice("teacher", "jupyter-lab", "nvidia.com/gpu", 1)
device2 = createDevice("student", "jupyter-lab", "", 0)
device3 = createDevice("student", "jupyter-lab", "nvidia.com/gpu", 1)
device4 = createDevice("student", "", "nvidia.com/gpu", 1)
device5 = createDevice("student", "jupyter-lab-lot-ports", "", 0)
device6 = createDevice("student", "jupyter-lab-no-ports", "", 0)
device7 = createDevice("student", "jupyter-lab", "amd.com/gpu", 1)
device8 = createDevice("student", "jupyter-lab", "huawei", 1)
setting = &xSetting.Setting{
ObjectMeta: metaV1.ObjectMeta{
Name: "default",
Expand Down Expand Up @@ -981,6 +986,22 @@ var _ = Describe("open-hydra-server authorization test", func() {

})

It("open-hydra device should be rejected due to image not set in plugin config", func() {
// the requested gpu driver has no usable image in the plugin config, so the request should be denied
body3, err := json.Marshal(device7)
Expect(err).To(BeNil())
_, r2 := callApi(http.MethodPost, openHydraDevicesURL, createTokenValue(teacher, nil), bytes.NewReader(body3))
Expect(r2.Code).To(Equal(http.StatusBadRequest))
})

It("open-hydra device should be rejected due to gpu huawei is not pre-config so it't not allowed to use", func() {
// the requested gpu driver is not pre-configured as an allowed gpu resource, so the request should be denied
body3, err := json.Marshal(device8)
Expect(err).To(BeNil())
_, r2 := callApi(http.MethodPost, openHydraDevicesURL, createTokenValue(teacher, nil), bytes.NewReader(body3))
Expect(r2.Code).To(Equal(http.StatusBadRequest))
})

It("open-hydra device create should be rejected due to no sandbox name is set", func() {
body1, err := json.Marshal(device4)
Expect(err).To(BeNil())
Expand Down Expand Up @@ -1053,9 +1074,9 @@ var _ = Describe("open-hydra-server authorization test", func() {
Expect(target.Spec.PluginList.Sandboxes["test"].IconName).To(Equal("test1.png"))
Expect(target.Spec.PluginList.Sandboxes["test"].VolumeMounts[0].Name).To(Equal("jupyter-lab"))
Expect(target.Spec.PluginList.Sandboxes["test"].VolumeMounts[0].MountPath).To(Equal("/root/notebook"))
Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].GPUImageName).To(Equal("jupyter-lab-test"))
Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].GPUImageSet["nvidia.com/gpu"]).To(Equal("nvidia-gpu-image"))
Expect(target.Spec.PluginList.Sandboxes["jupyter-lab"].IconName).To(Equal("test2.png"))
Expect(target.Spec.PluginList.Sandboxes["jupyter-lab-lot-ports"].GPUImageName).To(Equal("jupyter-lab-test"))
Expect(target.Spec.PluginList.Sandboxes["jupyter-lab-lot-ports"].GPUImageSet["nvidia.com/gpu"]).To(Equal("nvidia-gpu-image"))
})

It("open-hydra update setting by teacher should be ok", func() {
Expand Down

0 comments on commit a37fa69

Please sign in to comment.