diff --git a/README.md b/README.md
index b73676c03..2d7d1a68f 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,8 @@ will see 3G device memory inside container
 [![cambricon MLU](https://img.shields.io/badge/Cambricon-Mlu-blue)](docs/cambricon-mlu-support.md)
 [![hygon DCU](https://img.shields.io/badge/Hygon-DCU-blue)](docs/hygon-dcu-support.md)
 [![iluvatar GPU](https://img.shields.io/badge/Iluvatar-GPU-blue)](docs/iluvatar-gpu-support.md)
+[![mthreads GPU](https://img.shields.io/badge/Mthreads-GPU-blue)](docs/mthreads-support.md)
+[![ascend NPU](https://img.shields.io/badge/Ascend-NPU-blue)](https://github.com/Project-HAMi/ascend-device-plugin/blob/main/README.md)
 
 ## Architect
 
diff --git a/README_cn.md b/README_cn.md
index 3653376cc..cedd8f546 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -22,6 +22,8 @@
 [![寒武纪 MLU](https://img.shields.io/badge/寒武纪-Mlu-blue)](docs/cambricon-mlu-support_cn.md)
 [![海光 DCU](https://img.shields.io/badge/海光-DCU-blue)](docs/hygon-dcu-support.md)
 [![天数智芯 GPU](https://img.shields.io/badge/天数智芯-GPU-blue)](docs/iluvatar-gpu-support_cn.md)
+[![摩尔线程 GPU](https://img.shields.io/badge/摩尔线程-GPU-blue)](docs/mthreads-support_cn.md)
+[![华为昇腾 NPU](https://img.shields.io/badge/华为昇腾-NPU-blue)](https://github.com/Project-HAMi/ascend-device-plugin/blob/main/README_cn.md)
 
 ## 简介
 
diff --git a/charts/hami/templates/scheduler/configmap.yaml b/charts/hami/templates/scheduler/configmap.yaml
index f3380d1ce..c843bb9e3 100644
--- a/charts/hami/templates/scheduler/configmap.yaml
+++ b/charts/hami/templates/scheduler/configmap.yaml
@@ -32,6 +32,14 @@ data:
           },
         {{- end }}
       {{- end }}
+      {{- if .Values.devices.mthreads.enabled }}
+      {{- range .Values.devices.mthreads.resources }}
+          {
+            "name": "{{ . }}",
+            "ignoredByScheduler": true
+          },
+      {{- end }}
+      {{- end }}
           {
             "name": "{{ .Values.resourceName }}",
             "ignoredByScheduler": true
diff --git a/charts/hami/templates/scheduler/configmapnew.yaml b/charts/hami/templates/scheduler/configmapnew.yaml
index e9badb76f..a2b326378 100644
--- a/charts/hami/templates/scheduler/configmapnew.yaml
+++ b/charts/hami/templates/scheduler/configmapnew.yaml
@@ -55,4 +55,10 @@ data:
             ignoredByScheduler: true
         {{- end }}
       {{- end }}
+      {{- if .Values.devices.mthreads.enabled }}
+      {{- range .Values.devices.mthreads.resources }}
+        - name: {{ . }}
+          ignoredByScheduler: true
+      {{- end }}
+      {{- end }}
     {{- end }}
diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml
index cebad9495..97d6e1e02 100644
--- a/charts/hami/values.yaml
+++ b/charts/hami/values.yaml
@@ -136,6 +136,10 @@ devicePlugin:
   tolerations: []
 
 devices:
+  mthreads:
+    enabled: false
+    resources:
+      - mthreads.com/vgpu
   ascend:
     enabled: false
     image: ""
diff --git a/docs/mthreads-support.md b/docs/mthreads-support.md
new file mode 100644
index 000000000..f98899390
--- /dev/null
+++ b/docs/mthreads-support.md
@@ -0,0 +1,67 @@
+## Introduction
+
+**We now support mthreads.com/vgpu by implementing most of the device-sharing features available for NVIDIA GPUs**, including:
+
+***GPU sharing***: Each task can be allocated a portion of a GPU instead of a whole card, so a single GPU can be shared by multiple tasks.
+
+***Device memory control***: A GPU can be allocated with a specific amount of device memory on supported types (e.g. MTT S4000), and HAMi ensures that the task does not exceed this limit.
+
+***Device core control***: A GPU can be allocated with a limited number of compute cores on supported types (e.g. MTT S4000), and HAMi ensures that the task does not exceed this limit.
+
+## Important Notes
+
+1. Device sharing across multiple cards is not supported; tasks requesting more than one card are allocated whole GPUs.
+
+2. Only one Mthreads device can be shared within a pod, even if the pod contains multiple containers.
+
+3. Exclusive use of an Mthreads GPU can be requested by specifying only `mthreads.com/vgpu`.
+
+4. These features have been tested on the MTT S4000.
+
+## Prerequisites
+
+* [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/)
+* driver version >= 1.2.0
+
+## Enabling GPU-sharing Support
+
+* Deploy the MT-CloudNative Toolkit on the Mthreads nodes (please consult your device provider to acquire its package and documentation).
+
+> **NOTICE:** *You can remove mt-mutating-webhook and mt-gpu-scheduler after installation (optional).*
+
+* Set `devices.mthreads.enabled=true` when installing HAMi:
+
+```
+helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set devices.mthreads.enabled=true -n kube-system
+```
+
+## Running Mthreads jobs
+
+Mthreads GPUs can now be requested by a container
+using the `mthreads.com/vgpu`, `mthreads.com/sgpu-memory` and `mthreads.com/sgpu-core` resource types:
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpushare-pod-default
+spec:
+  restartPolicy: OnFailure
+  containers:
+    - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc
+      imagePullPolicy: IfNotPresent
+      name: gpushare-pod-1
+      command: ["sleep"]
+      args: ["100000"]
+      resources:
+        limits:
+          mthreads.com/vgpu: 1
+          mthreads.com/sgpu-memory: 32
+          mthreads.com/sgpu-core: 8
+```
+
+> **NOTICE1:** *Each unit of sgpu-memory represents 512M of device memory.*
+
+> **NOTICE2:** *You can find more examples in the [examples/mthreads folder](../examples/mthreads/).*
+
\ No newline at end of file
diff --git a/docs/mthreads-support_cn.md b/docs/mthreads-support_cn.md
new file mode 100644
index 000000000..1ababa95a
--- /dev/null
+++ b/docs/mthreads-support_cn.md
@@ -0,0 +1,68 @@
+## 简介
+
+本组件支持复用摩尔线程GPU设备，并为此提供以下几种与vGPU类似的复用功能：
+
+***GPU 共享***: 每个任务可以只占用一部分显卡，多个任务可以共享一张显卡
+
+***可限制分配的显存大小***: 你现在可以用显存值（例如3000M）来分配GPU，本组件会确保任务使用的显存不会超过分配数值
+
+***可限制分配的算力核组比例***: 你现在可以用算力核组数量（例如8个）来分配GPU，本组件会确保任务使用的算力不会超过分配数值
+
+## 注意事项
+
+1. 暂时不支持多卡切片，多卡任务只能分配整卡
+
+2. 一个pod只能使用一个GPU生成的切片，即使该pod中有多个容器
+
+3. 支持独占模式，只指定`mthreads.com/vgpu`即为独占申请
+
+4. 本特性目前只支持MTT S4000设备
+
+## 节点需求
+
+* [MT CloudNative Toolkits > 1.9.0](https://docs.mthreads.com/cloud-native/cloud-native-doc-online/)
+* 驱动版本 >= 1.2.0
+
+## 开启GPU复用
+
+* 在摩尔线程节点上部署'MT-CloudNative Toolkit'，摩尔线程的GPU共享需要配合厂家提供的该工具包一起使用，请联系设备提供方获取安装包与文档
+
+> **注意:** *(可选) 部署完之后，可以卸载掉mt-mutating-webhook与mt-scheduler组件，因为这部分功能将由HAMi调度器提供*
+
+* 在安装HAMi时配置'devices.mthreads.enabled = true'参数
+
+```
+helm install hami hami-charts/hami --set scheduler.kubeScheduler.imageTag={your kubernetes version} --set devices.mthreads.enabled=true -n kube-system
+```
+
+## 运行GPU任务
+
+通过指定`mthreads.com/vgpu`、`mthreads.com/sgpu-memory`与`mthreads.com/sgpu-core`这3个参数，可以确定容器申请的切片个数，以及对应的显存和算力核组
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpushare-pod-default
+spec:
+  restartPolicy: OnFailure
+  containers:
+    - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc
+      imagePullPolicy: IfNotPresent
+      name: gpushare-pod-1
+      command: ["sleep"]
+      args: ["100000"]
+      resources:
+        limits:
+          mthreads.com/vgpu: 1
+          mthreads.com/sgpu-memory: 32
+          mthreads.com/sgpu-core: 8
+```
+
+> **注意1:** *每一单位的sgpu-memory代表512M的显存*
+
+> **注意2:** *查看更多的[用例](../examples/mthreads/)*
+
+
+
diff --git a/examples/mthreads/default_use.yaml b/examples/mthreads/default_use.yaml
new file mode 100644
index 000000000..8b2714c83
--- /dev/null
+++ b/examples/mthreads/default_use.yaml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpushare-pod-default
+spec:
+  restartPolicy: OnFailure
+  containers:
+    - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc
+      imagePullPolicy: IfNotPresent
+      name: gpushare-pod-1
+      command: ["sleep"]
+      args: ["100000"]
+      resources:
+        limits:
+          mthreads.com/vgpu: 1
+          mthreads.com/sgpu-memory: 32
+          mthreads.com/sgpu-core: 8
\ No newline at end of file
diff --git a/examples/mthreads/multi_cards.yaml b/examples/mthreads/multi_cards.yaml
new file mode 100644
index 000000000..e30eda1aa
--- /dev/null
+++ b/examples/mthreads/multi_cards.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpushare-pod-multi-cards
+spec:
+  restartPolicy: OnFailure
+  containers:
+    - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc
+      imagePullPolicy: IfNotPresent
+      name: gpushare-pod-1
+      command: ["sleep"]
+      args: ["100000"]
+      resources:
+        limits:
+          mthreads.com/vgpu: 2
\ No newline at end of file
diff --git a/examples/mthreads/use_exclusive.yaml b/examples/mthreads/use_exclusive.yaml
new file mode 100644
index 000000000..d1687fdc4
--- /dev/null
+++ b/examples/mthreads/use_exclusive.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpushare-pod-exclusive
+spec:
+  restartPolicy: OnFailure
+  containers:
+    - image: core.harbor.zlidc.mthreads.com:30003/mt-ai/lm-qy2:v17-mpc
+      imagePullPolicy: IfNotPresent
+      name: gpushare-pod-1
+      command: ["sleep"]
+      args: ["100000"]
+      resources:
+        limits:
+          mthreads.com/vgpu: 1
\ No newline at end of file
diff --git a/pkg/device/ascend/device.go b/pkg/device/ascend/device.go
index a529b956d..0aa84ba28 100644
--- a/pkg/device/ascend/device.go
+++ b/pkg/device/ascend/device.go
@@ -108,7 +108,7 @@ func (dev *Devices) CommonWord() string {
 	return dev.config.CommonWord
 }
 
-func (dev *Devices) MutateAdmission(ctr *corev1.Container) (bool, error) {
+func (dev *Devices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
 	count, ok := ctr.Resources.Limits[corev1.ResourceName(dev.config.ResourceName)]
 	if !ok {
 		return false, nil
 	}
@@ -197,7 +197,7 @@ func (dev *Devices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool
 	userUUID, ok := annos[dev.useUUIDAnno]
 	if ok {
-		klog.V(5).Infof("check uuid for Iluvatar user uuid [%s], device id is %s", userUUID, d.ID)
+		klog.V(5).Infof("check uuid for ascend user uuid [%s], device id is %s", userUUID, d.ID)
 		// use , symbol to connect multiple uuid
 		userUUIDs := strings.Split(userUUID, ",")
 		for _, uuid := range userUUIDs {
@@ -210,7 +210,7 @@ func (dev *Devices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool
 
 	noUserUUID, ok := annos[dev.noUseUUIDAnno]
 	if ok {
-		klog.V(5).Infof("check uuid for Iluvatar not user uuid [%s], device id is %s", noUserUUID, d.ID)
+		klog.V(5).Infof("check uuid for ascend not user uuid [%s], device id is %s", noUserUUID, d.ID)
 		// use , symbol to connect multiple uuid
 		noUserUUIDs := strings.Split(noUserUUID, ",")
 		for _, uuid := range noUserUUIDs {
@@ -268,3 +268,7 @@ func (dev *Devices) GenerateResourceRequests(ctr *corev1.Container) util.Contain
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+func (dev *Devices) CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool {
+	return true
+}
diff --git a/pkg/device/cambricon/device.go b/pkg/device/cambricon/device.go
index 451022efc..0db8880f1 100644
--- a/pkg/device/cambricon/device.go
+++ b/pkg/device/cambricon/device.go
@@ -200,7 +200,7 @@ func (dev *CambriconDevices) AssertNuma(annos map[string]string) bool {
 	return false
 }
 
-func (dev *CambriconDevices) MutateAdmission(ctr *corev1.Container) (bool, error) {
+func (dev *CambriconDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
 	_, ok := ctr.Resources.Limits[corev1.ResourceName(MLUResourceCount)]
 	return ok, nil
 }
@@ -308,3 +308,7 @@ func (dev *CambriconDevices) PatchAnnotations(annoinput *map[string]string, pd u
 	}
 	return *annoinput
 }
+
+func (dev *CambriconDevices) CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool {
+	return true
+}
diff --git a/pkg/device/devices.go b/pkg/device/devices.go
index 7c6c9630a..f0c62d7d5 100644
--- a/pkg/device/devices.go
+++ b/pkg/device/devices.go
@@ -27,6 +27,7 @@ import (
 	"github.com/Project-HAMi/HAMi/pkg/device/cambricon"
 	"github.com/Project-HAMi/HAMi/pkg/device/hygon"
 	"github.com/Project-HAMi/HAMi/pkg/device/iluvatar"
+	"github.com/Project-HAMi/HAMi/pkg/device/mthreads"
 	"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
 	"github.com/Project-HAMi/HAMi/pkg/util"
 	"github.com/Project-HAMi/HAMi/pkg/util/client"
@@ -39,7 +40,7 @@ import (
 
 type Devices interface {
 	CommonWord() string
-	MutateAdmission(ctr *corev1.Container) (bool, error)
+	MutateAdmission(ctr *corev1.Container, pod *corev1.Pod) (bool, error)
 	CheckHealth(devType string, n *corev1.Node) (bool, bool)
 	NodeCleanUp(nn string) error
 	GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error)
@@ -50,6 +51,7 @@ type Devices interface {
 	ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error
 	GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest
 	PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string
+	CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool
 	// This should not be associated with a specific device object
 	//ParseConfig(fs *flag.FlagSet)
 }
@@ -74,12 +76,14 @@ func InitDevices() {
 	devices[nvidia.NvidiaGPUDevice] = nvidia.InitNvidiaDevice()
 	devices[hygon.HygonDCUDevice] = hygon.InitDCUDevice()
 	devices[iluvatar.IluvatarGPUDevice] = iluvatar.InitIluvatarDevice()
+	devices[mthreads.MthreadsGPUDevice] = mthreads.InitMthreadsDevice()
 	//devices[d.AscendDevice] = d.InitDevice()
 	//devices[ascend.Ascend310PName] = ascend.InitAscend310P()
 	DevicesToHandle = append(DevicesToHandle, nvidia.NvidiaGPUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, cambricon.CambriconMLUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, hygon.HygonDCUCommonWord)
 	DevicesToHandle = append(DevicesToHandle, iluvatar.IluvatarGPUCommonWord)
+	DevicesToHandle = append(DevicesToHandle, mthreads.MthreadsGPUCommonWord)
 	//DevicesToHandle = append(DevicesToHandle, d.AscendDevice)
 	//DevicesToHandle = append(DevicesToHandle, ascend.Ascend310PName)
 	for _, dev := range ascend.InitDevices() {
@@ -138,6 +142,7 @@ func GlobalFlagSet() *flag.FlagSet {
 	hygon.ParseConfig(fs)
 	iluvatar.ParseConfig(fs)
 	nvidia.ParseConfig(fs)
+	mthreads.ParseConfig(fs)
 	fs.BoolVar(&DebugMode, "debug", false, "debug mode")
 	klog.InitFlags(fs)
 	return fs
diff --git a/pkg/device/hygon/device.go b/pkg/device/hygon/device.go
index c929070d4..7adc3742c 100644
--- a/pkg/device/hygon/device.go
+++ b/pkg/device/hygon/device.go
@@ -67,7 +67,7 @@ func ParseConfig(fs *flag.FlagSet) {
 	fs.StringVar(&HygonResourceCores, "dcu-cores", "hygon.com/dcucores", "dcu core resource")
 }
 
-func (dev *DCUDevices) MutateAdmission(ctr *corev1.Container) (bool, error) {
+func (dev *DCUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
 	_, ok := ctr.Resources.Limits[corev1.ResourceName(HygonResourceCount)]
 	return ok, nil
 }
@@ -237,3 +237,7 @@ func (dev *DCUDevices) PatchAnnotations(annoinput *map[string]string, pd util.Po
 	}
 	return *annoinput
 }
+
+func (dev *DCUDevices) CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool {
+	return true
+}
diff --git a/pkg/device/iluvatar/device.go b/pkg/device/iluvatar/device.go
index 3d6e09c6f..3845a7c89 100644
--- a/pkg/device/iluvatar/device.go
+++ b/pkg/device/iluvatar/device.go
@@ -64,7 +64,7 @@ func ParseConfig(fs *flag.FlagSet) {
 	fs.StringVar(&IluvatarResourceCores, "iluvatar-cores", "iluvatar.ai/vcuda-core", "iluvatar core resource")
 }
 
-func (dev *IluvatarDevices) MutateAdmission(ctr *corev1.Container) (bool, error) {
+func (dev *IluvatarDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
 	count, ok := ctr.Resources.Limits[corev1.ResourceName(IluvatarResourceCount)]
 	if ok {
 		if count.Value() > 1 {
@@ -217,3 +217,7 @@ func (dev *IluvatarDevices) GenerateResourceRequests(ctr *corev1.Container) util
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+func (dev *IluvatarDevices) CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool {
+	return true
+}
diff --git a/pkg/device/mthreads/device.go b/pkg/device/mthreads/device.go
new file mode 100644
index 000000000..c69f58ebc
--- /dev/null
+++ b/pkg/device/mthreads/device.go
@@ -0,0 +1,265 @@
+/*
+Copyright 2024 The HAMi Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package mthreads
+
+import (
+	"errors"
+	"flag"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/Project-HAMi/HAMi/pkg/api"
+	"github.com/Project-HAMi/HAMi/pkg/util"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/klog/v2"
+)
+
+type MthreadsDevices struct {
+}
+
+const (
+	MthreadsGPUDevice       = "Mthreads"
+	MthreadsGPUCommonWord   = "Mthreads"
+	MthreadsDeviceSelection = "mthreads.com/gpu-index"
+	// MthreadsUseUUID is the annotation a user sets to schedule onto specific Mthreads devices by UUID.
+	MthreadsUseUUID = "mthreads.ai/use-gpuuuid"
+	// MthreadsNoUseUUID is the annotation a user sets to exclude specific Mthreads devices by UUID.
+	MthreadsNoUseUUID        = "mthreads.ai/nouse-gpuuuid"
+	MthreadsAssignedGPUIndex = "mthreads.com/gpu-index"
+	MthreadsAssignedNode     = "mthreads.com/predicate-node"
+	MthreadsPredicateTime    = "mthreads.com/predicate-time"
+	coresPerMthreadsGPU      = 16
+	memoryPerMthreadsGPU     = 96
+)
+
+var (
+	MthreadsResourceCount  string
+	MthreadsResourceMemory string
+	MthreadsResourceCores  string
+	legalMemoryslices      = []int64{2, 4, 8, 16, 32, 64, 96}
+)
+
+func InitMthreadsDevice() *MthreadsDevices {
+	util.InRequestDevices[MthreadsGPUDevice] = "hami.io/mthreads-vgpu-devices-to-allocate"
+	util.SupportDevices[MthreadsGPUDevice] = "hami.io/mthreads-vgpu-devices-allocated"
+	return &MthreadsDevices{}
+}
+
+func (dev *MthreadsDevices) CommonWord() string {
+	return MthreadsGPUCommonWord
+}
+
+func ParseConfig(fs *flag.FlagSet) {
+	fs.StringVar(&MthreadsResourceCount, "mthreads-name", "mthreads.com/vgpu", "mthreads resource count")
+	fs.StringVar(&MthreadsResourceMemory, "mthreads-memory", "mthreads.com/sgpu-memory", "mthreads memory resource")
+	fs.StringVar(&MthreadsResourceCores, "mthreads-cores", "mthreads.com/sgpu-core", "mthreads core resource")
+}
+
+func (dev *MthreadsDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
+	count, ok := ctr.Resources.Limits[corev1.ResourceName(MthreadsResourceCount)]
+	if ok {
+		if count.Value() > 1 {
+			ctr.Resources.Limits[corev1.ResourceName(MthreadsResourceCores)] = *resource.NewQuantity(count.Value()*int64(coresPerMthreadsGPU), resource.DecimalSI)
+			ctr.Resources.Limits[corev1.ResourceName(MthreadsResourceMemory)] = *resource.NewQuantity(count.Value()*int64(memoryPerMthreadsGPU), resource.DecimalSI)
+			p.Annotations["mthreads.com/request-gpu-num"] = fmt.Sprint(count.Value())
+			return ok, nil
+		}
+		mem, memok := ctr.Resources.Limits[corev1.ResourceName(MthreadsResourceMemory)]
+		if !memok {
+			ctr.Resources.Limits[corev1.ResourceName(MthreadsResourceCores)] = *resource.NewQuantity(count.Value()*int64(coresPerMthreadsGPU), resource.DecimalSI)
+			ctr.Resources.Limits[corev1.ResourceName(MthreadsResourceMemory)] = *resource.NewQuantity(count.Value()*int64(memoryPerMthreadsGPU), resource.DecimalSI)
+		} else {
+			memnum, _ := mem.AsInt64()
+			found := false
+			for _, val := range legalMemoryslices {
+				if memnum == val {
+					found = true
+					break
+				}
+			}
+			if !found {
+				return true, errors.New("sGPU memory request value is invalid, valid values are [2, 4, 8, 16, 32, 64, 96]")
+			}
+		}
+	}
+	return ok, nil
+}
+
+func (dev *MthreadsDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error) {
+	nodedevices := []*api.DeviceInfo{}
+	i := 0
+	cores, _ := n.Status.Capacity.Name(corev1.ResourceName(MthreadsResourceCores), resource.DecimalSI).AsInt64()
+	memoryTotal, _ := n.Status.Capacity.Name(corev1.ResourceName(MthreadsResourceMemory), resource.DecimalSI).AsInt64()
+	for int64(i)*coresPerMthreadsGPU < cores {
+		nodedevices = append(nodedevices, &api.DeviceInfo{
+			Index:   i,
+			ID:      n.Name + "-mthreads-" + fmt.Sprint(i),
+			Count:   100,
+			Devmem:  int32(memoryTotal * 512 * coresPerMthreadsGPU / cores),
+			Devcore: coresPerMthreadsGPU,
+			Type:    MthreadsGPUDevice,
+			Numa:    0,
+			Health:  true,
+		})
+		i++
+	}
+	return nodedevices, nil
+}
+
+func (dev *MthreadsDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
+	devlist, ok := pd[MthreadsGPUDevice]
+	if ok && len(devlist) > 0 {
+		(*annoinput)[util.SupportDevices[MthreadsGPUDevice]] = util.EncodePodSingleDevice(devlist)
+		for _, dp := range devlist {
+			if len(dp) > 0 {
+				value := ""
+				for _, val := range dp {
+					value = value + fmt.Sprint(val.Idx) + ","
+				}
+				if len(value) > 0 {
+					(*annoinput)[MthreadsAssignedGPUIndex] = strings.TrimRight(value, ",")
+					//(*annoinput)[MthreadsAssignedNode]=
+					tmp := strconv.FormatInt(time.Now().UnixNano(), 10)
+					(*annoinput)[MthreadsPredicateTime] = tmp
+					(*annoinput)[MthreadsAssignedNode] = (*annoinput)[util.AssignedNodeAnnotations]
+				}
+			}
+		}
+	}
+	klog.Infoln("annoinput", (*annoinput))
+	return *annoinput
+}
+
+func (dev *MthreadsDevices) LockNode(n *corev1.Node, p *corev1.Pod) error {
+	return nil
+}
+
+func (dev *MthreadsDevices) ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error {
+	return nil
+}
+
+func (dev *MthreadsDevices) NodeCleanUp(nn string) error {
+	return nil
+}
+
+func (dev *MthreadsDevices) CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool) {
+	if strings.Compare(n.Type, MthreadsGPUDevice) == 0 {
+		return true, true, false
+	}
+	return false, false, false
+}
+
+func (dev *MthreadsDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool {
+	userUUID, ok := annos[MthreadsUseUUID]
+	if ok {
+		klog.V(5).Infof("check uuid for mthreads user uuid [%s], device id is %s", userUUID, d.ID)
+		// use , symbol to connect multiple uuid
+		userUUIDs := strings.Split(userUUID, ",")
+		for _, uuid := range userUUIDs {
+			if d.ID == uuid {
+				return true
+			}
+		}
+		return false
+	}
+
+	noUserUUID, ok := annos[MthreadsNoUseUUID]
+	if ok {
+		klog.V(5).Infof("check uuid for mthreads not-use uuid [%s], device id is %s", noUserUUID, d.ID)
+		// use , symbol to connect multiple uuid
+		noUserUUIDs := strings.Split(noUserUUID, ",")
+		for _, uuid := range noUserUUIDs {
+			if d.ID == uuid {
+				return false
+			}
+		}
+		return true
+	}
+	return true
+}
+
+func (dev *MthreadsDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
+	return true, true
+}
+
+func (dev *MthreadsDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {
+	klog.Info("Counting mthreads devices")
+	mthreadsResourceCount := corev1.ResourceName(MthreadsResourceCount)
+	mthreadsResourceMem := corev1.ResourceName(MthreadsResourceMemory)
+	mthreadsResourceCores := corev1.ResourceName(MthreadsResourceCores)
+	v, ok := ctr.Resources.Limits[mthreadsResourceCount]
+	if !ok {
+		v, ok = ctr.Resources.Requests[mthreadsResourceCount]
+	}
+	if ok {
+		if n, ok := v.AsInt64(); ok {
+			klog.Info("Found mthreads devices")
+			memnum := 0
+			mem, ok := ctr.Resources.Limits[mthreadsResourceMem]
+			if !ok {
+				mem, ok = ctr.Resources.Requests[mthreadsResourceMem]
+			}
+			if ok {
+				memnums, ok := mem.AsInt64()
+				if ok {
+					memnum = int(memnums) * 512
+				}
+			}
+			corenum := int32(0)
+			core, ok := ctr.Resources.Limits[mthreadsResourceCores]
+			if !ok {
+				core, ok = ctr.Resources.Requests[mthreadsResourceCores]
+			}
+			if ok {
+				corenums, ok := core.AsInt64()
+				if ok {
+					corenum = int32(corenums)
+				}
+			}
+
+			mempnum := 0
+			if memnum == 0 {
+				mempnum = 100
+			}
+
+			return util.ContainerDeviceRequest{
+				Nums:             int32(n),
+				Type:             MthreadsGPUDevice,
+				Memreq:           int32(memnum) / int32(n),
+				MemPercentagereq: int32(mempnum),
+				Coresreq:         corenum / int32(n),
+			}
+		}
+	}
+	return util.ContainerDeviceRequest{}
+}
+
+func (dev *MthreadsDevices) CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool {
+	for _, ctrs := range (*allocated)[device.Type] {
+		for _, ctrdev := range ctrs {
+			if strings.Compare(ctrdev.UUID, device.ID) != 0 {
+				klog.InfoS("all containers in an Mthreads pod must share the same device", "used", ctrdev.UUID, "allocating", device.ID)
+				return false
+			}
+		}
+	}
+	return true
+}
diff --git a/pkg/device/nvidia/device.go b/pkg/device/nvidia/device.go
index 0d035e6e8..903f495cc 100644
--- a/pkg/device/nvidia/device.go
+++ b/pkg/device/nvidia/device.go
@@ -136,7 +136,7 @@ func (dev *NvidiaGPUDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, e
 	return nodedevices, nil
 }
 
-func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container) (bool, error) {
+func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) {
 	/*gpu related */
 	priority, ok := ctr.Resources.Limits[corev1.ResourceName(ResourcePriority)]
 	if ok {
@@ -323,3 +323,7 @@ func (dev *NvidiaGPUDevices) GenerateResourceRequests(ctr *corev1.Container) uti
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+func (dev *NvidiaGPUDevices) CustomFilterRule(allocated *util.PodDevices, toAllocate util.ContainerDevices, device *util.DeviceUsage) bool {
+	return true
+}
diff --git a/pkg/device/nvidia/device_test.go b/pkg/device/nvidia/device_test.go
index b48fe2668..c6451c043 100644
--- a/pkg/device/nvidia/device_test.go
+++ b/pkg/device/nvidia/device_test.go
@@ -108,7 +108,7 @@ func Test_MutateAdmission(t *testing.T) {
 	gpuDevices := &NvidiaGPUDevices{}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			got, _ := gpuDevices.MutateAdmission(test.args)
+			got, _ := gpuDevices.MutateAdmission(test.args, &corev1.Pod{})
 			if test.want != got {
 				t.Fatalf("exec MutateAdmission method expect return is %+v, but got is %+v", test.want, got)
 			}
diff --git a/pkg/scheduler/score.go b/pkg/scheduler/score.go
index c0f01ebc8..7ef37b707 100644
--- a/pkg/scheduler/score.go
+++ b/pkg/scheduler/score.go
@@ -37,13 +37,12 @@ func viewStatus(usage NodeUsage) {
 
 func checkType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool) {
 	//General type check, NVIDIA->NVIDIA MLU->MLU
-	klog.Infoln("Type contains", d.Type, n.Type)
+	klog.V(3).InfoS("Type check", "device", d.Type, "req", n.Type)
 	if !strings.Contains(d.Type, n.Type) {
 		return false, false
 	}
-	for idx, val := range device.GetDevices() {
+	for _, val := range device.GetDevices() {
 		found, pass, numaAssert := val.CheckType(annos, d, n)
-		klog.Infoln("idx", idx, found, pass)
 		if found {
 			return pass, numaAssert
 		}
@@ -63,7 +62,7 @@ func checkUUID(annos map[string]string, d util.DeviceUsage, n util.ContainerDevi
 	return result
 }
 
-func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, annos map[string]string, pod *corev1.Pod) (bool, map[string]util.ContainerDevices) {
+func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, annos map[string]string, pod *corev1.Pod, allocated *util.PodDevices) (bool, map[string]util.ContainerDevices) {
 	k := request
 	originReq := k.Nums
 	prevnuma := -1
@@ -122,6 +121,9 @@ func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, an
 				klog.V(5).InfoS("can't allocate core=0 job to an already full GPU", "pod", klog.KObj(pod), "device index", i, "device", node.Devices.DeviceLists[i].Device.ID)
 				continue
 			}
+			if !device.GetDevices()[k.Type].CustomFilterRule(allocated, tmpDevs[k.Type], node.Devices.DeviceLists[i].Device) {
+				continue
+			}
 			if k.Nums > 0 {
 				klog.InfoS("first fitted", "pod", klog.KObj(pod), "device", node.Devices.DeviceLists[i].Device.ID)
 				k.Nums--
@@ -159,7 +161,7 @@ func fitInDevices(node *NodeUsage, requests util.ContainerDeviceRequests, annos
 		return false, 0
 	}
 	sort.Sort(node.Devices)
-	fit, tmpDevs := fitInCertainDevice(node, k, annos, pod)
+	fit, tmpDevs := fitInCertainDevice(node, k, annos, pod, devinput)
 	if fit {
 		for _, val := range tmpDevs[k.Type] {
 			total += node.Devices.DeviceLists[val.Idx].Device.Count
diff --git a/pkg/scheduler/webhook.go b/pkg/scheduler/webhook.go
index dd72727b9..cddc11976 100644
--- a/pkg/scheduler/webhook.go
+++ b/pkg/scheduler/webhook.go
@@ -71,7 +71,7 @@ func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Res
 		}
 	}
 	for _, val := range device.GetDevices() {
-		found, err := val.MutateAdmission(c)
+		found, err := val.MutateAdmission(c, pod)
 		if err != nil {
 			klog.Errorf("validating pod failed:%s", err.Error())
 			return admission.Errored(http.StatusInternalServerError, err)
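For reference, a minimal sketch of the scheduler-extender entry that the ConfigMap templates above are expected to render once `devices.mthreads.enabled=true` is set. Only the fragment added by this change is shown; the surrounding ConfigMap structure and the neighbouring entries depend on which other devices are enabled in `values.yaml`.

```yaml
# Hypothetical rendered fragment of the extender configuration (not taken from a live cluster).
resources:
  - name: mthreads.com/vgpu      # taken from devices.mthreads.resources in values.yaml
    ignoredByScheduler: true     # resource fit is decided by the HAMi extender, not by kube-scheduler
```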