commit 264e3c9f0c (parent a6b4910d89), branch main, authored by czd 3 months ago

@ -0,0 +1,15 @@
apiVersion: v2
appVersion: 2.5.0
description: Heterogeneous AI Computing Virtualization Middleware
keywords:
- vgpu
- gpu
kubeVersion: '>= 1.16.0'
maintainers:
- email: limengxuan@4paradigm.com
name: limengxuan
- email: xiaozhang0210@hotmail.com
name: zhangxiao
name: hami
type: application
version: 2.5.0

@ -0,0 +1,3 @@
** Please be patient while the chart is being deployed **
Resource name: {{ .Values.resourceName }}

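As a usage illustration (not part of the chart itself), the resource name echoed in the notes above corresponds to the keys defined in values.yaml (resourceName, resourceMem, resourceCores). A minimal sketch of a pod requesting a vGPU slice under the chart's default resource names follows; the memory and core numbers, and the units assumed for them, are placeholders rather than recommendations:

apiVersion: v1
kind: Pod
metadata:
  name: vgpu-demo
spec:
  containers:
    - name: cuda
      image: nvidia/cuda:12.4.0-base-ubuntu22.04
      command: ["sleep", "infinity"]
      resources:
        limits:
          nvidia.com/gpu: 1         # one vGPU slice (values: resourceName)
          nvidia.com/gpumem: 4096   # device memory, assumed MiB (values: resourceMem)
          nvidia.com/gpucores: 30   # assumed percentage of GPU cores (values: resourceCores)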
@ -0,0 +1,108 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "hami-vgpu.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "hami-vgpu.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{/*
The app name for Scheduler
*/}}
{{- define "hami-vgpu.scheduler" -}}
{{- printf "%s-scheduler" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
The app name for DevicePlugin
*/}}
{{- define "hami-vgpu.device-plugin" -}}
{{- printf "%s-device-plugin" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
The tls secret name for Scheduler
*/}}
{{- define "hami-vgpu.scheduler.tls" -}}
{{- printf "%s-scheduler-tls" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
The webhook name
*/}}
{{- define "hami-vgpu.scheduler.webhook" -}}
{{- printf "%s-webhook" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "hami-vgpu.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "hami-vgpu.labels" -}}
helm.sh/chart: {{ include "hami-vgpu.chart" . }}
{{ include "hami-vgpu.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "hami-vgpu.selectorLabels" -}}
app.kubernetes.io/name: {{ include "hami-vgpu.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Image registry secret name
*/}}
{{- define "hami-vgpu.imagePullSecrets" -}}
imagePullSecrets: {{ toYaml .Values.imagePullSecrets | nindent 2 }}
{{- end }}
{{/*
Resolve the tag for kubeScheduler.
*/}}
{{- define "resolvedKubeSchedulerTag" -}}
{{- if .Values.scheduler.kubeScheduler.imageTag }}
{{- .Values.scheduler.kubeScheduler.imageTag | trim -}}
{{- else }}
{{- include "strippedKubeVersion" . | trim -}}
{{- end }}
{{- end }}
{{/*
Remove the part after the `+` in the Kubernetes version string.
v1.31.1+k3s1 -> v1.31.1
v1.31.1 -> v1.31.1
*/}}
{{- define "strippedKubeVersion" -}}
{{- $parts := split "+" .Capabilities.KubeVersion.Version -}}
{{- print $parts._0 -}}
{{- end -}}

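A quick worked example of the two version helpers above, assuming a k3s control plane that reports v1.31.1+k3s1 and an empty scheduler.kubeScheduler.imageTag:

{{ include "strippedKubeVersion" . }}       {{/* renders as: v1.31.1 */}}
{{ include "resolvedKubeSchedulerTag" . }}  {{/* renders as: v1.31.1 */}}
image: "{{ .Values.scheduler.kubeScheduler.image }}:{{ include "resolvedKubeSchedulerTag" . }}"
{{/* renders as: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler:v1.31.1 */}}

Setting imageTag explicitly takes precedence over the version detected from the cluster.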
@ -0,0 +1,24 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
labels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.labels" . | nindent 4 }}
data:
config.json: |
{
"nodeconfig": [
{
"name": "m5-cloudinfra-online02",
"operatingmode": "hami-core",
"devicememoryscaling": 1.8,
"devicesplitcount": 10,
"migstrategy":"none",
"filterdevices": {
"uuid": [],
"index": []
}
}
]
}

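Each object under nodeconfig above is a per-node override keyed by the kubelet node name; m5-cloudinfra-online02 is only a sample entry. A hedged sketch of an additional entry that hides one physical GPU from HAMi by UUID (the node name and UUID here are invented for illustration):

{
  "name": "gpu-node-01",
  "operatingmode": "hami-core",
  "devicememoryscaling": 1.0,
  "devicesplitcount": 4,
  "migstrategy": "none",
  "filterdevices": {
    "uuid": ["GPU-00000000-0000-0000-0000-000000000000"],
    "index": []
  }
}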
@ -0,0 +1,166 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
labels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.labels" . | nindent 4 }}
{{- with .Values.global.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- if .Values.global.annotations }}
annotations: {{ toYaml .Values.global.annotations | nindent 4}}
{{- end }}
spec:
updateStrategy:
{{- with .Values.devicePlugin.updateStrategy }}
{{- toYaml . | nindent 4 }}
{{- end }}
selector:
matchLabels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
app.kubernetes.io/component: hami-device-plugin
hami.io/webhook: ignore
{{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
{{- if .Values.devicePlugin.podAnnotations }}
annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
{{- end }}
spec:
{{- if .Values.devicePlugin.runtimeClassName }}
runtimeClassName: {{ .Values.devicePlugin.runtimeClassName }}
{{- end }}
{{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
serviceAccountName: {{ include "hami-vgpu.device-plugin" . }}
priorityClassName: system-node-critical
hostPID: true
hostNetwork: true
containers:
- name: device-plugin
image: {{ .Values.devicePlugin.image }}:{{ .Values.version }}
imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }}
lifecycle:
postStart:
exec:
command: ["/bin/sh","-c", {{ printf "/k8s-vgpu/bin/vgpu-init.sh %s/vgpu/" .Values.global.gpuHookPath | quote }}]
command:
- nvidia-device-plugin
- --config-file=/device-config.yaml
- --mig-strategy={{ .Values.devicePlugin.migStrategy }}
- --disable-core-limit={{ .Values.devicePlugin.disablecorelimit }}
{{- range .Values.devicePlugin.extraArgs }}
- {{ . }}
{{- end }}
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
- name: HOOK_PATH
value: {{ .Values.global.gpuHookPath }}
{{- if typeIs "bool" .Values.devicePlugin.passDeviceSpecsEnabled }}
- name: PASS_DEVICE_SPECS
value: {{ .Values.devicePlugin.passDeviceSpecsEnabled | quote }}
{{- end }}
securityContext:
privileged: true
allowPrivilegeEscalation: true
capabilities:
drop: ["ALL"]
add: ["SYS_ADMIN"]
resources:
{{- toYaml .Values.devicePlugin.resources | nindent 12 }}
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: lib
mountPath: {{ printf "%s%s" .Values.global.gpuHookPath "/vgpu" }}
- name: usrbin
mountPath: /usrbin
- name: deviceconfig
mountPath: /config
- name: hosttmp
mountPath: /tmp
- name: device-config
mountPath: /device-config.yaml
subPath: device-config.yaml
- name: vgpu-monitor
image: {{ .Values.devicePlugin.image }}:{{ .Values.version }}
imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }}
command: ["vGPUmonitor"]
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
add: ["SYS_ADMIN"]
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
- name: HOOK_PATH
value: {{ .Values.global.gpuHookPath }}/vgpu
resources:
{{- toYaml .Values.devicePlugin.vgpuMonitor.resources | nindent 12 }}
volumeMounts:
- name: ctrs
mountPath: {{ .Values.devicePlugin.monitorctrPath }}
- name: dockers
mountPath: /run/docker
- name: containerds
mountPath: /run/containerd
- name: sysinfo
mountPath: /sysinfo
- name: hostvar
mountPath: /hostvar
- name: hosttmp
mountPath: /tmp
volumes:
- name: ctrs
hostPath:
path: {{ .Values.devicePlugin.monitorctrPath }}
- name: hosttmp
hostPath:
path: /tmp
- name: dockers
hostPath:
path: /run/docker
- name: containerds
hostPath:
path: /run/containerd
- name: device-plugin
hostPath:
path: {{ .Values.devicePlugin.pluginPath }}
- name: lib
hostPath:
path: {{ .Values.devicePlugin.libPath }}
- name: usrbin
hostPath:
path: /usr/bin
- name: sysinfo
hostPath:
path: /sys
- name: hostvar
hostPath:
path: /var
- name: deviceconfig
configMap:
name: {{ template "hami-vgpu.device-plugin" . }}
- name: device-config
configMap:
name: {{ include "hami-vgpu.scheduler" . }}-device
{{- if .Values.devicePlugin.nvidianodeSelector }}
nodeSelector: {{ toYaml .Values.devicePlugin.nvidianodeSelector | nindent 8 }}
{{- end }}
{{- if .Values.devicePlugin.tolerations }}
tolerations: {{ toYaml .Values.devicePlugin.tolerations | nindent 8 }}
{{- end }}

@ -0,0 +1,27 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}-monitor
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- create
- watch
- list
- update
- patch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- update
- list
- patch

@ -0,0 +1,16 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
labels:
app.kubernetes.io/component: "hami-device-plugin"
{{- include "hami-vgpu.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
#name: cluster-admin
name: {{ include "hami-vgpu.device-plugin" . }}-monitor
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.device-plugin" . }}
namespace: {{ .Release.Namespace | quote }}

@ -0,0 +1,23 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}-monitor
labels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.labels" . | nindent 4 }}
{{- if .Values.scheduler.service.labels }}
{{ toYaml .Values.scheduler.service.labels | indent 4 }}
{{- end }}
{{- if .Values.scheduler.service.annotations }}
annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }}
{{- end }}
spec:
externalTrafficPolicy: Local
selector:
app.kubernetes.io/component: hami-device-plugin
type: NodePort
ports:
- name: monitorport
port: {{ .Values.devicePlugin.service.httpPort }}
targetPort: 9394
nodePort: {{ .Values.devicePlugin.service.httpPort }}

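This monitor Service exposes the vGPUmonitor sidecar's fixed container port 9394 on every node through a NodePort taken from devicePlugin.service.httpPort (31992 by default). Assuming the monitor serves Prometheus-style metrics on that port at /metrics (an assumption worth verifying against your deployed version), they can be pulled from any node:

curl http://<node-ip>:31992/metrics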
@ -0,0 +1,8 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
namespace: {{ .Release.Namespace | quote }}
labels:
app.kubernetes.io/component: "hami-device-plugin"
{{- include "hami-vgpu.labels" . | nindent 4 }}

@ -0,0 +1,88 @@
{{- if .Values.scheduler.kubeScheduler.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
data:
config.json: |
{
"kind": "Policy",
"apiVersion": "v1",
"extenders": [
{
"urlPrefix": "https://127.0.0.1:443",
"filterVerb": "filter",
"bindVerb": "bind",
"enableHttps": true,
"weight": 1,
"nodeCacheCapable": true,
"httpTimeout": 30000000000,
"tlsConfig": {
"insecure": true
},
"managedResources": [
{{- if .Values.devices.ascend.enabled }}
{{- range .Values.devices.ascend.customresources }}
{
"name": "{{ . }}",
"ignoredByScheduler": true
},
{{- end }}
{{- end }}
{{- if .Values.devices.mthreads.enabled }}
{{- range .Values.devices.mthreads.customresources }}
{
"name": "{{ . }}",
"ignoredByScheduler": true
},
{{- end }}
{{- end }}
{
"name": "{{ .Values.resourceName }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.resourceMem }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.resourceCores }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.resourceMemPercentage }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.resourcePriority }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.mluResourceName }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.dcuResourceName }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.dcuResourceMem }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.dcuResourceCores }}",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.iluvatarResourceName }}",
"ignoredByScheduler": true
}
],
"ignoreable": false
}
]
}
{{- end }}

@ -0,0 +1,64 @@
{{- if .Values.scheduler.kubeScheduler.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.scheduler" . }}-newversion
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
data:
config.yaml: |
{{- if gt (.Capabilities.KubeVersion.Minor | int) 25}}
apiVersion: kubescheduler.config.k8s.io/v1
{{- else }}
apiVersion: kubescheduler.config.k8s.io/v1beta2
{{- end }}
kind: KubeSchedulerConfiguration
leaderElection:
leaderElect: false
profiles:
- schedulerName: {{ .Values.schedulerName }}
extenders:
- urlPrefix: "https://127.0.0.1:443"
filterVerb: filter
bindVerb: bind
nodeCacheCapable: true
weight: 1
httpTimeout: 30s
enableHTTPS: true
tlsConfig:
insecure: true
managedResources:
- name: {{ .Values.resourceName }}
ignoredByScheduler: true
- name: {{ .Values.resourceMem }}
ignoredByScheduler: true
- name: {{ .Values.resourceCores }}
ignoredByScheduler: true
- name: {{ .Values.resourceMemPercentage }}
ignoredByScheduler: true
- name: {{ .Values.resourcePriority }}
ignoredByScheduler: true
- name: {{ .Values.mluResourceName }}
ignoredByScheduler: true
- name: {{ .Values.dcuResourceName }}
ignoredByScheduler: true
- name: {{ .Values.dcuResourceMem }}
ignoredByScheduler: true
- name: {{ .Values.dcuResourceCores }}
ignoredByScheduler: true
- name: {{ .Values.iluvatarResourceName }}
ignoredByScheduler: true
{{- if .Values.devices.ascend.enabled }}
{{- range .Values.devices.ascend.customresources }}
- name: {{ . }}
ignoredByScheduler: true
{{- end }}
{{- end }}
{{- if .Values.devices.mthreads.enabled }}
{{- range .Values.devices.mthreads.customresources }}
- name: {{ . }}
ignoredByScheduler: true
{{- end }}
{{- end }}
{{- end }}

@ -0,0 +1,152 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
{{- with .Values.global.labels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- if .Values.global.annotations }}
annotations: {{ toYaml .Values.global.annotations | nindent 4}}
{{- end }}
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
hami.io/webhook: ignore
{{- if .Values.scheduler.podAnnotations }}
annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }}
{{- end }}
spec:
{{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
serviceAccountName: {{ include "hami-vgpu.scheduler" . }}
priorityClassName: system-node-critical
containers:
{{- if .Values.scheduler.kubeScheduler.enabled }}
- name: kube-scheduler
image: "{{ .Values.scheduler.kubeScheduler.image }}:{{ include "resolvedKubeSchedulerTag" . }}"
imagePullPolicy: {{ .Values.scheduler.kubeScheduler.imagePullPolicy | quote }}
command:
- kube-scheduler
{{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
{{- range .Values.scheduler.kubeScheduler.extraNewArgs }}
- {{ . }}
{{- end }}
{{- else }}
- --scheduler-name={{ .Values.schedulerName }}
{{- range .Values.scheduler.kubeScheduler.extraArgs }}
- {{ . }}
{{- end }}
{{- end }}
- --leader-elect={{ .Values.scheduler.leaderElect }}
- --leader-elect-resource-name={{ .Values.schedulerName }}
- --leader-elect-resource-namespace={{ .Release.Namespace }}
resources:
{{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }}
volumeMounts:
- name: scheduler-config
mountPath: /config
{{- end }}
{{- if .Values.scheduler.livenessProbe }}
livenessProbe:
failureThreshold: 8
httpGet:
path: /healthz
port: 10259
scheme: HTTPS
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 15
{{- end }}
- name: vgpu-scheduler-extender
image: {{ .Values.scheduler.extender.image }}:{{ .Values.version }}
imagePullPolicy: {{ .Values.scheduler.extender.imagePullPolicy | quote }}
env:
{{- if .Values.global.managedNodeSelectorEnable }}
{{- range $key, $value := .Values.global.managedNodeSelector }}
- name: NODE_SELECTOR_{{ $key | upper | replace "-" "_" }}
value: "{{ $value }}"
{{- end }}
{{- end }}
command:
- scheduler
- --http_bind=0.0.0.0:443
- --cert_file=/tls/tls.crt
- --key_file=/tls/tls.key
- --scheduler-name={{ .Values.schedulerName }}
- --metrics-bind-address={{ .Values.scheduler.metricsBindAddress }}
- --node-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy }}
- --gpu-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy }}
- --device-config-file=/device-config.yaml
{{- if .Values.devices.ascend.enabled }}
- --enable-ascend=true
{{- end }}
{{- if .Values.scheduler.nodeLabelSelector }}
- --node-label-selector={{- $first := true -}}
{{- range $key, $value := .Values.scheduler.nodeLabelSelector -}}
{{- if not $first }},{{ end -}}
{{- $key }}={{ $value -}}
{{- $first = false -}}
{{- end -}}
{{- end }}
{{- range .Values.scheduler.extender.extraArgs }}
- {{ . }}
{{- end }}
ports:
- name: http
containerPort: 443
protocol: TCP
resources:
{{- toYaml .Values.scheduler.extender.resources | nindent 12 }}
volumeMounts:
- name: tls-config
mountPath: /tls
- name: device-config
mountPath: /device-config.yaml
subPath: device-config.yaml
{{- if .Values.scheduler.livenessProbe }}
livenessProbe:
httpGet:
path: /healthz
port: 443
scheme: HTTPS
initialDelaySeconds: 10
periodSeconds: 10
failureThreshold: 3
timeoutSeconds: 5
{{- end }}
volumes:
- name: tls-config
secret:
secretName: {{ template "hami-vgpu.scheduler.tls" . }}
{{- if .Values.scheduler.kubeScheduler.enabled }}
- name: scheduler-config
configMap:
{{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
name: {{ template "hami-vgpu.scheduler" . }}-newversion
{{- else }}
name: {{ template "hami-vgpu.scheduler" . }}
{{- end }}
{{- end }}
- name: device-config
configMap:
name: {{ include "hami-vgpu.scheduler" . }}-device
{{- if .Values.scheduler.nodeSelector }}
nodeSelector: {{ toYaml .Values.scheduler.nodeSelector | nindent 8 }}
{{- end }}
{{- if .Values.scheduler.tolerations }}
tolerations: {{ toYaml .Values.scheduler.tolerations | nindent 8 }}
{{- end }}
{{- if .Values.scheduler.nodeName }}
nodeName: {{ .Values.scheduler.nodeName }}
{{- end }}

@ -0,0 +1,177 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.scheduler" . }}-device
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
data:
device-config.yaml: |-
{{- if .Files.Glob "files/device-config.yaml" }}
{{- .Files.Get "files/device-config.yaml" | nindent 4}}
{{- else }}
nvidia:
resourceCountName: {{ .Values.resourceName }}
resourceMemoryName: {{ .Values.resourceMem }}
resourceMemoryPercentageName: {{ .Values.resourceMemPercentage }}
resourceCoreName: {{ .Values.resourceCores }}
resourcePriorityName: {{ .Values.resourcePriority }}
overwriteEnv: false
defaultMemory: 0
defaultCores: 0
defaultGPUNum: 1
deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }}
deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }}
deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }}
knownMigGeometries:
- models: [ "A30" ]
allowedGeometries:
-
- name: 1g.6gb
memory: 6144
count: 4
-
- name: 2g.12gb
memory: 12288
count: 2
-
- name: 4g.24gb
memory: 24576
count: 1
- models: [ "A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB", "A100-SXM4-40GB" ]
allowedGeometries:
-
- name: 1g.5gb
memory: 5120
count: 7
-
- name: 2g.10gb
memory: 10240
count: 3
- name: 1g.5gb
memory: 5120
count: 1
-
- name: 3g.20gb
memory: 20480
count: 2
-
- name: 7g.40gb
memory: 40960
count: 1
- models: [ "A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"]
allowedGeometries:
-
- name: 1g.10gb
memory: 10240
count: 7
-
- name: 2g.20gb
memory: 20480
count: 3
- name: 1g.10gb
memory: 10240
count: 1
-
- name: 3g.40gb
memory: 40960
count: 2
-
- name: 7g.79gb
memory: 80896
count: 1
cambricon:
resourceCountName: {{ .Values.mluResourceName }}
resourceMemoryName: {{ .Values.mluResourceMem }}
resourceCoreName: {{ .Values.mluResourceCores }}
hygon:
resourceCountName: {{ .Values.dcuResourceName }}
resourceMemoryName: {{ .Values.dcuResourceMem }}
resourceCoreName: {{ .Values.dcuResourceCores }}
metax:
resourceCountName: "metax-tech.com/gpu"
mthreads:
resourceCountName: "mthreads.com/vgpu"
resourceMemoryName: "mthreads.com/sgpu-memory"
resourceCoreName: "mthreads.com/sgpu-core"
iluvatar:
resourceCountName: {{ .Values.iluvatarResourceName }}
resourceMemoryName: {{ .Values.iluvatarResourceMem }}
resourceCoreName: {{ .Values.iluvatarResourceCore }}
vnpus:
- chipName: 910B
commonWord: Ascend910A
resourceName: huawei.com/Ascend910A
resourceMemoryName: huawei.com/Ascend910A-memory
memoryAllocatable: 32768
memoryCapacity: 32768
aiCore: 30
templates:
- name: vir02
memory: 2184
aiCore: 2
- name: vir04
memory: 4369
aiCore: 4
- name: vir08
memory: 8738
aiCore: 8
- name: vir16
memory: 17476
aiCore: 16
- chipName: 910B3
commonWord: Ascend910B
resourceName: huawei.com/Ascend910B
resourceMemoryName: huawei.com/Ascend910B-memory
memoryAllocatable: 65536
memoryCapacity: 65536
aiCore: 20
aiCPU: 7
templates:
- name: vir05_1c_16g
memory: 16384
aiCore: 5
aiCPU: 1
- name: vir10_3c_32g
memory: 32768
aiCore: 10
aiCPU: 3
- chipName: 910B4
commonWord: Ascend910B4
resourceName: huawei.com/Ascend910B4
resourceMemoryName: huawei.com/Ascend910B4-memory
memoryAllocatable: 32768
memoryCapacity: 32768
aiCore: 20
aiCPU: 7
templates:
- name: vir05_1c_8g
memory: 8192
aiCore: 5
aiCPU: 1
- name: vir10_3c_16g
memory: 16384
aiCore: 10
aiCPU: 3
- chipName: 310P3
commonWord: Ascend310P
resourceName: huawei.com/Ascend310P
resourceMemoryName: huawei.com/Ascend310P-memory
memoryAllocatable: 21527
memoryCapacity: 24576
aiCore: 8
aiCPU: 7
templates:
- name: vir01
memory: 3072
aiCore: 1
aiCPU: 1
- name: vir02
memory: 6144
aiCore: 2
aiCPU: 2
- name: vir04
memory: 12288
aiCore: 4
aiCPU: 4
{{ end }}

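Note the .Files.Glob branch at the top of this template: if a file named files/device-config.yaml exists inside the chart directory at install time, its contents are embedded verbatim and the generated defaults above are skipped entirely. In practice you would usually copy the full rendered default and edit it, but as a minimal sketch, an override dropped at <chart>/files/device-config.yaml might start like this (the keys mirror the defaults above; the numbers are illustrative):

nvidia:
  resourceCountName: nvidia.com/gpu
  resourceMemoryName: nvidia.com/gpumem
  resourceCoreName: nvidia.com/gpucores
  deviceSplitCount: 10
  deviceMemoryScaling: 1.5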
@ -0,0 +1,26 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
rules:
- apiGroups:
- admissionregistration.k8s.io
resources:
#- validatingwebhookconfigurations
- mutatingwebhookconfigurations
verbs:
- get
- update
{{- if .Values.podSecurityPolicy.enabled }}
- apiGroups: ['extensions']
resources: ['podsecuritypolicies']
verbs: ['use']
resourceNames:
- {{ include "hami-vgpu.fullname" . }}-admission
{{- end }}

@ -0,0 +1,18 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "hami-vgpu.fullname" . }}-admission
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ .Release.Namespace | quote }}

@ -0,0 +1,60 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission-create
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
spec:
{{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }}
# Alpha feature since k8s 1.12
ttlSecondsAfterFinished: 0
{{- end }}
template:
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission-create
{{- if .Values.scheduler.patch.podAnnotations }}
annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }}
{{- end }}
labels:
{{- include "hami-vgpu.labels" . | nindent 8 }}
app.kubernetes.io/component: admission-webhook
hami.io/webhook: ignore
spec:
{{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
{{- if .Values.scheduler.patch.priorityClassName }}
priorityClassName: {{ .Values.scheduler.patch.priorityClassName }}
{{- end }}
containers:
- name: create
{{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
image: {{ .Values.scheduler.patch.imageNew }}
{{- else }}
image: {{ .Values.scheduler.patch.image }}
{{- end }}
imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }}
args:
- create
- --cert-name=tls.crt
- --key-name=tls.key
{{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
- --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}}
{{- else }}
- --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }}
{{- end }}
- --namespace={{ .Release.Namespace }}
- --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
restartPolicy: OnFailure
serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
{{- if .Values.scheduler.patch.nodeSelector }}
nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }}
{{- end }}
{{- if .Values.scheduler.patch.tolerations }}
tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }}
{{- end }}
securityContext:
runAsNonRoot: true
runAsUser: {{ .Values.scheduler.patch.runAsUser }}

@ -0,0 +1,55 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission-patch
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
spec:
{{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }}
# Alpha feature since k8s 1.12
ttlSecondsAfterFinished: 0
{{- end }}
template:
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission-patch
{{- if .Values.scheduler.patch.podAnnotations }}
annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }}
{{- end }}
labels:
{{- include "hami-vgpu.labels" . | nindent 8 }}
app.kubernetes.io/component: admission-webhook
hami.io/webhook: ignore
spec:
{{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
{{- if .Values.scheduler.patch.priorityClassName }}
priorityClassName: {{ .Values.scheduler.patch.priorityClassName }}
{{- end }}
containers:
- name: patch
{{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
image: {{ .Values.scheduler.patch.imageNew }}
{{- else }}
image: {{ .Values.scheduler.patch.image }}
{{- end }}
imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }}
args:
- patch
- --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }}
- --namespace={{ .Release.Namespace }}
- --patch-validating=false
- --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
restartPolicy: OnFailure
serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
{{- if .Values.scheduler.patch.nodeSelector }}
nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }}
{{- end }}
{{- if .Values.scheduler.patch.tolerations }}
tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }}
{{- end }}
securityContext:
runAsNonRoot: true
runAsUser: {{ .Values.scheduler.patch.runAsUser }}

@ -0,0 +1,36 @@
{{- if .Values.podSecurityPolicy.enabled }}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
spec:
allowPrivilegeEscalation: false
fsGroup:
ranges:
- max: 65535
min: 1
rule: MustRunAs
requiredDropCapabilities:
- ALL
runAsUser:
rule: MustRunAsNonRoot
seLinux:
rule: RunAsAny
supplementalGroups:
ranges:
- max: 65535
min: 1
rule: MustRunAs
volumes:
- configMap
- emptyDir
- projected
- secret
- downwardAPI
{{- end }}

@ -0,0 +1,18 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
rules:
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- create

@ -0,0 +1,18 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "hami-vgpu.fullname" . }}-admission
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ .Release.Namespace | quote }}

@ -0,0 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook

@ -0,0 +1,15 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
labels:
app.kubernetes.io/component: "hami-scheduler"
{{- include "hami-vgpu.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-admin
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ .Release.Namespace | quote }}

@ -0,0 +1,30 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
{{- if .Values.scheduler.service.labels }}
{{ toYaml .Values.scheduler.service.labels | indent 4 }}
{{- end }}
{{- if .Values.scheduler.service.annotations }}
annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }}
{{- end }}
spec:
type: NodePort
ports:
- name: http
port: {{ .Values.scheduler.service.httpPort }}
targetPort: 443
nodePort: {{ .Values.scheduler.service.schedulerPort }}
protocol: TCP
- name: monitor
port: {{ .Values.scheduler.service.monitorPort }}
targetPort: {{ (split ":" (printf "%s" .Values.scheduler.metricsBindAddress))._1 }}
nodePort: {{ .Values.scheduler.service.monitorPort }}
protocol: TCP
selector:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.selectorLabels" . | nindent 4 }}

@ -0,0 +1,8 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ .Release.Namespace | quote }}
labels:
app.kubernetes.io/component: "hami-scheduler"
{{- include "hami-vgpu.labels" . | nindent 4 }}

@ -0,0 +1,51 @@
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
name: {{ include "hami-vgpu.scheduler.webhook" . }}
webhooks:
- admissionReviewVersions:
- v1beta1
clientConfig:
{{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
url: https://{{ .Values.scheduler.admissionWebhook.customURL.host}}:{{.Values.scheduler.admissionWebhook.customURL.port}}{{.Values.scheduler.admissionWebhook.customURL.path}}
{{- else }}
service:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ .Release.Namespace }}
path: /webhook
port: {{ .Values.scheduler.service.httpPort }}
{{- end }}
failurePolicy: {{ .Values.scheduler.admissionWebhook.failurePolicy }}
matchPolicy: Equivalent
name: vgpu.hami.io
namespaceSelector:
matchExpressions:
- key: hami.io/webhook
operator: NotIn
values:
- ignore
{{- if .Values.scheduler.admissionWebhook.whitelistNamespaces }}
- key: kubernetes.io/metadata.name
operator: NotIn
values:
{{- toYaml .Values.scheduler.admissionWebhook.whitelistNamespaces | nindent 10 }}
{{- end }}
objectSelector:
matchExpressions:
- key: hami.io/webhook
operator: NotIn
values:
- ignore
reinvocationPolicy: {{ .Values.scheduler.admissionWebhook.reinvocationPolicy }}
rules:
- apiGroups:
- ""
apiVersions:
- v1
operations:
- CREATE
resources:
- pods
scope: '*'
sideEffects: None
timeoutSeconds: 10

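Both the namespaceSelector and the objectSelector above exclude anything labelled hami.io/webhook=ignore, which is also why the chart's own pods carry that label. The same label can be applied by hand to exempt a whole namespace or an individual pod from mutation, for example:

kubectl label namespace kube-system hami.io/webhook=ignore
kubectl label pod my-workload hami.io/webhook=ignore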
@ -0,0 +1,204 @@
# Default values for hami-vgpu.
nameOverride: ""
fullnameOverride: ""
imagePullSecrets: [ ]
version: "v2.5.0"
#Nvidia GPU Parameters
resourceName: "nvidia.com/gpu"
resourceMem: "nvidia.com/gpumem"
resourceMemPercentage: "nvidia.com/gpumem-percentage"
resourceCores: "nvidia.com/gpucores"
resourcePriority: "nvidia.com/priority"
#MLU Parameters
mluResourceName: "cambricon.com/vmlu"
mluResourceMem: "cambricon.com/mlu.smlu.vmemory"
mluResourceCores: "cambricon.com/mlu.smlu.vcore"
#Hygon DCU Parameters
dcuResourceName: "hygon.com/dcunum"
dcuResourceMem: "hygon.com/dcumem"
dcuResourceCores: "hygon.com/dcucores"
#Iluvatar GPU Parameters
iluvatarResourceName: "iluvatar.ai/vgpu"
iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
iluvatarResourceCore: "iluvatar.ai/vcuda-core"
schedulerName: "hami-scheduler"
podSecurityPolicy:
enabled: false
global:
gpuHookPath: /usr/local
labels: {}
annotations: {}
managedNodeSelectorEnable: false
managedNodeSelector:
usage: "gpu"
scheduler:
# @param nodeName defines the node that the vgpu scheduler pod will be placed on.
# If this scheduler is installed as the default scheduler, the built-in kube-scheduler pod must be
# removed from the cluster first; in that case a node name must be specified here so the pod can
# skip the normal scheduling workflow.
nodeName: ""
#nodeLabelSelector:
# "gpu": "on"
overwriteEnv: "false"
defaultSchedulerPolicy:
nodeSchedulerPolicy: binpack
gpuSchedulerPolicy: spread
metricsBindAddress: ":9395"
livenessProbe: false
leaderElect: true
kubeScheduler:
# @param enabled indicates whether to run the kube-scheduler container in the scheduler pod; it is true by default.
enabled: true
image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler
imageTag: ""
imagePullPolicy: IfNotPresent
resources: {}
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
# and remove the curly braces after 'resources:'.
# limits:
# cpu: 1000m
# memory: 1000Mi
# requests:
# cpu: 100m
# memory: 100Mi
extraNewArgs:
- --config=/config/config.yaml
- -v=4
extraArgs:
- --policy-config-file=/config/config.json
- -v=4
extender:
image: "projecthami/hami"
imagePullPolicy: IfNotPresent
resources: {}
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
# and remove the curly braces after 'resources:'.
# limits:
# cpu: 1000m
# memory: 1000Mi
# requests:
# cpu: 100m
# memory: 100Mi
extraArgs:
- --debug
- -v=4
podAnnotations: {}
tolerations: []
#serviceAccountName: "hami-vgpu-scheduler-sa"
admissionWebhook:
customURL:
enabled: false
# Must be an endpoint reachable over HTTPS.
# Host certificates should be generated for this host.
host: 127.0.0.1 # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
port: 31998
path: /webhook
whitelistNamespaces:
# Specify the namespaces that the webhook will not be applied to.
# - default
# - kube-system
# - istio-system
reinvocationPolicy: Never
failurePolicy: Ignore
patch:
image: docker.io/jettech/kube-webhook-certgen:v1.5.2
imageNew: liangjw/kube-webhook-certgen:v1.1.1
imagePullPolicy: IfNotPresent
priorityClassName: ""
podAnnotations: {}
nodeSelector: {}
tolerations: []
runAsUser: 2000
service:
httpPort: 443
schedulerPort: 31998
monitorPort: 31993
labels: {}
annotations: {}
devicePlugin:
image: "projecthami/hami"
monitorimage: "projecthami/hami"
monitorctrPath: /usr/local/vgpu/containers
imagePullPolicy: IfNotPresent
deviceSplitCount: 40
deviceMemoryScaling: 1
deviceCoreScaling: 1
runtimeClassName: ""
migStrategy: "none"
disablecorelimit: "false"
passDeviceSpecsEnabled: true
extraArgs:
- -v=4
service:
httpPort: 31992
pluginPath: /var/lib/kubelet/device-plugins
libPath: /usr/local/vgpu
podAnnotations: {}
nvidianodeSelector:
gpu: "on"
tolerations: []
# The updateStrategy for the DevicePlugin DaemonSet.
# If you want to update the DaemonSet manually, set type to "OnDelete".
# We recommend the OnDelete update strategy, because restarting a DevicePlugin pod also restarts the business pods on that node, which is destructive.
# Otherwise, you can use the RollingUpdate strategy to roll out DevicePlugin pods automatically (see the commented example after this block).
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
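  # A hedged example: to switch to the manual rollout described above, replace the block with
  # the following (pods are then recreated only when you delete them yourself):
  # updateStrategy:
  #   type: OnDelete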
resources: {}
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
# and remove the curly braces after 'resources:'.
# limits:
# cpu: 1000m
# memory: 1000Mi
# requests:
# cpu: 100m
# memory: 100Mi
vgpuMonitor:
resources: {}
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
# and remove the curly braces after 'resources:'.
# limits:
# cpu: 1000m
# memory: 1000Mi
# requests:
# cpu: 100m
# memory: 100Mi
devices:
mthreads:
enabled: false
customresources:
- mthreads.com/vgpu
ascend:
enabled: false
image: ""
imagePullPolicy: IfNotPresent
extraArgs: []
nodeSelector:
ascend: "on"
tolerations: []
customresources:
- huawei.com/Ascend910A
- huawei.com/Ascend910A-memory
- huawei.com/Ascend910B
- huawei.com/Ascend910B-memory
- huawei.com/Ascend910B4
- huawei.com/Ascend910B4-memory
- huawei.com/Ascend310P
- huawei.com/Ascend310P-memory

@ -0,0 +1 @@
helm upgrade hami ./hami/ --namespace kube-system
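The same command performs a first-time install when combined with --install, and any key from values.yaml can be overridden inline. A hedged example with illustrative (not recommended) values:

helm upgrade --install hami ./hami/ \
  --namespace kube-system \
  --set devicePlugin.deviceSplitCount=10 \
  --set scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy=binpack \
  --set scheduler.kubeScheduler.imageTag=v1.28.9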