parent
a6b4910d89
commit
264e3c9f0c
@ -0,0 +1,15 @@
|
||||
# Helm chart metadata for HAMi.
apiVersion: v2
name: hami
description: Heterogeneous AI Computing Virtualization Middleware
type: application
# Chart version and the version of the application it packages.
version: "2.5.0"
appVersion: "2.5.0"
# Range of Kubernetes versions this chart declares support for.
kubeVersion: ">= 1.16.0"
keywords:
  - vgpu
  - gpu
maintainers:
  - name: limengxuan
    email: limengxuan@4paradigm.com
  - name: zhangxiao
    email: xiaozhang0210@hotmail.com
|
@ -0,0 +1,3 @@
|
||||
** Please be patient while the chart is being deployed **
|
||||
Resource name: {{ .Values.resourceName }}
|
||||
|
@ -0,0 +1,108 @@
|
||||
{{/*
Chart name: .Values.nameOverride when set, otherwise .Chart.Name.
The result is cut to 63 characters and a trailing "-" is removed.
*/}}
{{- define "hami-vgpu.name" -}}
{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{/*
Fully qualified app name.
Precedence: .Values.fullnameOverride, then the release name alone when it
already contains the chart name, otherwise "<release>-<chart name>".
The result is cut to 63 characters (some Kubernetes name fields are limited to
this by the DNS naming spec) and a trailing "-" is removed.
*/}}
{{- define "hami-vgpu.fullname" -}}
{{- $full := .Values.fullnameOverride -}}
{{- if not $full -}}
{{- $chart := .Values.nameOverride | default .Chart.Name -}}
{{- $full = .Release.Name -}}
{{- if not (contains $chart .Release.Name) -}}
{{- $full = printf "%s-%s" .Release.Name $chart -}}
{{- end -}}
{{- end -}}
{{- $full | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{/*
Name of the scheduler component: "<fullname>-scheduler",
cut to 63 characters with a trailing "-" removed.
*/}}
{{- define "hami-vgpu.scheduler" -}}
{{- $base := include "hami-vgpu.fullname" . -}}
{{- printf "%s-scheduler" $base | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{/*
Name of the device plugin component: "<fullname>-device-plugin",
cut to 63 characters with a trailing "-" removed.
*/}}
{{- define "hami-vgpu.device-plugin" -}}
{{- $base := include "hami-vgpu.fullname" . -}}
{{- printf "%s-device-plugin" $base | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{/*
Name of the scheduler TLS secret: "<fullname>-scheduler-tls",
cut to 63 characters with a trailing "-" removed.
*/}}
{{- define "hami-vgpu.scheduler.tls" -}}
{{- $base := include "hami-vgpu.fullname" . -}}
{{- printf "%s-scheduler-tls" $base | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{/*
Name of the webhook: "<fullname>-webhook",
cut to 63 characters with a trailing "-" removed.
*/}}
{{- define "hami-vgpu.scheduler.webhook" -}}
{{- $base := include "hami-vgpu.fullname" . -}}
{{- printf "%s-webhook" $base | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{/*
Value for the helm.sh/chart label: "<chart name>-<chart version>".
"+" is replaced by "_", then the result is cut to 63 characters and a
trailing "-" is removed.
*/}}
{{- define "hami-vgpu.chart" -}}
{{- $id := printf "%s-%s" .Chart.Name .Chart.Version -}}
{{- $id | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
|
||||
|
||||
{{/*
Common labels: chart label, the selector labels, the app version
(only when .Chart.AppVersion is set) and the managing service.
*/}}
{{- define "hami-vgpu.labels" -}}
helm.sh/chart: {{ include "hami-vgpu.chart" . }}
{{ include "hami-vgpu.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
|
||||
|
||||
{{/*
Selector labels: the chart name and the release name.
*/}}
{{- define "hami-vgpu.selectorLabels" -}}
{{- $app := include "hami-vgpu.name" . -}}
app.kubernetes.io/name: {{ $app }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
|
||||
|
||||
{{/*
Renders an "imagePullSecrets:" key whose value is .Values.imagePullSecrets
serialized as YAML. Callers indent the result to pod-spec level.
*/}}
{{- define "hami-vgpu.imagePullSecrets" -}}
{{- $secrets := .Values.imagePullSecrets -}}
imagePullSecrets: {{ $secrets | toYaml | nindent 2 }}
{{- end }}
|
||||
|
||||
{{/*
Image tag for the kube-scheduler container.
Uses .Values.scheduler.kubeScheduler.imageTag when it is set; otherwise
falls back to the output of the "strippedKubeVersion" template.
Surrounding whitespace is trimmed in both cases.
*/}}
{{- define "resolvedKubeSchedulerTag" -}}
{{- with .Values.scheduler.kubeScheduler.imageTag -}}
{{- trim . -}}
{{- else -}}
{{- include "strippedKubeVersion" $ | trim -}}
{{- end -}}
{{- end -}}
|
||||
|
||||
|
||||
{{/*
Kubernetes version string without the part that follows the first "+".
  v1.31.1+k3s1 -> v1.31.1
  v1.31.1      -> v1.31.1
*/}}
{{- define "strippedKubeVersion" -}}
{{- .Capabilities.KubeVersion.Version | splitList "+" | first -}}
{{- end -}}
|
@ -0,0 +1,24 @@
|
||||
{{- /*
ConfigMap named after the device plugin; carries a single config.json entry
with a "nodeconfig" list.
NOTE(review): the node name in the entry below is hard-coded - confirm it is
meant as a sample and not tied to one cluster.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
data:
  config.json: |
    {
        "nodeconfig": [
            {
                "name": "m5-cloudinfra-online02",
                "operatingmode": "hami-core",
                "devicememoryscaling": 1.8,
                "devicesplitcount": 10,
                "migstrategy":"none",
                "filterdevices": {
                  "uuid": [],
                  "index": []
                }
            }
        ]
    }
|
@ -0,0 +1,166 @@
|
||||
{{- /*
DaemonSet running two containers per node from the same image:
"device-plugin" (nvidia-device-plugin) and "vgpu-monitor" (vGPUmonitor).
*/ -}}
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    {{- with .Values.global.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.global.annotations }}
  annotations: {{ toYaml . | nindent 4 }}
  {{- end }}
spec:
  updateStrategy:
    {{- with .Values.devicePlugin.updateStrategy }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  selector:
    matchLabels:
      app.kubernetes.io/component: hami-device-plugin
      {{- include "hami-vgpu.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels:
        app.kubernetes.io/component: hami-device-plugin
        hami.io/webhook: ignore
        {{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
      {{- with .Values.devicePlugin.podAnnotations }}
      annotations: {{ toYaml . | nindent 8 }}
      {{- end }}
    spec:
      {{- with .Values.devicePlugin.runtimeClassName }}
      runtimeClassName: {{ . }}
      {{- end }}
      {{- include "hami-vgpu.imagePullSecrets" . | nindent 6 }}
      serviceAccountName: {{ include "hami-vgpu.device-plugin" . }}
      priorityClassName: system-node-critical
      hostPID: true
      hostNetwork: true
      containers:
        - name: device-plugin
          image: {{ .Values.devicePlugin.image }}:{{ .Values.version }}
          imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }}
          lifecycle:
            postStart:
              exec:
                # Runs vgpu-init.sh against "<gpuHookPath>/vgpu/" right after the container starts.
                command: ["/bin/sh","-c", {{ printf "/k8s-vgpu/bin/vgpu-init.sh %s/vgpu/" .Values.global.gpuHookPath | quote }}]
          command:
            - nvidia-device-plugin
            - --config-file=/device-config.yaml
            - --mig-strategy={{ .Values.devicePlugin.migStrategy }}
            - --disable-core-limit={{ .Values.devicePlugin.disablecorelimit }}
            {{- range .Values.devicePlugin.extraArgs }}
            - {{ . }}
            {{- end }}
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: NVIDIA_MIG_MONITOR_DEVICES
              value: all
            - name: HOOK_PATH
              value: {{ .Values.global.gpuHookPath }}
            {{- /* Only emitted when the value is an actual boolean (true or false), not when unset. */}}
            {{- if typeIs "bool" .Values.devicePlugin.passDeviceSpecsEnabled }}
            - name: PASS_DEVICE_SPECS
              value: {{ .Values.devicePlugin.passDeviceSpecsEnabled | quote }}
            {{- end }}
          securityContext:
            privileged: true
            allowPrivilegeEscalation: true
            capabilities:
              drop: ["ALL"]
              add: ["SYS_ADMIN"]
          resources:
            {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: lib
              mountPath: {{ printf "%s%s" .Values.global.gpuHookPath "/vgpu" }}
            - name: usrbin
              mountPath: /usrbin
            - name: deviceconfig
              mountPath: /config
            - name: hosttmp
              mountPath: /tmp
            - name: device-config
              mountPath: /device-config.yaml
              subPath: device-config.yaml
        - name: vgpu-monitor
          image: {{ .Values.devicePlugin.image }}:{{ .Values.version }}
          imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }}
          command: ["vGPUmonitor"]
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
              add: ["SYS_ADMIN"]
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_MIG_MONITOR_DEVICES
              value: all
            - name: HOOK_PATH
              value: {{ .Values.global.gpuHookPath }}/vgpu
          resources:
            {{- toYaml .Values.devicePlugin.vgpuMonitor.resources | nindent 12 }}
          volumeMounts:
            - name: ctrs
              mountPath: {{ .Values.devicePlugin.monitorctrPath }}
            - name: dockers
              mountPath: /run/docker
            - name: containerds
              mountPath: /run/containerd
            - name: sysinfo
              mountPath: /sysinfo
            - name: hostvar
              mountPath: /hostvar
            - name: hosttmp
              mountPath: /tmp
      volumes:
        - name: ctrs
          hostPath:
            path: {{ .Values.devicePlugin.monitorctrPath }}
        - name: hosttmp
          hostPath:
            path: /tmp
        - name: dockers
          hostPath:
            path: /run/docker
        - name: containerds
          hostPath:
            path: /run/containerd
        - name: device-plugin
          hostPath:
            path: {{ .Values.devicePlugin.pluginPath }}
        - name: lib
          hostPath:
            path: {{ .Values.devicePlugin.libPath }}
        - name: usrbin
          hostPath:
            path: /usr/bin
        - name: sysinfo
          hostPath:
            path: /sys
        - name: hostvar
          hostPath:
            path: /var
        - name: deviceconfig
          configMap:
            name: {{ include "hami-vgpu.device-plugin" . }}
        - name: device-config
          configMap:
            name: {{ include "hami-vgpu.scheduler" . }}-device
      {{- with .Values.devicePlugin.nvidianodeSelector }}
      nodeSelector: {{ toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.devicePlugin.tolerations }}
      tolerations: {{ toYaml . | nindent 8 }}
      {{- end }}
|
@ -0,0 +1,27 @@
|
||||
# ClusterRole bound to the device plugin service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}-monitor
rules:
  # Pods (core API group).
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "create", "watch", "list", "update", "patch"]
  # Nodes (core API group).
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "update", "list", "patch"]
|
||||
|
||||
|
@ -0,0 +1,16 @@
|
||||
# Grants the "<device-plugin>-monitor" ClusterRole to the device plugin
# service account in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.device-plugin" . }}
    namespace: {{ .Release.Namespace | quote }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "hami-vgpu.device-plugin" . }}-monitor
|
@ -0,0 +1,23 @@
|
||||
{{- /*
NodePort Service in front of the device plugin pods. The same value
(.Values.devicePlugin.service.httpPort) is used for the service port and the
node port; traffic is forwarded to container port 9394.

The selector carries the release selector labels in addition to the component
label, matching the DaemonSet's own selector, so pods of another release in
the same namespace are not picked up.

NOTE(review): extra labels/annotations are read from .Values.scheduler.service
rather than .Values.devicePlugin.service - confirm this sharing is intended.
*/ -}}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}-monitor
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    {{- with .Values.scheduler.service.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.scheduler.service.annotations }}
  annotations: {{ toYaml . | nindent 4 }}
  {{- end }}
spec:
  externalTrafficPolicy: Local
  selector:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.selectorLabels" . | nindent 4 }}
  type: NodePort
  ports:
    - name: monitorport
      port: {{ .Values.devicePlugin.service.httpPort }}
      targetPort: 9394
      nodePort: {{ .Values.devicePlugin.service.httpPort }}
|
@ -0,0 +1,8 @@
|
||||
# Service account used by the device plugin pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
  namespace: {{ .Release.Namespace | quote }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
|
@ -0,0 +1,88 @@
|
||||
{{- /*
Legacy scheduler "Policy" (JSON) for kube-scheduler, rendered only when
.Values.scheduler.kubeScheduler.enabled is set. It registers one HTTPS
extender on 127.0.0.1:443 and lists the extended resources it manages, each
with ignoredByScheduler=true.

The extender option is spelled "ignorable"; the previous spelling
"ignoreable" is not a field of the extender configuration.
*/ -}}
{{- if .Values.scheduler.kubeScheduler.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
data:
  config.json: |
    {
        "kind": "Policy",
        "apiVersion": "v1",
        "extenders": [
            {
                "urlPrefix": "https://127.0.0.1:443",
                "filterVerb": "filter",
                "bindVerb": "bind",
                "enableHttps": true,
                "weight": 1,
                "nodeCacheCapable": true,
                "httpTimeout": 30000000000,
                "tlsConfig": {
                    "insecure": true
                },
                "managedResources": [
                    {{- /* Optional vendor resources come first so the fixed last entry can omit the trailing comma. */}}
                    {{- if .Values.devices.ascend.enabled }}
                    {{- range .Values.devices.ascend.customresources }}
                    {
                        "name": "{{ . }}",
                        "ignoredByScheduler": true
                    },
                    {{- end }}
                    {{- end }}
                    {{- if .Values.devices.mthreads.enabled }}
                    {{- range .Values.devices.mthreads.customresources }}
                    {
                        "name": "{{ . }}",
                        "ignoredByScheduler": true
                    },
                    {{- end }}
                    {{- end }}
                    {
                        "name": "{{ .Values.resourceName }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.resourceMem }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.resourceCores }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.resourceMemPercentage }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.resourcePriority }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.mluResourceName }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.dcuResourceName }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.dcuResourceMem }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.dcuResourceCores }}",
                        "ignoredByScheduler": true
                    },
                    {
                        "name": "{{ .Values.iluvatarResourceName }}",
                        "ignoredByScheduler": true
                    }
                ],
                "ignorable": false
            }
        ]
    }
{{- end }}
|
@ -0,0 +1,64 @@
|
||||
{{- /*
KubeSchedulerConfiguration for kube-scheduler, rendered only when
.Values.scheduler.kubeScheduler.enabled is set. It registers one HTTPS
extender on 127.0.0.1:443 and lists the extended resources it manages.

The cluster minor version can carry a vendor suffix (for example "28+").
Sprig's `int` turns such a string into 0, which would always select the
v1beta2 API; only the leading digits are parsed here.
*/ -}}
{{- $kubeMinor := regexFind "[0-9]+" .Capabilities.KubeVersion.Minor | int -}}
{{- if .Values.scheduler.kubeScheduler.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}-newversion
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
data:
  config.yaml: |
    {{- /* v1 is used for minor versions above 25, v1beta2 otherwise. */}}
    {{- if gt $kubeMinor 25 }}
    apiVersion: kubescheduler.config.k8s.io/v1
    {{- else }}
    apiVersion: kubescheduler.config.k8s.io/v1beta2
    {{- end }}
    kind: KubeSchedulerConfiguration
    leaderElection:
      leaderElect: false
    profiles:
    - schedulerName: {{ .Values.schedulerName }}
    extenders:
    - urlPrefix: "https://127.0.0.1:443"
      filterVerb: filter
      bindVerb: bind
      nodeCacheCapable: true
      weight: 1
      httpTimeout: 30s
      enableHTTPS: true
      tlsConfig:
        insecure: true
      managedResources:
      - name: {{ .Values.resourceName }}
        ignoredByScheduler: true
      - name: {{ .Values.resourceMem }}
        ignoredByScheduler: true
      - name: {{ .Values.resourceCores }}
        ignoredByScheduler: true
      - name: {{ .Values.resourceMemPercentage }}
        ignoredByScheduler: true
      - name: {{ .Values.resourcePriority }}
        ignoredByScheduler: true
      - name: {{ .Values.mluResourceName }}
        ignoredByScheduler: true
      - name: {{ .Values.dcuResourceName }}
        ignoredByScheduler: true
      - name: {{ .Values.dcuResourceMem }}
        ignoredByScheduler: true
      - name: {{ .Values.dcuResourceCores }}
        ignoredByScheduler: true
      - name: {{ .Values.iluvatarResourceName }}
        ignoredByScheduler: true
      {{- if .Values.devices.ascend.enabled }}
      {{- range .Values.devices.ascend.customresources }}
      - name: {{ . }}
        ignoredByScheduler: true
      {{- end }}
      {{- end }}
      {{- if .Values.devices.mthreads.enabled }}
      {{- range .Values.devices.mthreads.customresources }}
      - name: {{ . }}
        ignoredByScheduler: true
      {{- end }}
      {{- end }}
{{- end }}
|
@ -0,0 +1,152 @@
|
||||
{{- /*
Scheduler Deployment: an optional "kube-scheduler" container (when
.Values.scheduler.kubeScheduler.enabled is set) plus the
"vgpu-scheduler-extender" container serving HTTPS on port 443.

The cluster minor version can carry a vendor suffix (for example "28+").
Sprig's `int` turns such a string into 0, which would select the legacy
arguments and ConfigMap; only the leading digits are parsed here.
*/ -}}
{{- $kubeMinor := regexFind "[0-9]+" .Capabilities.KubeVersion.Minor | int -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    {{- with .Values.global.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- if .Values.global.annotations }}
  annotations: {{ toYaml .Values.global.annotations | nindent 4 }}
  {{- end }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: hami-scheduler
      {{- include "hami-vgpu.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels:
        app.kubernetes.io/component: hami-scheduler
        {{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
        hami.io/webhook: ignore
      {{- if .Values.scheduler.podAnnotations }}
      annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- include "hami-vgpu.imagePullSecrets" . | nindent 6 }}
      serviceAccountName: {{ include "hami-vgpu.scheduler" . }}
      priorityClassName: system-node-critical
      containers:
        {{- if .Values.scheduler.kubeScheduler.enabled }}
        - name: kube-scheduler
          image: "{{ .Values.scheduler.kubeScheduler.image }}:{{ include "resolvedKubeSchedulerTag" . }}"
          imagePullPolicy: {{ .Values.scheduler.kubeScheduler.imagePullPolicy | quote }}
          command:
            - kube-scheduler
            {{- /* Minor >= 22 uses extraNewArgs; older clusters get --scheduler-name plus extraArgs. */}}
            {{- if ge $kubeMinor 22 }}
            {{- range .Values.scheduler.kubeScheduler.extraNewArgs }}
            - {{ . }}
            {{- end }}
            {{- else }}
            - --scheduler-name={{ .Values.schedulerName }}
            {{- range .Values.scheduler.kubeScheduler.extraArgs }}
            - {{ . }}
            {{- end }}
            {{- end }}
            - --leader-elect={{ .Values.scheduler.leaderElect }}
            - --leader-elect-resource-name={{ .Values.schedulerName }}
            - --leader-elect-resource-namespace={{ .Release.Namespace }}
          resources:
            {{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }}
          volumeMounts:
            - name: scheduler-config
              mountPath: /config
          {{- /*
          This probe belongs to the kube-scheduler container, so it stays inside
          the kubeScheduler.enabled guard; outside it, a disabled kube-scheduler
          would leave a dangling "livenessProbe" key directly under "containers".
          */}}
          {{- if .Values.scheduler.livenessProbe }}
          livenessProbe:
            failureThreshold: 8
            httpGet:
              path: /healthz
              port: 10259
              scheme: HTTPS
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 15
          {{- end }}
        {{- end }}
        - name: vgpu-scheduler-extender
          image: {{ .Values.scheduler.extender.image }}:{{ .Values.version }}
          imagePullPolicy: {{ .Values.scheduler.extender.imagePullPolicy | quote }}
          env:
          {{- /* One NODE_SELECTOR_<KEY> variable per managedNodeSelector entry; "-" in the key becomes "_". */}}
          {{- if .Values.global.managedNodeSelectorEnable }}
          {{- range $key, $value := .Values.global.managedNodeSelector }}
            - name: NODE_SELECTOR_{{ $key | upper | replace "-" "_" }}
              value: "{{ $value }}"
          {{- end }}
          {{- end }}
          command:
            - scheduler
            - --http_bind=0.0.0.0:443
            - --cert_file=/tls/tls.crt
            - --key_file=/tls/tls.key
            - --scheduler-name={{ .Values.schedulerName }}
            - --metrics-bind-address={{ .Values.scheduler.metricsBindAddress }}
            - --node-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy }}
            - --gpu-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy }}
            - --device-config-file=/device-config.yaml
            {{- if .Values.devices.ascend.enabled }}
            - --enable-ascend=true
            {{- end }}
            {{- /* Builds a single "k1=v1,k2=v2" argument from the nodeLabelSelector map. */}}
            {{- if .Values.scheduler.nodeLabelSelector }}
            - --node-label-selector={{- $first := true -}}
            {{- range $key, $value := .Values.scheduler.nodeLabelSelector -}}
            {{- if not $first }},{{ end -}}
            {{- $key }}={{ $value -}}
            {{- $first = false -}}
            {{- end -}}
            {{- end }}
            {{- range .Values.scheduler.extender.extraArgs }}
            - {{ . }}
            {{- end }}
          ports:
            - name: http
              containerPort: 443
              protocol: TCP
          resources:
            {{- toYaml .Values.scheduler.extender.resources | nindent 12 }}
          volumeMounts:
            - name: tls-config
              mountPath: /tls
            - name: device-config
              mountPath: /device-config.yaml
              subPath: device-config.yaml
          {{- if .Values.scheduler.livenessProbe }}
          livenessProbe:
            httpGet:
              path: /healthz
              port: 443
              scheme: HTTPS
            initialDelaySeconds: 10
            periodSeconds: 10
            failureThreshold: 3
            timeoutSeconds: 5
          {{- end }}
      volumes:
        - name: tls-config
          secret:
            secretName: {{ template "hami-vgpu.scheduler.tls" . }}
        {{- if .Values.scheduler.kubeScheduler.enabled }}
        - name: scheduler-config
          configMap:
            {{- if ge $kubeMinor 22 }}
            name: {{ template "hami-vgpu.scheduler" . }}-newversion
            {{- else }}
            name: {{ template "hami-vgpu.scheduler" . }}
            {{- end }}
        {{- end }}
        - name: device-config
          configMap:
            name: {{ include "hami-vgpu.scheduler" . }}-device
      {{- if .Values.scheduler.nodeSelector }}
      nodeSelector: {{ toYaml .Values.scheduler.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.scheduler.tolerations }}
      tolerations: {{ toYaml .Values.scheduler.tolerations | nindent 8 }}
      {{- end }}
      {{- if .Values.scheduler.nodeName }}
      nodeName: {{ .Values.scheduler.nodeName }}
      {{- end }}
|
@ -0,0 +1,177 @@
|
||||
{{- /*
ConfigMap "<scheduler>-device" holding device-config.yaml. When the chart
ships files/device-config.yaml, that file is embedded verbatim; otherwise the
built-in defaults below are rendered from chart values.

NOTE(review): nesting of knownMigGeometries under "nvidia" was reconstructed
from a source with lost indentation - confirm against the consumer's schema.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}-device
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
data:
  device-config.yaml: |-
    {{- if .Files.Glob "files/device-config.yaml" }}
    {{- .Files.Get "files/device-config.yaml" | nindent 4 }}
    {{- else }}
    nvidia:
      resourceCountName: {{ .Values.resourceName }}
      resourceMemoryName: {{ .Values.resourceMem }}
      resourceMemoryPercentageName: {{ .Values.resourceMemPercentage }}
      resourceCoreName: {{ .Values.resourceCores }}
      resourcePriorityName: {{ .Values.resourcePriority }}
      overwriteEnv: false
      defaultMemory: 0
      defaultCores: 0
      defaultGPUNum: 1
      deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }}
      deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }}
      deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }}
      {{- /* Each allowedGeometries item is one list of MIG instance profiles. */}}
      knownMigGeometries:
      - models: [ "A30" ]
        allowedGeometries:
          -
            - name: 1g.6gb
              memory: 6144
              count: 4
          -
            - name: 2g.12gb
              memory: 12288
              count: 2
          -
            - name: 4g.24gb
              memory: 24576
              count: 1
      {{- /* "A100-SXM4-40GB" was listed twice; the duplicate entry is dropped. */}}
      - models: [ "A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB" ]
        allowedGeometries:
          -
            - name: 1g.5gb
              memory: 5120
              count: 7
          -
            - name: 2g.10gb
              memory: 10240
              count: 3
            - name: 1g.5gb
              memory: 5120
              count: 1
          -
            - name: 3g.20gb
              memory: 20480
              count: 2
          -
            - name: 7g.40gb
              memory: 40960
              count: 1
      - models: [ "A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"]
        allowedGeometries:
          -
            - name: 1g.10gb
              memory: 10240
              count: 7
          -
            - name: 2g.20gb
              memory: 20480
              count: 3
            - name: 1g.10gb
              memory: 10240
              count: 1
          -
            - name: 3g.40gb
              memory: 40960
              count: 2
          -
            - name: 7g.79gb
              memory: 80896
              count: 1
    cambricon:
      resourceCountName: {{ .Values.mluResourceName }}
      resourceMemoryName: {{ .Values.mluResourceMem }}
      resourceCoreName: {{ .Values.mluResourceCores }}
    hygon:
      resourceCountName: {{ .Values.dcuResourceName }}
      resourceMemoryName: {{ .Values.dcuResourceMem }}
      resourceCoreName: {{ .Values.dcuResourceCores }}
    metax:
      resourceCountName: "metax-tech.com/gpu"
    mthreads:
      resourceCountName: "mthreads.com/vgpu"
      resourceMemoryName: "mthreads.com/sgpu-memory"
      resourceCoreName: "mthreads.com/sgpu-core"
    iluvatar:
      resourceCountName: {{ .Values.iluvatarResourceName }}
      resourceMemoryName: {{ .Values.iluvatarResourceMem }}
      resourceCoreName: {{ .Values.iluvatarResourceCore }}
    vnpus:
    - chipName: 910B
      commonWord: Ascend910A
      resourceName: huawei.com/Ascend910A
      resourceMemoryName: huawei.com/Ascend910A-memory
      memoryAllocatable: 32768
      memoryCapacity: 32768
      aiCore: 30
      templates:
        - name: vir02
          memory: 2184
          aiCore: 2
        - name: vir04
          memory: 4369
          aiCore: 4
        - name: vir08
          memory: 8738
          aiCore: 8
        - name: vir16
          memory: 17476
          aiCore: 16
    - chipName: 910B3
      commonWord: Ascend910B
      resourceName: huawei.com/Ascend910B
      resourceMemoryName: huawei.com/Ascend910B-memory
      memoryAllocatable: 65536
      memoryCapacity: 65536
      aiCore: 20
      aiCPU: 7
      templates:
        - name: vir05_1c_16g
          memory: 16384
          aiCore: 5
          aiCPU: 1
        - name: vir10_3c_32g
          memory: 32768
          aiCore: 10
          aiCPU: 3
    - chipName: 910B4
      commonWord: Ascend910B4
      resourceName: huawei.com/Ascend910B4
      resourceMemoryName: huawei.com/Ascend910B4-memory
      memoryAllocatable: 32768
      memoryCapacity: 32768
      aiCore: 20
      aiCPU: 7
      templates:
        - name: vir05_1c_8g
          memory: 8192
          aiCore: 5
          aiCPU: 1
        - name: vir10_3c_16g
          memory: 16384
          aiCore: 10
          aiCPU: 3
    - chipName: 310P3
      commonWord: Ascend310P
      resourceName: huawei.com/Ascend310P
      resourceMemoryName: huawei.com/Ascend310P-memory
      memoryAllocatable: 21527
      memoryCapacity: 24576
      aiCore: 8
      aiCPU: 7
      templates:
        - name: vir01
          memory: 3072
          aiCore: 1
          aiCPU: 1
        - name: vir02
          memory: 6144
          aiCore: 2
          aiCPU: 2
        - name: vir04
          memory: 12288
          aiCore: 4
          aiCPU: 4
    {{- end }}
|
@ -0,0 +1,26 @@
|
||||
# ClusterRole for the admission hook jobs. Created as a Helm hook and removed
# again once the hook succeeds.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
  annotations:
    helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
rules:
  # Only mutating webhook configurations are covered (validating ones are not).
  - apiGroups: ["admissionregistration.k8s.io"]
    resources: ["mutatingwebhookconfigurations"]
    verbs: ["get", "update"]
{{- if .Values.podSecurityPolicy.enabled }}
  - apiGroups: ["extensions"]
    resources: ["podsecuritypolicies"]
    verbs: ["use"]
    resourceNames:
      - {{ include "hami-vgpu.fullname" . }}-admission
{{- end }}
|
@ -0,0 +1,18 @@
|
||||
# Binds the "<fullname>-admission" ClusterRole to the service account of the
# same name in the release namespace. Created as a Helm hook.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
  annotations:
    helm.sh/hook: pre-install,pre-upgrade,post-install,post-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.fullname" . }}-admission
    namespace: {{ .Release.Namespace | quote }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "hami-vgpu.fullname" . }}-admission
|
@ -0,0 +1,60 @@
|
||||
{{- /*
Pre-install / pre-upgrade hook Job that runs the patch image with the
"create" argument, passing the certificate/key names, host list, namespace and
the scheduler TLS secret name.

The cluster minor version can carry a vendor suffix (for example "28+").
Sprig's `int` turns such a string into 0, which would select the older image;
only the leading digits are parsed here.
*/ -}}
{{- $kubeMinor := regexFind "[0-9]+" .Capabilities.KubeVersion.Minor | int -}}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission-create
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
spec:
  {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }}
  # Alpha feature since k8s 1.12
  ttlSecondsAfterFinished: 0
  {{- end }}
  template:
    metadata:
      name: {{ include "hami-vgpu.fullname" . }}-admission-create
      {{- if .Values.scheduler.patch.podAnnotations }}
      annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }}
      {{- end }}
      labels:
        {{- include "hami-vgpu.labels" . | nindent 8 }}
        app.kubernetes.io/component: admission-webhook
        hami.io/webhook: ignore
    spec:
      {{- include "hami-vgpu.imagePullSecrets" . | nindent 6 }}
      {{- if .Values.scheduler.patch.priorityClassName }}
      priorityClassName: {{ .Values.scheduler.patch.priorityClassName }}
      {{- end }}
      containers:
        - name: create
          {{- /* imageNew is used for minor versions >= 22, image otherwise. */}}
          {{- if ge $kubeMinor 22 }}
          image: {{ .Values.scheduler.patch.imageNew }}
          {{- else }}
          image: {{ .Values.scheduler.patch.image }}
          {{- end }}
          imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }}
          args:
            - create
            - --cert-name=tls.crt
            - --key-name=tls.key
            {{- /* Hosts: the scheduler service DNS name and 127.0.0.1, plus the custom URL host when enabled. */}}
            {{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
            - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host }}
            {{- else }}
            - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }}
            {{- end }}
            - --namespace={{ .Release.Namespace }}
            - --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
      restartPolicy: OnFailure
      serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
      {{- if .Values.scheduler.patch.nodeSelector }}
      nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.scheduler.patch.tolerations }}
      tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }}
      {{- end }}
      securityContext:
        runAsNonRoot: true
        runAsUser: {{ .Values.scheduler.patch.runAsUser }}
|
@ -0,0 +1,55 @@
|
||||
{{- /*
Helm hook Job (post-install, post-upgrade) that runs the image configured in
.Values.scheduler.patch with the `patch` sub-command, passing it the scheduler
webhook configuration name, the release namespace and the TLS secret name.
Helm removes any previous instance before creating this Job and deletes it
again once it has succeeded.
*/ -}}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission-patch
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
spec:
  {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }}
  # Alpha feature since k8s 1.12
  ttlSecondsAfterFinished: 0
  {{- end }}
  template:
    metadata:
      name: {{ include "hami-vgpu.fullname" . }}-admission-patch
      {{- if .Values.scheduler.patch.podAnnotations }}
      annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }}
      {{- end }}
      labels:
        {{- include "hami-vgpu.labels" . | nindent 8 }}
        app.kubernetes.io/component: admission-webhook
        hami.io/webhook: ignore
    spec:
      {{- include "hami-vgpu.imagePullSecrets" . | nindent 6 }}
      {{- if .Values.scheduler.patch.priorityClassName }}
      priorityClassName: {{ .Values.scheduler.patch.priorityClassName }}
      {{- end }}
      containers:
        - name: patch
          {{- /*
          Choose the image from the full cluster version. KubeVersion.Minor can
          carry a vendor suffix such as "22+", which `int` converts to 0 and
          which would wrongly select the legacy image on such clusters. The
          "-0" suffix makes the constraint also match pre-release/vendor
          versions such as v1.27.3-gke.100.
          */}}
          {{- if semverCompare ">=1.22.0-0" .Capabilities.KubeVersion.Version }}
          image: {{ .Values.scheduler.patch.imageNew }}
          {{- else }}
          image: {{ .Values.scheduler.patch.image }}
          {{- end }}
          imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }}
          args:
            - patch
            - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }}
            - --namespace={{ .Release.Namespace }}
            - --patch-validating=false
            - --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
      restartPolicy: OnFailure
      serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
      {{- if .Values.scheduler.patch.nodeSelector }}
      nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }}
      {{- end }}
      {{- if .Values.scheduler.patch.tolerations }}
      tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }}
      {{- end }}
      securityContext:
        runAsNonRoot: true
        runAsUser: {{ .Values.scheduler.patch.runAsUser }}
|
@ -0,0 +1,36 @@
|
||||
{{- /*
PodSecurityPolicy named "<fullname>-admission", installed as a Helm hook.
It forbids privilege escalation, drops all capabilities, requires a non-root
user and limits volumes to configMap, emptyDir, projected, secret and
downwardAPI.

The policy/v1beta1 PodSecurityPolicy API was removed in Kubernetes 1.25, so
the manifest is only rendered when the cluster still serves that resource.
Without this guard, podSecurityPolicy.enabled=true makes install/upgrade fail
on newer clusters.
*/ -}}
{{- if and .Values.podSecurityPolicy.enabled (.Capabilities.APIVersions.Has "policy/v1beta1/PodSecurityPolicy") }}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
spec:
  allowPrivilegeEscalation: false
  fsGroup:
    ranges:
      - max: 65535
        min: 1
    rule: MustRunAs
  requiredDropCapabilities:
    - ALL
  runAsUser:
    rule: MustRunAsNonRoot
  seLinux:
    rule: RunAsAny
  supplementalGroups:
    ranges:
      - max: 65535
        min: 1
    rule: MustRunAs
  volumes:
    - configMap
    - emptyDir
    - projected
    - secret
    - downwardAPI
{{- end }}
|
@ -0,0 +1,18 @@
|
||||
{{- /*
Namespaced Role "<fullname>-admission": grants `get` and `create` on core
Secrets. Installed as a Helm hook around install and upgrade; Helm replaces
any previous instance and deletes it once the hook has succeeded.
*/ -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
  - apiGroups:
      - ""
    resources:
      - secrets
    verbs:
      - get
      - create
|
@ -0,0 +1,18 @@
|
||||
{{- /*
Binds the "<fullname>-admission" Role to the ServiceAccount of the same name
in the release namespace. Installed as a Helm hook with the same lifecycle
annotations as the Role it references.
*/ -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.fullname" . }}-admission
    namespace: {{ .Release.Namespace | quote }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: {{ include "hami-vgpu.fullname" . }}-admission
|
@ -0,0 +1,10 @@
|
||||
{{- /*
ServiceAccount "<fullname>-admission", created as a Helm hook around install
and upgrade and deleted again once the hook has succeeded.
*/ -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
@ -0,0 +1,15 @@
|
||||
{{- /*
Binds the scheduler ServiceAccount (release namespace) to the built-in
`cluster-admin` ClusterRole.
NOTE(review): this grants the scheduler unrestricted cluster access; confirm
that a narrower ClusterRole is not sufficient.
*/ -}}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
  labels:
    app.kubernetes.io/component: "hami-scheduler"
    {{- include "hami-vgpu.labels" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.scheduler" . }}
    namespace: {{ .Release.Namespace | quote }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
|
@ -0,0 +1,30 @@
|
||||
{{- /*
NodePort Service in front of the scheduler pods.
  - "http":    service port scheduler.service.httpPort -> container port 443,
               exposed on node port scheduler.service.schedulerPort.
  - "monitor": service port and node port scheduler.service.monitorPort ->
               the port part (text after the first ":") of
               scheduler.metricsBindAddress.
Extra labels and annotations come from scheduler.service.labels and
scheduler.service.annotations when they are set.
*/ -}}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    {{- with .Values.scheduler.service.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.scheduler.service.annotations }}
  annotations: {{ toYaml . | nindent 4 }}
  {{- end }}
spec:
  type: NodePort
  selector:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.selectorLabels" . | nindent 4 }}
  ports:
    - name: http
      protocol: TCP
      port: {{ .Values.scheduler.service.httpPort }}
      targetPort: 443
      nodePort: {{ .Values.scheduler.service.schedulerPort }}
    - name: monitor
      protocol: TCP
      port: {{ .Values.scheduler.service.monitorPort }}
      targetPort: {{ (split ":" (printf "%s" .Values.scheduler.metricsBindAddress))._1 }}
      nodePort: {{ .Values.scheduler.service.monitorPort }}
|
||||
|
@ -0,0 +1,8 @@
|
||||
{{- /*
ServiceAccount for the scheduler, created in the release namespace and named
by the "hami-vgpu.scheduler" helper.
*/ -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
  namespace: {{ .Release.Namespace | quote }}
  labels:
    app.kubernetes.io/component: "hami-scheduler"
    {{- include "hami-vgpu.labels" . | nindent 4 }}
|
@ -0,0 +1,51 @@
|
||||
{{- /*
MutatingWebhookConfiguration "vgpu.hami.io": sends CREATE requests for core/v1
Pods either to a custom HTTPS URL (admissionWebhook.customURL.enabled) or to
the scheduler Service at /webhook.

Requests are skipped for namespaces and objects labelled
hami.io/webhook=ignore and, when admissionWebhook.whitelistNamespaces is set,
for the namespaces listed there.
*/ -}}
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
  name: {{ include "hami-vgpu.scheduler.webhook" . }}
webhooks:
  - name: vgpu.hami.io
    admissionReviewVersions:
      - v1beta1
    sideEffects: None
    timeoutSeconds: 10
    matchPolicy: Equivalent
    failurePolicy: {{ .Values.scheduler.admissionWebhook.failurePolicy }}
    reinvocationPolicy: {{ .Values.scheduler.admissionWebhook.reinvocationPolicy }}
    clientConfig:
      {{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
      url: https://{{ .Values.scheduler.admissionWebhook.customURL.host }}:{{ .Values.scheduler.admissionWebhook.customURL.port }}{{ .Values.scheduler.admissionWebhook.customURL.path }}
      {{- else }}
      service:
        name: {{ include "hami-vgpu.scheduler" . }}
        namespace: {{ .Release.Namespace }}
        path: /webhook
        port: {{ .Values.scheduler.service.httpPort }}
      {{- end }}
    rules:
      - apiGroups:
          - ""
        apiVersions:
          - v1
        operations:
          - CREATE
        resources:
          - pods
        scope: '*'
    namespaceSelector:
      matchExpressions:
        - key: hami.io/webhook
          operator: NotIn
          values:
            - ignore
        {{- with .Values.scheduler.admissionWebhook.whitelistNamespaces }}
        - key: kubernetes.io/metadata.name
          operator: NotIn
          values:
            {{- toYaml . | nindent 12 }}
        {{- end }}
    objectSelector:
      matchExpressions:
        - key: hami.io/webhook
          operator: NotIn
          values:
            - ignore
|
@ -0,0 +1,204 @@
|
||||
# Default values for hami-vgpu.
|
||||
|
||||
nameOverride: ""
|
||||
fullnameOverride: ""
|
||||
imagePullSecrets: [ ]
|
||||
version: "v2.5.0"
|
||||
|
||||
#Nvidia GPU Parameters
|
||||
resourceName: "nvidia.com/gpu"
|
||||
resourceMem: "nvidia.com/gpumem"
|
||||
resourceMemPercentage: "nvidia.com/gpumem-percentage"
|
||||
resourceCores: "nvidia.com/gpucores"
|
||||
resourcePriority: "nvidia.com/priority"
|
||||
|
||||
#MLU Parameters
|
||||
mluResourceName: "cambricon.com/vmlu"
|
||||
mluResourceMem: "cambricon.com/mlu.smlu.vmemory"
|
||||
mluResourceCores: "cambricon.com/mlu.smlu.vcore"
|
||||
|
||||
#Hygon DCU Parameters
|
||||
dcuResourceName: "hygon.com/dcunum"
|
||||
dcuResourceMem: "hygon.com/dcumem"
|
||||
dcuResourceCores: "hygon.com/dcucores"
|
||||
|
||||
#Iluvatar GPU Parameters
|
||||
iluvatarResourceName: "iluvatar.ai/vgpu"
|
||||
iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
|
||||
iluvatarResourceCore: "iluvatar.ai/vcuda-core"
|
||||
|
||||
schedulerName: "hami-scheduler"
|
||||
|
||||
podSecurityPolicy:
|
||||
enabled: false
|
||||
|
||||
global:
|
||||
gpuHookPath: /usr/local
|
||||
labels: {}
|
||||
annotations: {}
|
||||
managedNodeSelectorEnable: false
|
||||
managedNodeSelector:
|
||||
usage: "gpu"
|
||||
|
||||
|
||||
scheduler:
|
||||
# @param nodeName names the node that the hami-scheduler pod is placed on.
|
||||
# If hami-scheduler is installed as the cluster's default scheduler, the stock Kubernetes
|
||||
# scheduler pod has to be removed from the cluster first, so nodeName must be set to bypass the normal scheduling workflow.
|
||||
nodeName: ""
|
||||
#nodeLabelSelector:
|
||||
# "gpu": "on"
|
||||
overwriteEnv: "false"
|
||||
defaultSchedulerPolicy:
|
||||
nodeSchedulerPolicy: binpack
|
||||
gpuSchedulerPolicy: spread
|
||||
metricsBindAddress: ":9395"
|
||||
livenessProbe: false
|
||||
leaderElect: true
|
||||
kubeScheduler:
|
||||
# @param enabled indicates whether to run the kube-scheduler container in the scheduler pod; defaults to true.
|
||||
enabled: true
|
||||
image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler
|
||||
imageTag: ""
|
||||
imagePullPolicy: IfNotPresent
|
||||
resources: {}
|
||||
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
|
||||
# and remove the curly braces after 'resources:'.
|
||||
# limits:
|
||||
# cpu: 1000m
|
||||
# memory: 1000Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 100Mi
|
||||
extraNewArgs:
|
||||
- --config=/config/config.yaml
|
||||
- -v=4
|
||||
extraArgs:
|
||||
- --policy-config-file=/config/config.json
|
||||
- -v=4
|
||||
extender:
|
||||
image: "projecthami/hami"
|
||||
imagePullPolicy: IfNotPresent
|
||||
resources: {}
|
||||
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
|
||||
# and remove the curly braces after 'resources:'.
|
||||
# limits:
|
||||
# cpu: 1000m
|
||||
# memory: 1000Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 100Mi
|
||||
extraArgs:
|
||||
- --debug
|
||||
- -v=4
|
||||
podAnnotations: {}
|
||||
tolerations: []
|
||||
#serviceAccountName: "hami-vgpu-scheduler-sa"
|
||||
admissionWebhook:
|
||||
customURL:
|
||||
enabled: false
|
||||
# must be an endpoint using https.
|
||||
# should generate host certs here
|
||||
host: 127.0.0.1 # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
|
||||
port: 31998
|
||||
path: /webhook
|
||||
whitelistNamespaces:
|
||||
# Specify the namespaces that the webhook will not be applied to.
|
||||
# - default
|
||||
# - kube-system
|
||||
# - istio-system
|
||||
reinvocationPolicy: Never
|
||||
failurePolicy: Ignore
|
||||
patch:
|
||||
image: docker.io/jettech/kube-webhook-certgen:v1.5.2
|
||||
imageNew: liangjw/kube-webhook-certgen:v1.1.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
priorityClassName: ""
|
||||
podAnnotations: {}
|
||||
nodeSelector: {}
|
||||
tolerations: []
|
||||
runAsUser: 2000
|
||||
service:
|
||||
httpPort: 443
|
||||
schedulerPort: 31998
|
||||
monitorPort: 31993
|
||||
labels: {}
|
||||
annotations: {}
|
||||
|
||||
devicePlugin:
|
||||
image: "projecthami/hami"
|
||||
monitorimage: "projecthami/hami"
|
||||
monitorctrPath: /usr/local/vgpu/containers
|
||||
imagePullPolicy: IfNotPresent
|
||||
deviceSplitCount: 40
|
||||
deviceMemoryScaling: 1
|
||||
deviceCoreScaling: 1
|
||||
runtimeClassName: ""
|
||||
migStrategy: "none"
|
||||
disablecorelimit: "false"
|
||||
passDeviceSpecsEnabled: true
|
||||
extraArgs:
|
||||
- -v=4
|
||||
|
||||
service:
|
||||
httpPort: 31992
|
||||
|
||||
pluginPath: /var/lib/kubelet/device-plugins
|
||||
libPath: /usr/local/vgpu
|
||||
|
||||
podAnnotations: {}
|
||||
nvidianodeSelector:
|
||||
gpu: "on"
|
||||
tolerations: []
|
||||
# The updateStrategy for DevicePlugin DaemonSet.
|
||||
# If you want to update the DaemonSet manually, set type to "OnDelete".
|
||||
# We recommend the OnDelete update strategy because restarting a DevicePlugin pod causes business pods to restart, which is disruptive.
|
||||
# Otherwise, use the RollingUpdate update strategy to roll out DevicePlugin pod updates automatically.
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 1
|
||||
|
||||
resources: {}
|
||||
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
|
||||
# and remove the curly braces after 'resources:'.
|
||||
# limits:
|
||||
# cpu: 1000m
|
||||
# memory: 1000Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 100Mi
|
||||
|
||||
vgpuMonitor:
|
||||
resources: {}
|
||||
# If you do want to specify resources, uncomment the following lines, adjust them as necessary,
|
||||
# and remove the curly braces after 'resources:'.
|
||||
# limits:
|
||||
# cpu: 1000m
|
||||
# memory: 1000Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 100Mi
|
||||
|
||||
devices:
|
||||
mthreads:
|
||||
enabled: false
|
||||
customresources:
|
||||
- mthreads.com/vgpu
|
||||
ascend:
|
||||
enabled: false
|
||||
image: ""
|
||||
imagePullPolicy: IfNotPresent
|
||||
extraArgs: []
|
||||
nodeSelector:
|
||||
ascend: "on"
|
||||
tolerations: []
|
||||
customresources:
|
||||
- huawei.com/Ascend910A
|
||||
- huawei.com/Ascend910A-memory
|
||||
- huawei.com/Ascend910B
|
||||
- huawei.com/Ascend910B-memory
|
||||
- huawei.com/Ascend910B4
|
||||
- huawei.com/Ascend910B4-memory
|
||||
- huawei.com/Ascend310P
|
||||
- huawei.com/Ascend310P-memory
|
@ -0,0 +1 @@
|
||||
# Upgrade the "hami" release in kube-system from the local ./hami/ chart.
# The release name must not start with "-": "-hami" is parsed as a bundle of
# short flags (beginning with -h), so no upgrade would be performed.
helm upgrade hami ./hami/ --namespace kube-system
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue