diff --git a/helm/hami/Chart.yaml b/helm/hami/Chart.yaml new file mode 100644 index 0000000..e0b1fe6 --- /dev/null +++ b/helm/hami/Chart.yaml @@ -0,0 +1,15 @@ +apiVersion: v2 +appVersion: 2.5.0 +description: Heterogeneous AI Computing Virtualization Middleware +keywords: +- vgpu +- gpu +kubeVersion: '>= 1.16.0' +maintainers: +- email: limengxuan@4paradigm.com + name: limengxuan +- email: xiaozhang0210@hotmail.com + name: zhangxiao +name: hami +type: application +version: 2.5.0 diff --git a/helm/hami/templates/NOTES.txt b/helm/hami/templates/NOTES.txt new file mode 100644 index 0000000..15bb121 --- /dev/null +++ b/helm/hami/templates/NOTES.txt @@ -0,0 +1,3 @@ +** Please be patient while the chart is being deployed ** +Resource name: {{ .Values.resourceName }} + diff --git a/helm/hami/templates/_helpers.tpl b/helm/hami/templates/_helpers.tpl new file mode 100644 index 0000000..2e2366a --- /dev/null +++ b/helm/hami/templates/_helpers.tpl @@ -0,0 +1,108 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "hami-vgpu.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "hami-vgpu.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +The app name for Scheduler +*/}} +{{- define "hami-vgpu.scheduler" -}} +{{- printf "%s-scheduler" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +The app name for DevicePlugin +*/}} +{{- define "hami-vgpu.device-plugin" -}} +{{- printf "%s-device-plugin" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +The tls secret name for Scheduler +*/}} +{{- define "hami-vgpu.scheduler.tls" -}} +{{- printf "%s-scheduler-tls" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +The webhook name +*/}} +{{- define "hami-vgpu.scheduler.webhook" -}} +{{- printf "%s-webhook" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "hami-vgpu.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "hami-vgpu.labels" -}} +helm.sh/chart: {{ include "hami-vgpu.chart" . }} +{{ include "hami-vgpu.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "hami-vgpu.selectorLabels" -}} +app.kubernetes.io/name: {{ include "hami-vgpu.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Image registry secret name +*/}} +{{- define "hami-vgpu.imagePullSecrets" -}} +imagePullSecrets: {{ toYaml .Values.imagePullSecrets | nindent 2 }} +{{- end }} + +{{/* + Resolve the tag for kubeScheduler. 
+*/}} +{{- define "resolvedKubeSchedulerTag" -}} +{{- if .Values.scheduler.kubeScheduler.imageTag }} +{{- .Values.scheduler.kubeScheduler.imageTag | trim -}} +{{- else }} +{{- include "strippedKubeVersion" . | trim -}} +{{- end }} +{{- end }} + + +{{/* + Remove the part after the `+` in the Kubernetes version string. + v1.31.1+k3s1 -> v1.31.1 + v1.31.1 -> v1.31.1 +*/}} +{{- define "strippedKubeVersion" -}} +{{- $parts := split "+" .Capabilities.KubeVersion.Version -}} +{{- print $parts._0 -}} +{{- end -}} diff --git a/helm/hami/templates/device-plugin/configmap.yaml b/helm/hami/templates/device-plugin/configmap.yaml new file mode 100644 index 0000000..4842739 --- /dev/null +++ b/helm/hami/templates/device-plugin/configmap.yaml @@ -0,0 +1,24 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + labels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.labels" . | nindent 4 }} +data: + config.json: | + { + "nodeconfig": [ + { + "name": "m5-cloudinfra-online02", + "operatingmode": "hami-core", + "devicememoryscaling": 1.8, + "devicesplitcount": 10, + "migstrategy":"none", + "filterdevices": { + "uuid": [], + "index": [] + } + } + ] + } \ No newline at end of file diff --git a/helm/hami/templates/device-plugin/daemonsetnvidia.yaml b/helm/hami/templates/device-plugin/daemonsetnvidia.yaml new file mode 100644 index 0000000..5e0217c --- /dev/null +++ b/helm/hami/templates/device-plugin/daemonsetnvidia.yaml @@ -0,0 +1,166 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + labels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- with .Values.global.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.global.annotations }} + annotations: {{ toYaml .Values.global.annotations | nindent 4}} + {{- end }} +spec: + updateStrategy: + {{- with .Values.devicePlugin.updateStrategy }} + {{- toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/component: hami-device-plugin + hami.io/webhook: ignore + {{- include "hami-vgpu.selectorLabels" . | nindent 8 }} + {{- if .Values.devicePlugin.podAnnotations }} + annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} + {{- end }} + spec: + {{- if .Values.devicePlugin.runtimeClassName }} + runtimeClassName: {{ .Values.devicePlugin.runtimeClassName }} + {{- end }} + {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} + serviceAccountName: {{ include "hami-vgpu.device-plugin" . }} + priorityClassName: system-node-critical + hostPID: true + hostNetwork: true + containers: + - name: device-plugin + image: {{ .Values.devicePlugin.image }}:{{ .Values.version }} + imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} + lifecycle: + postStart: + exec: + command: ["/bin/sh","-c", {{ printf "/k8s-vgpu/bin/vgpu-init.sh %s/vgpu/" .Values.global.gpuHookPath | quote }}] + command: + - nvidia-device-plugin + - --config-file=/device-config.yaml + - --mig-strategy={{ .Values.devicePlugin.migStrategy }} + - --disable-core-limit={{ .Values.devicePlugin.disablecorelimit }} + {{- range .Values.devicePlugin.extraArgs }} + - {{ . 
}} + {{- end }} + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NVIDIA_MIG_MONITOR_DEVICES + value: all + - name: HOOK_PATH + value: {{ .Values.global.gpuHookPath }} + {{- if typeIs "bool" .Values.devicePlugin.passDeviceSpecsEnabled }} + - name: PASS_DEVICE_SPECS + value: {{ .Values.devicePlugin.passDeviceSpecsEnabled | quote }} + {{- end }} + securityContext: + privileged: true + allowPrivilegeEscalation: true + capabilities: + drop: ["ALL"] + add: ["SYS_ADMIN"] + resources: + {{- toYaml .Values.devicePlugin.resources | nindent 12 }} + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: lib + mountPath: {{ printf "%s%s" .Values.global.gpuHookPath "/vgpu" }} + - name: usrbin + mountPath: /usrbin + - name: deviceconfig + mountPath: /config + - name: hosttmp + mountPath: /tmp + - name: device-config + mountPath: /device-config.yaml + subPath: device-config.yaml + - name: vgpu-monitor + image: {{ .Values.devicePlugin.image }}:{{ .Values.version }} + imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} + command: ["vGPUmonitor"] + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + add: ["SYS_ADMIN"] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_MIG_MONITOR_DEVICES + value: all + - name: HOOK_PATH + value: {{ .Values.global.gpuHookPath }}/vgpu + resources: + {{- toYaml .Values.devicePlugin.vgpuMonitor.resources | nindent 12 }} + volumeMounts: + - name: ctrs + mountPath: {{ .Values.devicePlugin.monitorctrPath }} + - name: dockers + mountPath: /run/docker + - name: containerds + mountPath: /run/containerd + - name: sysinfo + mountPath: /sysinfo + - name: hostvar + mountPath: /hostvar + - name: hosttmp + mountPath: /tmp + volumes: + - name: ctrs + hostPath: + path: {{ .Values.devicePlugin.monitorctrPath }} + - name: hosttmp + hostPath: + path: /tmp + - name: dockers + hostPath: + path: /run/docker + - name: containerds + hostPath: + path: /run/containerd + - name: device-plugin + hostPath: + path: {{ .Values.devicePlugin.pluginPath }} + - name: lib + hostPath: + path: {{ .Values.devicePlugin.libPath }} + - name: usrbin + hostPath: + path: /usr/bin + - name: sysinfo + hostPath: + path: /sys + - name: hostvar + hostPath: + path: /var + - name: deviceconfig + configMap: + name: {{ template "hami-vgpu.device-plugin" . }} + - name: device-config + configMap: + name: {{ include "hami-vgpu.scheduler" . }}-device + {{- if .Values.devicePlugin.nvidianodeSelector }} + nodeSelector: {{ toYaml .Values.devicePlugin.nvidianodeSelector | nindent 8 }} + {{- end }} + {{- if .Values.devicePlugin.tolerations }} + tolerations: {{ toYaml .Values.devicePlugin.tolerations | nindent 8 }} + {{- end }} diff --git a/helm/hami/templates/device-plugin/monitorrole.yaml b/helm/hami/templates/device-plugin/monitorrole.yaml new file mode 100644 index 0000000..2c5e8e7 --- /dev/null +++ b/helm/hami/templates/device-plugin/monitorrole.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "hami-vgpu.device-plugin" . 
}}-monitor +rules: + - apiGroups: + - "" + resources: + - pods + verbs: + - get + - create + - watch + - list + - update + - patch + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - update + - list + - patch + + diff --git a/helm/hami/templates/device-plugin/monitorrolebinding.yaml b/helm/hami/templates/device-plugin/monitorrolebinding.yaml new file mode 100644 index 0000000..3d45e3a --- /dev/null +++ b/helm/hami/templates/device-plugin/monitorrolebinding.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + labels: + app.kubernetes.io/component: "hami-device-plugin" + {{- include "hami-vgpu.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + #name: cluster-admin + name: {{ include "hami-vgpu.device-plugin" . }}-monitor +subjects: + - kind: ServiceAccount + name: {{ include "hami-vgpu.device-plugin" . }} + namespace: {{ .Release.Namespace | quote }} diff --git a/helm/hami/templates/device-plugin/monitorservice.yaml b/helm/hami/templates/device-plugin/monitorservice.yaml new file mode 100644 index 0000000..edfc380 --- /dev/null +++ b/helm/hami/templates/device-plugin/monitorservice.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "hami-vgpu.device-plugin" . }}-monitor + labels: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- if .Values.scheduler.service.labels }} + {{ toYaml .Values.scheduler.service.labels | indent 4 }} + {{- end }} + {{- if .Values.scheduler.service.annotations }} + annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} + {{- end }} +spec: + externalTrafficPolicy: Local + selector: + app.kubernetes.io/component: hami-device-plugin + type: NodePort + ports: + - name: monitorport + port: {{ .Values.devicePlugin.service.httpPort }} + targetPort: 9394 + nodePort: {{ .Values.devicePlugin.service.httpPort }} \ No newline at end of file diff --git a/helm/hami/templates/device-plugin/monitorserviceaccount.yaml b/helm/hami/templates/device-plugin/monitorserviceaccount.yaml new file mode 100644 index 0000000..076d9dd --- /dev/null +++ b/helm/hami/templates/device-plugin/monitorserviceaccount.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "hami-vgpu.device-plugin" . }} + namespace: {{ .Release.Namespace | quote }} + labels: + app.kubernetes.io/component: "hami-device-plugin" + {{- include "hami-vgpu.labels" . | nindent 4 }} diff --git a/helm/hami/templates/scheduler/configmap.yaml b/helm/hami/templates/scheduler/configmap.yaml new file mode 100644 index 0000000..b69ee15 --- /dev/null +++ b/helm/hami/templates/scheduler/configmap.yaml @@ -0,0 +1,88 @@ +{{- if .Values.scheduler.kubeScheduler.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} +data: + config.json: | + { + "kind": "Policy", + "apiVersion": "v1", + "extenders": [ + { + "urlPrefix": "https://127.0.0.1:443", + "filterVerb": "filter", + "bindVerb": "bind", + "enableHttps": true, + "weight": 1, + "nodeCacheCapable": true, + "httpTimeout": 30000000000, + "tlsConfig": { + "insecure": true + }, + "managedResources": [ + {{- if .Values.devices.ascend.enabled }} + {{- range .Values.devices.ascend.customresources }} + { + "name": "{{ . 
}}", + "ignoredByScheduler": true + }, + {{- end }} + {{- end }} + {{- if .Values.devices.mthreads.enabled }} + {{- range .Values.devices.mthreads.customresources }} + { + "name": "{{ . }}", + "ignoredByScheduler": true + }, + {{- end }} + {{- end }} + { + "name": "{{ .Values.resourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourceMem }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourceCores }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourceMemPercentage }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.resourcePriority }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.mluResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.dcuResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.dcuResourceMem }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.dcuResourceCores }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.iluvatarResourceName }}", + "ignoredByScheduler": true + } + ], + "ignoreable": false + } + ] + } +{{- end }} diff --git a/helm/hami/templates/scheduler/configmapnew.yaml b/helm/hami/templates/scheduler/configmapnew.yaml new file mode 100644 index 0000000..acc5076 --- /dev/null +++ b/helm/hami/templates/scheduler/configmapnew.yaml @@ -0,0 +1,64 @@ +{{- if .Values.scheduler.kubeScheduler.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.scheduler" . }}-newversion + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} +data: + config.yaml: | + {{- if gt (.Capabilities.KubeVersion.Minor | int) 25}} + apiVersion: kubescheduler.config.k8s.io/v1 + {{- else }} + apiVersion: kubescheduler.config.k8s.io/v1beta2 + {{- end }} + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: false + profiles: + - schedulerName: {{ .Values.schedulerName }} + extenders: + - urlPrefix: "https://127.0.0.1:443" + filterVerb: filter + bindVerb: bind + nodeCacheCapable: true + weight: 1 + httpTimeout: 30s + enableHTTPS: true + tlsConfig: + insecure: true + managedResources: + - name: {{ .Values.resourceName }} + ignoredByScheduler: true + - name: {{ .Values.resourceMem }} + ignoredByScheduler: true + - name: {{ .Values.resourceCores }} + ignoredByScheduler: true + - name: {{ .Values.resourceMemPercentage }} + ignoredByScheduler: true + - name: {{ .Values.resourcePriority }} + ignoredByScheduler: true + - name: {{ .Values.mluResourceName }} + ignoredByScheduler: true + - name: {{ .Values.dcuResourceName }} + ignoredByScheduler: true + - name: {{ .Values.dcuResourceMem }} + ignoredByScheduler: true + - name: {{ .Values.dcuResourceCores }} + ignoredByScheduler: true + - name: {{ .Values.iluvatarResourceName }} + ignoredByScheduler: true + {{- if .Values.devices.ascend.enabled }} + {{- range .Values.devices.ascend.customresources }} + - name: {{ . }} + ignoredByScheduler: true + {{- end }} + {{- end }} + {{- if .Values.devices.mthreads.enabled }} + {{- range .Values.devices.mthreads.customresources }} + - name: {{ . }} + ignoredByScheduler: true + {{- end }} + {{- end }} +{{- end }} diff --git a/helm/hami/templates/scheduler/deployment.yaml b/helm/hami/templates/scheduler/deployment.yaml new file mode 100644 index 0000000..31da049 --- /dev/null +++ b/helm/hami/templates/scheduler/deployment.yaml @@ -0,0 +1,152 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "hami-vgpu.scheduler" . 
}} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- with .Values.global.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if .Values.global.annotations }} + annotations: {{ toYaml .Values.global.annotations | nindent 4}} + {{- end }} +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.selectorLabels" . | nindent 8 }} + hami.io/webhook: ignore + {{- if .Values.scheduler.podAnnotations }} + annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }} + {{- end }} + spec: + {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} + serviceAccountName: {{ include "hami-vgpu.scheduler" . }} + priorityClassName: system-node-critical + containers: + {{- if .Values.scheduler.kubeScheduler.enabled }} + - name: kube-scheduler + image: "{{ .Values.scheduler.kubeScheduler.image }}:{{ include "resolvedKubeSchedulerTag" . }}" + imagePullPolicy: {{ .Values.scheduler.kubeScheduler.imagePullPolicy | quote }} + command: + - kube-scheduler + {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }} + {{- range .Values.scheduler.kubeScheduler.extraNewArgs }} + - {{ . }} + {{- end }} + {{- else }} + - --scheduler-name={{ .Values.schedulerName }} + {{- range .Values.scheduler.kubeScheduler.extraArgs }} + - {{ . }} + {{- end }} + {{- end }} + - --leader-elect={{ .Values.scheduler.leaderElect }} + - --leader-elect-resource-name={{ .Values.schedulerName }} + - --leader-elect-resource-namespace={{ .Release.Namespace }} + resources: + {{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }} + volumeMounts: + - name: scheduler-config + mountPath: /config + {{- end }} + {{- if .Values.scheduler.livenessProbe }} + livenessProbe: + failureThreshold: 8 + httpGet: + path: /healthz + port: 10259 + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 15 + {{- end }} + - name: vgpu-scheduler-extender + image: {{ .Values.scheduler.extender.image }}:{{ .Values.version }} + imagePullPolicy: {{ .Values.scheduler.extender.imagePullPolicy | quote }} + env: + {{- if .Values.global.managedNodeSelectorEnable }} + {{- range $key, $value := .Values.global.managedNodeSelector }} + - name: NODE_SELECTOR_{{ $key | upper | replace "-" "_" }} + value: "{{ $value }}" + {{- end }} + {{- end }} + command: + - scheduler + - --http_bind=0.0.0.0:443 + - --cert_file=/tls/tls.crt + - --key_file=/tls/tls.key + - --scheduler-name={{ .Values.schedulerName }} + - --metrics-bind-address={{ .Values.scheduler.metricsBindAddress }} + - --node-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy }} + - --gpu-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy }} + - --device-config-file=/device-config.yaml + {{- if .Values.devices.ascend.enabled }} + - --enable-ascend=true + {{- end }} + {{- if .Values.scheduler.nodeLabelSelector }} + - --node-label-selector={{- $first := true -}} + {{- range $key, $value := .Values.scheduler.nodeLabelSelector -}} + {{- if not $first }},{{ end -}} + {{- $key }}={{ $value -}} + {{- $first = false -}} + {{- end -}} + {{- end }} + {{- range .Values.scheduler.extender.extraArgs }} + - {{ . 
}} + {{- end }} + ports: + - name: http + containerPort: 443 + protocol: TCP + resources: + {{- toYaml .Values.scheduler.extender.resources | nindent 12 }} + volumeMounts: + - name: tls-config + mountPath: /tls + - name: device-config + mountPath: /device-config.yaml + subPath: device-config.yaml + {{- if .Values.scheduler.livenessProbe }} + livenessProbe: + httpGet: + path: /healthz + port: 443 + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + timeoutSeconds: 5 + {{- end }} + volumes: + - name: tls-config + secret: + secretName: {{ template "hami-vgpu.scheduler.tls" . }} + {{- if .Values.scheduler.kubeScheduler.enabled }} + - name: scheduler-config + configMap: + {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }} + name: {{ template "hami-vgpu.scheduler" . }}-newversion + {{- else }} + name: {{ template "hami-vgpu.scheduler" . }} + {{- end }} + {{- end }} + - name: device-config + configMap: + name: {{ include "hami-vgpu.scheduler" . }}-device + {{- if .Values.scheduler.nodeSelector }} + nodeSelector: {{ toYaml .Values.scheduler.nodeSelector | nindent 8 }} + {{- end }} + {{- if .Values.scheduler.tolerations }} + tolerations: {{ toYaml .Values.scheduler.tolerations | nindent 8 }} + {{- end }} + {{- if .Values.scheduler.nodeName }} + nodeName: {{ .Values.scheduler.nodeName }} + {{- end }} diff --git a/helm/hami/templates/scheduler/device-configmap.yaml b/helm/hami/templates/scheduler/device-configmap.yaml new file mode 100644 index 0000000..12af186 --- /dev/null +++ b/helm/hami/templates/scheduler/device-configmap.yaml @@ -0,0 +1,177 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "hami-vgpu.scheduler" . }}-device + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . 
| nindent 4 }} +data: + device-config.yaml: |- + {{- if .Files.Glob "files/device-config.yaml" }} + {{- .Files.Get "files/device-config.yaml" | nindent 4}} + {{- else }} + nvidia: + resourceCountName: {{ .Values.resourceName }} + resourceMemoryName: {{ .Values.resourceMem }} + resourceMemoryPercentageName: {{ .Values.resourceMemPercentage }} + resourceCoreName: {{ .Values.resourceCores }} + resourcePriorityName: {{ .Values.resourcePriority }} + overwriteEnv: false + defaultMemory: 0 + defaultCores: 0 + defaultGPUNum: 1 + deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }} + deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }} + deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }} + knownMigGeometries: + - models: [ "A30" ] + allowedGeometries: + - + - name: 1g.6gb + memory: 6144 + count: 4 + - + - name: 2g.12gb + memory: 12288 + count: 2 + - + - name: 4g.24gb + memory: 24576 + count: 1 + - models: [ "A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB", "A100-SXM4-40GB" ] + allowedGeometries: + - + - name: 1g.5gb + memory: 5120 + count: 7 + - + - name: 2g.10gb + memory: 10240 + count: 3 + - name: 1g.5gb + memory: 5120 + count: 1 + - + - name: 3g.20gb + memory: 20480 + count: 2 + - + - name: 7g.40gb + memory: 40960 + count: 1 + - models: [ "A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"] + allowedGeometries: + - + - name: 1g.10gb + memory: 10240 + count: 7 + - + - name: 2g.20gb + memory: 20480 + count: 3 + - name: 1g.10gb + memory: 10240 + count: 1 + - + - name: 3g.40gb + memory: 40960 + count: 2 + - + - name: 7g.79gb + memory: 80896 + count: 1 + cambricon: + resourceCountName: {{ .Values.mluResourceName }} + resourceMemoryName: {{ .Values.mluResourceMem }} + resourceCoreName: {{ .Values.mluResourceCores }} + hygon: + resourceCountName: {{ .Values.dcuResourceName }} + resourceMemoryName: {{ .Values.dcuResourceMem }} + resourceCoreName: {{ .Values.dcuResourceCores }} + metax: + resourceCountName: "metax-tech.com/gpu" + mthreads: + resourceCountName: "mthreads.com/vgpu" + resourceMemoryName: "mthreads.com/sgpu-memory" + resourceCoreName: "mthreads.com/sgpu-core" + iluvatar: + resourceCountName: {{ .Values.iluvatarResourceName }} + resourceMemoryName: {{ .Values.iluvatarResourceMem }} + resourceCoreName: {{ .Values.iluvatarResourceCore }} + vnpus: + - chipName: 910B + commonWord: Ascend910A + resourceName: huawei.com/Ascend910A + resourceMemoryName: huawei.com/Ascend910A-memory + memoryAllocatable: 32768 + memoryCapacity: 32768 + aiCore: 30 + templates: + - name: vir02 + memory: 2184 + aiCore: 2 + - name: vir04 + memory: 4369 + aiCore: 4 + - name: vir08 + memory: 8738 + aiCore: 8 + - name: vir16 + memory: 17476 + aiCore: 16 + - chipName: 910B3 + commonWord: Ascend910B + resourceName: huawei.com/Ascend910B + resourceMemoryName: huawei.com/Ascend910B-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_16g + memory: 16384 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_32g + memory: 32768 + aiCore: 10 + aiCPU: 3 + - chipName: 910B4 + commonWord: Ascend910B4 + resourceName: huawei.com/Ascend910B4 + resourceMemoryName: huawei.com/Ascend910B4-memory + memoryAllocatable: 32768 + memoryCapacity: 32768 + aiCore: 20 + aiCPU: 7 + templates: + - name: vir05_1c_8g + memory: 8192 + aiCore: 5 + aiCPU: 1 + - name: vir10_3c_16g + memory: 16384 + aiCore: 10 + aiCPU: 3 + - chipName: 310P3 + commonWord: Ascend310P + resourceName: huawei.com/Ascend310P + resourceMemoryName: huawei.com/Ascend310P-memory + 
memoryAllocatable: 21527 + memoryCapacity: 24576 + aiCore: 8 + aiCPU: 7 + templates: + - name: vir01 + memory: 3072 + aiCore: 1 + aiCPU: 1 + - name: vir02 + memory: 6144 + aiCore: 2 + aiCPU: 2 + - name: vir04 + memory: 12288 + aiCore: 4 + aiCPU: 4 + {{ end }} diff --git a/helm/hami/templates/scheduler/job-patch/clusterrole.yaml b/helm/hami/templates/scheduler/job-patch/clusterrole.yaml new file mode 100644 index 0000000..ef6d986 --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/clusterrole.yaml @@ -0,0 +1,26 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +rules: + - apiGroups: + - admissionregistration.k8s.io + resources: + #- validatingwebhookconfigurations + - mutatingwebhookconfigurations + verbs: + - get + - update +{{- if .Values.podSecurityPolicy.enabled }} + - apiGroups: ['extensions'] + resources: ['podsecuritypolicies'] + verbs: ['use'] + resourceNames: + - {{ include "hami-vgpu.fullname" . }}-admission +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml b/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml new file mode 100644 index 0000000..469419e --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "hami-vgpu.fullname" . }}-admission +subjects: + - kind: ServiceAccount + name: {{ include "hami-vgpu.fullname" . }}-admission + namespace: {{ .Release.Namespace | quote }} diff --git a/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml b/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml new file mode 100644 index 0000000..a066f56 --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission-create + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +spec: + {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} + # Alpha feature since k8s 1.12 + ttlSecondsAfterFinished: 0 + {{- end }} + template: + metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission-create + {{- if .Values.scheduler.patch.podAnnotations }} + annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} + {{- end }} + labels: + {{- include "hami-vgpu.labels" . | nindent 8 }} + app.kubernetes.io/component: admission-webhook + hami.io/webhook: ignore + spec: + {{- include "hami-vgpu.imagePullSecrets" . 
| nindent 6}} + {{- if .Values.scheduler.patch.priorityClassName }} + priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} + {{- end }} + containers: + - name: create + {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }} + image: {{ .Values.scheduler.patch.imageNew }} + {{- else }} + image: {{ .Values.scheduler.patch.image }} + {{- end }} + imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} + args: + - create + - --cert-name=tls.crt + - --key-name=tls.key + {{- if .Values.scheduler.admissionWebhook.customURL.enabled }} + - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}} + {{- else }} + - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }} + {{- end }} + - --namespace={{ .Release.Namespace }} + - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} + restartPolicy: OnFailure + serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission + {{- if .Values.scheduler.patch.nodeSelector }} + nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} + {{- end }} + {{- if .Values.scheduler.patch.tolerations }} + tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} + {{- end }} + securityContext: + runAsNonRoot: true + runAsUser: {{ .Values.scheduler.patch.runAsUser }} diff --git a/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml b/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml new file mode 100644 index 0000000..e6fde7f --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml @@ -0,0 +1,55 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission-patch + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +spec: + {{- if .Capabilities.APIVersions.Has "batch/v1alpha1" }} + # Alpha feature since k8s 1.12 + ttlSecondsAfterFinished: 0 + {{- end }} + template: + metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission-patch + {{- if .Values.scheduler.patch.podAnnotations }} + annotations: {{ toYaml .Values.scheduler.patch.podAnnotations | nindent 8 }} + {{- end }} + labels: + {{- include "hami-vgpu.labels" . | nindent 8 }} + app.kubernetes.io/component: admission-webhook + hami.io/webhook: ignore + spec: + {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} + {{- if .Values.scheduler.patch.priorityClassName }} + priorityClassName: {{ .Values.scheduler.patch.priorityClassName }} + {{- end }} + containers: + - name: patch + {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }} + image: {{ .Values.scheduler.patch.imageNew }} + {{- else }} + image: {{ .Values.scheduler.patch.image }} + {{- end }} + imagePullPolicy: {{ .Values.scheduler.patch.imagePullPolicy }} + args: + - patch + - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }} + - --namespace={{ .Release.Namespace }} + - --patch-validating=false + - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} + restartPolicy: OnFailure + serviceAccountName: {{ include "hami-vgpu.fullname" . 
}}-admission + {{- if .Values.scheduler.patch.nodeSelector }} + nodeSelector: {{ toYaml .Values.scheduler.patch.nodeSelector | nindent 8 }} + {{- end }} + {{- if .Values.scheduler.patch.tolerations }} + tolerations: {{ toYaml .Values.scheduler.patch.tolerations | nindent 8 }} + {{- end }} + securityContext: + runAsNonRoot: true + runAsUser: {{ .Values.scheduler.patch.runAsUser }} diff --git a/helm/hami/templates/scheduler/job-patch/psp.yaml b/helm/hami/templates/scheduler/job-patch/psp.yaml new file mode 100644 index 0000000..5716585 --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/psp.yaml @@ -0,0 +1,36 @@ +{{- if .Values.podSecurityPolicy.enabled }} +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +spec: + allowPrivilegeEscalation: false + fsGroup: + ranges: + - max: 65535 + min: 1 + rule: MustRunAs + requiredDropCapabilities: + - ALL + runAsUser: + rule: MustRunAsNonRoot + seLinux: + rule: RunAsAny + supplementalGroups: + ranges: + - max: 65535 + min: 1 + rule: MustRunAs + volumes: + - configMap + - emptyDir + - projected + - secret + - downwardAPI +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/role.yaml b/helm/hami/templates/scheduler/job-patch/role.yaml new file mode 100644 index 0000000..7a77cbd --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/role.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +rules: + - apiGroups: + - "" + resources: + - secrets + verbs: + - get + - create diff --git a/helm/hami/templates/scheduler/job-patch/rolebinding.yaml b/helm/hami/templates/scheduler/job-patch/rolebinding.yaml new file mode 100644 index 0000000..955ffe8 --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/rolebinding.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "hami-vgpu.fullname" . }}-admission + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "hami-vgpu.fullname" . }}-admission +subjects: + - kind: ServiceAccount + name: {{ include "hami-vgpu.fullname" . }}-admission + namespace: {{ .Release.Namespace | quote }} diff --git a/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml b/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml new file mode 100644 index 0000000..813d2b3 --- /dev/null +++ b/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "hami-vgpu.fullname" . 
}}-admission + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + labels: + {{- include "hami-vgpu.labels" . | nindent 4 }} + app.kubernetes.io/component: admission-webhook diff --git a/helm/hami/templates/scheduler/rolebinding.yaml b/helm/hami/templates/scheduler/rolebinding.yaml new file mode 100644 index 0000000..37f3d86 --- /dev/null +++ b/helm/hami/templates/scheduler/rolebinding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + labels: + app.kubernetes.io/component: "hami-scheduler" + {{- include "hami-vgpu.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-admin +subjects: + - kind: ServiceAccount + name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ .Release.Namespace | quote }} diff --git a/helm/hami/templates/scheduler/service.yaml b/helm/hami/templates/scheduler/service.yaml new file mode 100644 index 0000000..344710e --- /dev/null +++ b/helm/hami/templates/scheduler/service.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} + {{- if .Values.scheduler.service.labels }} + {{ toYaml .Values.scheduler.service.labels | indent 4 }} + {{- end }} + {{- if .Values.scheduler.service.annotations }} + annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} + {{- end }} +spec: + type: NodePort + ports: + - name: http + port: {{ .Values.scheduler.service.httpPort }} + targetPort: 443 + nodePort: {{ .Values.scheduler.service.schedulerPort }} + protocol: TCP + - name: monitor + port: {{ .Values.scheduler.service.monitorPort }} + targetPort: {{ (split ":" (printf "%s" .Values.scheduler.metricsBindAddress))._1 }} + nodePort: {{ .Values.scheduler.service.monitorPort }} + protocol: TCP + selector: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} + diff --git a/helm/hami/templates/scheduler/serviceaccount.yaml b/helm/hami/templates/scheduler/serviceaccount.yaml new file mode 100644 index 0000000..c9d129d --- /dev/null +++ b/helm/hami/templates/scheduler/serviceaccount.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ .Release.Namespace | quote }} + labels: + app.kubernetes.io/component: "hami-scheduler" + {{- include "hami-vgpu.labels" . | nindent 4 }} diff --git a/helm/hami/templates/scheduler/webhook.yaml b/helm/hami/templates/scheduler/webhook.yaml new file mode 100644 index 0000000..d39f236 --- /dev/null +++ b/helm/hami/templates/scheduler/webhook.yaml @@ -0,0 +1,51 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ include "hami-vgpu.scheduler.webhook" . }} +webhooks: + - admissionReviewVersions: + - v1beta1 + clientConfig: + {{- if .Values.scheduler.admissionWebhook.customURL.enabled }} + url: https://{{ .Values.scheduler.admissionWebhook.customURL.host}}:{{.Values.scheduler.admissionWebhook.customURL.port}}{{.Values.scheduler.admissionWebhook.customURL.path}} + {{- else }} + service: + name: {{ include "hami-vgpu.scheduler" . 
}}
+        namespace: {{ .Release.Namespace }}
+        path: /webhook
+        port: {{ .Values.scheduler.service.httpPort }}
+      {{- end }}
+    failurePolicy: {{ .Values.scheduler.admissionWebhook.failurePolicy }}
+    matchPolicy: Equivalent
+    name: vgpu.hami.io
+    namespaceSelector:
+      matchExpressions:
+        - key: hami.io/webhook
+          operator: NotIn
+          values:
+            - ignore
+        {{- if .Values.scheduler.admissionWebhook.whitelistNamespaces }}
+        - key: kubernetes.io/metadata.name
+          operator: NotIn
+          values:
+          {{- toYaml .Values.scheduler.admissionWebhook.whitelistNamespaces | nindent 10 }}
+        {{- end }}
+    objectSelector:
+      matchExpressions:
+        - key: hami.io/webhook
+          operator: NotIn
+          values:
+            - ignore
+    reinvocationPolicy: {{ .Values.scheduler.admissionWebhook.reinvocationPolicy }}
+    rules:
+      - apiGroups:
+          - ""
+        apiVersions:
+          - v1
+        operations:
+          - CREATE
+        resources:
+          - pods
+        scope: '*'
+    sideEffects: None
+    timeoutSeconds: 10
diff --git a/helm/hami/values.yaml b/helm/hami/values.yaml
new file mode 100644
index 0000000..a85ebc7
--- /dev/null
+++ b/helm/hami/values.yaml
@@ -0,0 +1,204 @@
+# Default values for hami-vgpu.
+
+nameOverride: ""
+fullnameOverride: ""
+imagePullSecrets: [ ]
+version: "v2.5.0"
+
+#Nvidia GPU Parameters
+resourceName: "nvidia.com/gpu"
+resourceMem: "nvidia.com/gpumem"
+resourceMemPercentage: "nvidia.com/gpumem-percentage"
+resourceCores: "nvidia.com/gpucores"
+resourcePriority: "nvidia.com/priority"
+
+#MLU Parameters
+mluResourceName: "cambricon.com/vmlu"
+mluResourceMem: "cambricon.com/mlu.smlu.vmemory"
+mluResourceCores: "cambricon.com/mlu.smlu.vcore"
+
+#Hygon DCU Parameters
+dcuResourceName: "hygon.com/dcunum"
+dcuResourceMem: "hygon.com/dcumem"
+dcuResourceCores: "hygon.com/dcucores"
+
+#Iluvatar GPU Parameters
+iluvatarResourceName: "iluvatar.ai/vgpu"
+iluvatarResourceMem: "iluvatar.ai/vcuda-memory"
+iluvatarResourceCore: "iluvatar.ai/vcuda-core"
+
+schedulerName: "hami-scheduler"
+
+podSecurityPolicy:
+  enabled: false
+
+global:
+  gpuHookPath: /usr/local
+  labels: {}
+  annotations: {}
+  managedNodeSelectorEnable: false
+  managedNodeSelector:
+    usage: "gpu"
+
+
+scheduler:
+  # @param nodeName pins the hami-scheduler pod to a specific node.
+  # If the hami-scheduler is installed as the default scheduler, the k8s default scheduler pod
+  # must be removed from the cluster first, so nodeName must be specified to bypass the normal scheduling workflow.
+  nodeName: ""
+  #nodeLabelSelector:
+  #  "gpu": "on"
+  overwriteEnv: "false"
+  defaultSchedulerPolicy:
+    nodeSchedulerPolicy: binpack
+    gpuSchedulerPolicy: spread
+  metricsBindAddress: ":9395"
+  livenessProbe: false
+  leaderElect: true
+  kubeScheduler:
+    # @param enabled indicates whether to run the kube-scheduler container in the scheduler pod; it is true by default.
+    enabled: true
+    image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-scheduler
+    imageTag: ""
+    imagePullPolicy: IfNotPresent
+    resources: {}
+    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
+    # and remove the curly braces after 'resources:'.
+#  limits:
+#    cpu: 1000m
+#    memory: 1000Mi
+#  requests:
+#    cpu: 100m
+#    memory: 100Mi
+  extraNewArgs:
+    - --config=/config/config.yaml
+    - -v=4
+  extraArgs:
+    - --policy-config-file=/config/config.json
+    - -v=4
+  extender:
+    image: "projecthami/hami"
+    imagePullPolicy: IfNotPresent
+    resources: {}
+    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
+    # and remove the curly braces after 'resources:'.
+#  limits:
+#    cpu: 1000m
+#    memory: 1000Mi
+#  requests:
+#    cpu: 100m
+#    memory: 100Mi
+    extraArgs:
+      - --debug
+      - -v=4
+  podAnnotations: {}
+  tolerations: []
+  #serviceAccountName: "hami-vgpu-scheduler-sa"
+  admissionWebhook:
+    customURL:
+      enabled: false
+      # must be an endpoint using https.
+      # should generate host certs here
+      host: 127.0.0.1 # hostname or IP; can be your node's IP if you want to use https://<node IP>:<port>/<path>
+      port: 31998
+      path: /webhook
+    whitelistNamespaces:
+      # Specify the namespaces that the webhook will not be applied to.
+      # - default
+      # - kube-system
+      # - istio-system
+    reinvocationPolicy: Never
+    failurePolicy: Ignore
+  patch:
+    image: docker.io/jettech/kube-webhook-certgen:v1.5.2
+    imageNew: liangjw/kube-webhook-certgen:v1.1.1
+    imagePullPolicy: IfNotPresent
+    priorityClassName: ""
+    podAnnotations: {}
+    nodeSelector: {}
+    tolerations: []
+    runAsUser: 2000
+  service:
+    httpPort: 443
+    schedulerPort: 31998
+    monitorPort: 31993
+    labels: {}
+    annotations: {}
+
+devicePlugin:
+  image: "projecthami/hami"
+  monitorimage: "projecthami/hami"
+  monitorctrPath: /usr/local/vgpu/containers
+  imagePullPolicy: IfNotPresent
+  deviceSplitCount: 40
+  deviceMemoryScaling: 1
+  deviceCoreScaling: 1
+  runtimeClassName: ""
+  migStrategy: "none"
+  disablecorelimit: "false"
+  passDeviceSpecsEnabled: true
+  extraArgs:
+    - -v=4
+
+  service:
+    httpPort: 31992
+
+  pluginPath: /var/lib/kubelet/device-plugins
+  libPath: /usr/local/vgpu
+
+  podAnnotations: {}
+  nvidianodeSelector:
+    gpu: "on"
+  tolerations: []
+  # The updateStrategy for the DevicePlugin DaemonSet.
+  # If you want to update the DaemonSet manually, set type to "OnDelete".
+  # We recommend the OnDelete update strategy, because restarting a DevicePlugin pod also restarts business pods, which is destructive.
+  # Otherwise, you can use the RollingUpdate strategy to roll out DevicePlugin pods gradually.
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+
+  resources: {}
+  # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
+  # and remove the curly braces after 'resources:'.
+#  limits:
+#    cpu: 1000m
+#    memory: 1000Mi
+#  requests:
+#    cpu: 100m
+#    memory: 100Mi
+
+  vgpuMonitor:
+    resources: {}
+    # If you do want to specify resources, uncomment the following lines, adjust them as necessary,
+    # and remove the curly braces after 'resources:'.
+#  limits:
+#    cpu: 1000m
+#    memory: 1000Mi
+#  requests:
+#    cpu: 100m
+#    memory: 100Mi
+
+devices:
+  mthreads:
+    enabled: false
+    customresources:
+      - mthreads.com/vgpu
+  ascend:
+    enabled: false
+    image: ""
+    imagePullPolicy: IfNotPresent
+    extraArgs: []
+    nodeSelector:
+      ascend: "on"
+    tolerations: []
+    customresources:
+      - huawei.com/Ascend910A
+      - huawei.com/Ascend910A-memory
+      - huawei.com/Ascend910B
+      - huawei.com/Ascend910B-memory
+      - huawei.com/Ascend910B4
+      - huawei.com/Ascend910B4-memory
+      - huawei.com/Ascend310P
+      - huawei.com/Ascend310P-memory
diff --git a/helm/helm.sh b/helm/helm.sh
new file mode 100755
index 0000000..86b46ed
--- /dev/null
+++ b/helm/helm.sh
@@ -0,0 +1 @@
+helm upgrade --install hami ./hami/ --namespace kube-system
diff --git a/libnvidia-container-tools-1.15.0-1.x86_64.rpm b/libnvidia-container-tools-1.15.0-1.x86_64.rpm
new file mode 100644
index 0000000..8906a09
Binary files /dev/null and b/libnvidia-container-tools-1.15.0-1.x86_64.rpm differ
diff --git a/libnvidia-container1-1.15.0-1.x86_64.rpm b/libnvidia-container1-1.15.0-1.x86_64.rpm
new file mode 100644
index 0000000..0d74fc1
Binary files /dev/null and b/libnvidia-container1-1.15.0-1.x86_64.rpm differ
diff --git a/nvidia-container-toolkit-1.15.0-1.x86_64.rpm b/nvidia-container-toolkit-1.15.0-1.x86_64.rpm
new file mode 100644
index 0000000..369d6b2
Binary files /dev/null and b/nvidia-container-toolkit-1.15.0-1.x86_64.rpm differ
diff --git a/nvidia-container-toolkit-base-1.15.0-1.x86_64.rpm b/nvidia-container-toolkit-base-1.15.0-1.x86_64.rpm
new file mode 100644
index 0000000..f51abbb
Binary files /dev/null and b/nvidia-container-toolkit-base-1.15.0-1.x86_64.rpm differ
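
Deployment note: a minimal sketch of how the chart added above is typically installed, assuming helm and kubectl are already configured against the target cluster. The gpu=on label matches the chart's default devicePlugin.nvidianodeSelector, and the release name, chart path, and namespace follow helm/helm.sh; <your-gpu-node> is a placeholder for an actual node name.

    # Label each NVIDIA GPU node so the device-plugin DaemonSet schedules onto it
    # (values.yaml defaults nvidianodeSelector to gpu: "on").
    kubectl label node <your-gpu-node> gpu=on

    # Install or upgrade the release; --install makes the command work on first run too.
    helm upgrade --install hami ./hami/ --namespace kube-system

    # Confirm the scheduler Deployment and device-plugin DaemonSet pods are running
    # (the chart's selectorLabels set app.kubernetes.io/instance to the release name).
    kubectl get pods -n kube-system -l app.kubernetes.io/instance=hami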