diff --git a/README.md b/README.md index c19f84c..416eb27 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,8 @@ 在master节点上运行: ``` - # nerdctl -nk8s.io load -i hami251.tar - # nerdctl -nk8s.io push sealos.hub:5000/projecthami/hami:v2.5.1 + # nerdctl -nk8s.io load -i hami260.tar + # nerdctl -nk8s.io push sealos.hub:5000/projecthami/hami:v2.6.0 # nerdctl -nk8s.io load -i scheduler1-28-9.tar # nerdctl -nk8s.io push sealos.hub:5000/google_containers/kube-scheduler:v1.28.9 # nerdctl -nk8s.io load -i kube-webhook-certgen.tar diff --git a/hami251.tar b/hami260.tar similarity index 92% rename from hami251.tar rename to hami260.tar index 4910c6c..03c16cd 100644 Binary files a/hami251.tar and b/hami260.tar differ diff --git a/helm/hami/Chart.yaml b/helm/hami/Chart.yaml index 10811a7..203fd47 100644 --- a/helm/hami/Chart.yaml +++ b/helm/hami/Chart.yaml @@ -1,15 +1,15 @@ apiVersion: v2 -appVersion: 2.5.1 +appVersion: 2.6.0 description: Heterogeneous AI Computing Virtualization Middleware keywords: - vgpu - gpu -kubeVersion: '>= 1.18.0' +kubeVersion: '>= 1.18.0-0' maintainers: -- email: limengxuan@4paradigm.com +- email: archlitchi@gmail.com name: limengxuan - email: xiaozhang0210@hotmail.com name: zhangxiao name: hami type: application -version: 2.5.1 +version: 2.6.0 diff --git a/helm/hami/templates/_helpers.tpl b/helm/hami/templates/_helpers.tpl index 9fd4c14..38479ff 100644 --- a/helm/hami/templates/_helpers.tpl +++ b/helm/hami/templates/_helpers.tpl @@ -23,6 +23,17 @@ If release name contains chart name it will be used as a full name. {{- end -}} {{- end -}} +{{/* +Allow the release namespace to be overridden for multi-namespace deployments in combined charts +*/}} +{{- define "hami-vgpu.namespace" -}} + {{- if .Values.namespaceOverride -}} + {{- .Values.namespaceOverride -}} + {{- else -}} + {{- .Release.Namespace -}} + {{- end -}} +{{- end -}} + {{/* The app name for Scheduler */}} diff --git a/helm/hami/templates/device-plugin/configmap.yaml b/helm/hami/templates/device-plugin/configmap.yaml index 4842739..3424eab 100644 --- a/helm/hami/templates/device-plugin/configmap.yaml +++ b/helm/hami/templates/device-plugin/configmap.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ include "hami-vgpu.device-plugin" . }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-device-plugin {{- include "hami-vgpu.labels" . | nindent 4 }} diff --git a/helm/hami/templates/device-plugin/daemonsetnvidia.yaml b/helm/hami/templates/device-plugin/daemonsetnvidia.yaml index 5e0217c..f2209e4 100644 --- a/helm/hami/templates/device-plugin/daemonsetnvidia.yaml +++ b/helm/hami/templates/device-plugin/daemonsetnvidia.yaml @@ -2,6 +2,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: {{ include "hami-vgpu.device-plugin" . }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-device-plugin {{- include "hami-vgpu.labels" . | nindent 4 }} @@ -26,8 +27,11 @@ spec: app.kubernetes.io/component: hami-device-plugin hami.io/webhook: ignore {{- include "hami-vgpu.selectorLabels" . | nindent 8 }} + annotations: + checksum/hami-device-plugin-config: {{ include (print $.Template.BasePath "/device-plugin/configmap.yaml") . | sha256sum }} + checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }} {{- if .Values.devicePlugin.podAnnotations }} - annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} + {{- toYaml .Values.devicePlugin.podAnnotations | nindent 8 }} {{- end }} spec: {{- if .Values.devicePlugin.runtimeClassName }} @@ -92,7 +96,11 @@ spec: - name: vgpu-monitor image: {{ .Values.devicePlugin.image }}:{{ .Values.version }} imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }} - command: ["vGPUmonitor"] + command: + - "vGPUmonitor" + {{- range .Values.devicePlugin.extraArgs }} + - {{ . }} + {{- end }} securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/helm/hami/templates/device-plugin/monitorrolebinding.yaml b/helm/hami/templates/device-plugin/monitorrolebinding.yaml index 3d45e3a..7eb1eb2 100644 --- a/helm/hami/templates/device-plugin/monitorrolebinding.yaml +++ b/helm/hami/templates/device-plugin/monitorrolebinding.yaml @@ -13,4 +13,4 @@ roleRef: subjects: - kind: ServiceAccount name: {{ include "hami-vgpu.device-plugin" . }} - namespace: {{ .Release.Namespace | quote }} + namespace: {{ include "hami-vgpu.namespace" . }} diff --git a/helm/hami/templates/device-plugin/monitorservice.yaml b/helm/hami/templates/device-plugin/monitorservice.yaml index edfc380..9a3a9db 100644 --- a/helm/hami/templates/device-plugin/monitorservice.yaml +++ b/helm/hami/templates/device-plugin/monitorservice.yaml @@ -2,22 +2,26 @@ apiVersion: v1 kind: Service metadata: name: {{ include "hami-vgpu.device-plugin" . }}-monitor + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-device-plugin {{- include "hami-vgpu.labels" . | nindent 4 }} - {{- if .Values.scheduler.service.labels }} - {{ toYaml .Values.scheduler.service.labels | indent 4 }} + {{- if .Values.devicePlugin.service.labels }} # Use devicePlugin instead of scheduler + {{ toYaml .Values.devicePlugin.service.labels | indent 4 }} {{- end }} - {{- if .Values.scheduler.service.annotations }} - annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} + {{- if .Values.devicePlugin.service.annotations }} # Use devicePlugin instead of scheduler + annotations: {{ toYaml .Values.devicePlugin.service.annotations | nindent 4 }} {{- end }} spec: - externalTrafficPolicy: Local - selector: - app.kubernetes.io/component: hami-device-plugin - type: NodePort + type: {{ .Values.devicePlugin.service.type | default "NodePort" }} # Default type is NodePort ports: - name: monitorport - port: {{ .Values.devicePlugin.service.httpPort }} + port: {{ .Values.devicePlugin.service.httpPort | default 31992 }} # Default HTTP port is 31992 targetPort: 9394 - nodePort: {{ .Values.devicePlugin.service.httpPort }} \ No newline at end of file + {{- if eq (.Values.devicePlugin.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort + nodePort: {{ .Values.devicePlugin.service.httpPort | default 31992 }} + {{- end }} + protocol: TCP + selector: + app.kubernetes.io/component: hami-device-plugin + {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} \ No newline at end of file diff --git a/helm/hami/templates/device-plugin/monitorserviceaccount.yaml b/helm/hami/templates/device-plugin/monitorserviceaccount.yaml index 076d9dd..8c3c0b4 100644 --- a/helm/hami/templates/device-plugin/monitorserviceaccount.yaml +++ b/helm/hami/templates/device-plugin/monitorserviceaccount.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: {{ include "hami-vgpu.device-plugin" . }} - namespace: {{ .Release.Namespace | quote }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: "hami-device-plugin" {{- include "hami-vgpu.labels" . | nindent 4 }} diff --git a/helm/hami/templates/device-plugin/runtime-class.yaml b/helm/hami/templates/device-plugin/runtime-class.yaml new file mode 100644 index 0000000..7d50120 --- /dev/null +++ b/helm/hami/templates/device-plugin/runtime-class.yaml @@ -0,0 +1,9 @@ +{{- if and .Values.devicePlugin.createRuntimeClass .Values.devicePlugin.runtimeClassName }} +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: {{ .Values.devicePlugin.runtimeClassName }} + annotations: + helm.sh/hook: pre-install,pre-upgrade +handler: nvidia +{{- end }} diff --git a/helm/hami/templates/scheduler/certmanager.yaml b/helm/hami/templates/scheduler/certmanager.yaml new file mode 100644 index 0000000..9de0c48 --- /dev/null +++ b/helm/hami/templates/scheduler/certmanager.yaml @@ -0,0 +1,29 @@ +{{- if .Values.scheduler.certManager.enabled }} +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "hami-vgpu.scheduler" . }}-serving-cert + namespace: {{ include "hami-vgpu.namespace" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} +spec: + dnsNames: + - {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . }}.svc + - {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . }}.svc.cluster.local + issuerRef: + kind: Issuer + name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer + secretName: {{ include "hami-vgpu.scheduler.tls" . }} +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer + namespace: {{ include "hami-vgpu.namespace" . }} + labels: + app.kubernetes.io/component: hami-scheduler + {{- include "hami-vgpu.labels" . | nindent 4 }} +spec: + selfSigned: {} +{{- end }} diff --git a/helm/hami/templates/scheduler/configmap.yaml b/helm/hami/templates/scheduler/configmap.yaml index b69ee15..688c0f0 100644 --- a/helm/hami/templates/scheduler/configmap.yaml +++ b/helm/hami/templates/scheduler/configmap.yaml @@ -3,6 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-scheduler {{- include "hami-vgpu.labels" . | nindent 4 }} @@ -40,6 +41,14 @@ data: }, {{- end }} {{- end }} + {{- if .Values.devices.enflame.enabled }} + {{- range .Values.devices.enflame.customresources }} + { + "name": "{{ . }}", + "ignoredByScheduler": true + }, + {{- end }} + {{- end }} { "name": "{{ .Values.resourceName }}", "ignoredByScheduler": true @@ -79,6 +88,22 @@ data: { "name": "{{ .Values.iluvatarResourceName }}", "ignoredByScheduler": true + }, + { + "name": "metax-tech.com/gpu", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.metaxResourceName }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.metaxResourceCore }}", + "ignoredByScheduler": true + }, + { + "name": "{{ .Values.metaxResourceMem }}", + "ignoredByScheduler": true } ], "ignoreable": false diff --git a/helm/hami/templates/scheduler/configmapnew.yaml b/helm/hami/templates/scheduler/configmapnew.yaml index b953cff..fb3a6ee 100644 --- a/helm/hami/templates/scheduler/configmapnew.yaml +++ b/helm/hami/templates/scheduler/configmapnew.yaml @@ -3,6 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ include "hami-vgpu.scheduler" . }}-newversion + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-scheduler {{- include "hami-vgpu.labels" . | nindent 4 }} @@ -49,6 +50,14 @@ data: ignoredByScheduler: true - name: {{ .Values.iluvatarResourceName }} ignoredByScheduler: true + - name: "metax-tech.com/gpu" + ignoredByScheduler: true + - name: {{ .Values.metaxResourceName }} + ignoredByScheduler: true + - name: {{ .Values.metaxResourceCore }} + ignoredByScheduler: true + - name: {{ .Values.metaxResourceMem }} + ignoredByScheduler: true {{- if .Values.devices.ascend.enabled }} {{- range .Values.devices.ascend.customresources }} - name: {{ . }} @@ -61,4 +70,10 @@ data: ignoredByScheduler: true {{- end }} {{- end }} + {{- if .Values.devices.enflame.enabled }} + {{- range .Values.devices.enflame.customresources }} + - name: {{ . }} + ignoredByScheduler: true + {{- end }} + {{- end }} {{- end }} diff --git a/helm/hami/templates/scheduler/deployment.yaml b/helm/hami/templates/scheduler/deployment.yaml index 326f4a6..f31ef82 100644 --- a/helm/hami/templates/scheduler/deployment.yaml +++ b/helm/hami/templates/scheduler/deployment.yaml @@ -2,6 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-scheduler {{- include "hami-vgpu.labels" . | nindent 4 }} @@ -12,7 +13,11 @@ metadata: annotations: {{ toYaml .Values.global.annotations | nindent 4}} {{- end }} spec: + {{- if .Values.scheduler.leaderElect }} + replicas: {{ .Values.scheduler.replicas }} + {{- else }} replicas: 1 + {{- end }} selector: matchLabels: app.kubernetes.io/component: hami-scheduler @@ -23,8 +28,15 @@ spec: app.kubernetes.io/component: hami-scheduler {{- include "hami-vgpu.selectorLabels" . | nindent 8 }} hami.io/webhook: ignore + annotations: + {{- if ge (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 22 }} + checksum/hami-scheduler-newversion-config: {{ include (print $.Template.BasePath "/scheduler/configmapnew.yaml") . | sha256sum }} + {{- else }} + checksum/hami-scheduler-config: {{ include (print $.Template.BasePath "/scheduler/configmap.yaml") . | sha256sum }} + {{- end }} + checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }} {{- if .Values.scheduler.podAnnotations }} - annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }} + {{- toYaml .Values.scheduler.podAnnotations | nindent 8 }} {{- end }} spec: {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}} @@ -49,7 +61,7 @@ spec: {{- end }} - --leader-elect={{ .Values.scheduler.leaderElect }} - --leader-elect-resource-name={{ .Values.schedulerName }} - - --leader-elect-resource-namespace={{ .Release.Namespace }} + - --leader-elect-resource-namespace={{ include "hami-vgpu.namespace" . }} resources: {{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }} volumeMounts: diff --git a/helm/hami/templates/scheduler/device-configmap.yaml b/helm/hami/templates/scheduler/device-configmap.yaml index 12af186..20fd8d7 100644 --- a/helm/hami/templates/scheduler/device-configmap.yaml +++ b/helm/hami/templates/scheduler/device-configmap.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ include "hami-vgpu.scheduler" . }}-device + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-scheduler {{- include "hami-vgpu.labels" . | nindent 4 }} @@ -23,6 +24,9 @@ data: deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }} deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }} deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }} + gpuCorePolicy: {{ .Values.devices.nvidia.gpuCorePolicy }} + libCudaLogLevel: {{ .Values.devices.nvidia.libCudaLogLevel }} + runtimeClassName: "{{ .Values.devicePlugin.runtimeClassName }}" knownMigGeometries: - models: [ "A30" ] allowedGeometries: @@ -90,6 +94,12 @@ data: resourceCoreName: {{ .Values.dcuResourceCores }} metax: resourceCountName: "metax-tech.com/gpu" + resourceVCountName: {{ .Values.metaxResourceName }} + resourceVMemoryName: {{ .Values.metaxResourceMem }} + resourceVCoreName: {{ .Values.metaxResourceCore }} + enflame: + resourceCountName: "enflame.com/vgcu" + resourcePercentageName: "enflame.com/vgcu-percentage" mthreads: resourceCountName: "mthreads.com/vgpu" resourceMemoryName: "mthreads.com/sgpu-memory" @@ -119,6 +129,27 @@ data: - name: vir16 memory: 17476 aiCore: 16 + - chipName: 910B2 + commonWord: Ascend910B2 + resourceName: huawei.com/Ascend910B2 + resourceMemoryName: huawei.com/Ascend910B2-memory + memoryAllocatable: 65536 + memoryCapacity: 65536 + aiCore: 24 + aiCPU: 6 + templates: + - name: vir03_1c_8g + memory: 8192 + aiCore: 3 + aiCPU: 1 + - name: vir06_1c_16g + memory: 16384 + aiCore: 6 + aiCPU: 1 + - name: vir12_3c_32g + memory: 32768 + aiCore: 12 + aiCPU: 3 - chipName: 910B3 commonWord: Ascend910B resourceName: huawei.com/Ascend910B diff --git a/helm/hami/templates/scheduler/job-patch/clusterrole.yaml b/helm/hami/templates/scheduler/job-patch/clusterrole.yaml index ef6d986..3c04b51 100644 --- a/helm/hami/templates/scheduler/job-patch/clusterrole.yaml +++ b/helm/hami/templates/scheduler/job-patch/clusterrole.yaml @@ -1,3 +1,4 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -24,3 +25,4 @@ rules: resourceNames: - {{ include "hami-vgpu.fullname" . }}-admission {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml b/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml index 469419e..56ecdf5 100644 --- a/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml +++ b/helm/hami/templates/scheduler/job-patch/clusterrolebinding.yaml @@ -1,3 +1,4 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -15,4 +16,5 @@ roleRef: subjects: - kind: ServiceAccount name: {{ include "hami-vgpu.fullname" . }}-admission - namespace: {{ .Release.Namespace | quote }} + namespace: {{ include "hami-vgpu.namespace" . }} +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml b/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml index b375de9..645a407 100644 --- a/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml +++ b/helm/hami/templates/scheduler/job-patch/job-createSecret.yaml @@ -1,7 +1,9 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: batch/v1 kind: Job metadata: name: {{ include "hami-vgpu.fullname" . }}-admission-create + namespace: {{ include "hami-vgpu.namespace" . }} annotations: "helm.sh/hook": pre-install,pre-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded @@ -41,11 +43,11 @@ spec: - --cert-name=tls.crt - --key-name=tls.key {{- if .Values.scheduler.admissionWebhook.customURL.enabled }} - - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}} + - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) .Values.scheduler.admissionWebhook.customURL.host}} {{- else }} - - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }} + - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) }} {{- end }} - - --namespace={{ .Release.Namespace }} + - --namespace={{ include "hami-vgpu.namespace" . }} - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} restartPolicy: OnFailure serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission @@ -58,3 +60,4 @@ spec: securityContext: runAsNonRoot: true runAsUser: {{ .Values.scheduler.patch.runAsUser }} +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml b/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml index 5567f2b..5432daf 100644 --- a/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml +++ b/helm/hami/templates/scheduler/job-patch/job-patchWebhook.yaml @@ -1,7 +1,9 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: batch/v1 kind: Job metadata: name: {{ include "hami-vgpu.fullname" . }}-admission-patch + namespace: {{ include "hami-vgpu.namespace" . }} annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded @@ -39,7 +41,7 @@ spec: args: - patch - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }} - - --namespace={{ .Release.Namespace }} + - --namespace={{ include "hami-vgpu.namespace" . }} - --patch-validating=false - --secret-name={{ include "hami-vgpu.scheduler.tls" . }} restartPolicy: OnFailure @@ -53,3 +55,4 @@ spec: securityContext: runAsNonRoot: true runAsUser: {{ .Values.scheduler.patch.runAsUser }} +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/psp.yaml b/helm/hami/templates/scheduler/job-patch/psp.yaml index 5716585..a2b02d7 100644 --- a/helm/hami/templates/scheduler/job-patch/psp.yaml +++ b/helm/hami/templates/scheduler/job-patch/psp.yaml @@ -1,3 +1,4 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} {{- if .Values.podSecurityPolicy.enabled }} apiVersion: policy/v1beta1 kind: PodSecurityPolicy @@ -34,3 +35,4 @@ spec: - secret - downwardAPI {{- end }} +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/role.yaml b/helm/hami/templates/scheduler/job-patch/role.yaml index 7a77cbd..f054ee0 100644 --- a/helm/hami/templates/scheduler/job-patch/role.yaml +++ b/helm/hami/templates/scheduler/job-patch/role.yaml @@ -1,7 +1,9 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: {{ include "hami-vgpu.fullname" . }}-admission + namespace: {{ include "hami-vgpu.namespace" . }} annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded @@ -16,3 +18,4 @@ rules: verbs: - get - create +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/rolebinding.yaml b/helm/hami/templates/scheduler/job-patch/rolebinding.yaml index 955ffe8..9ca5ba2 100644 --- a/helm/hami/templates/scheduler/job-patch/rolebinding.yaml +++ b/helm/hami/templates/scheduler/job-patch/rolebinding.yaml @@ -1,7 +1,9 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: {{ include "hami-vgpu.fullname" . }}-admission + namespace: {{ include "hami-vgpu.namespace" . }} annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded @@ -15,4 +17,5 @@ roleRef: subjects: - kind: ServiceAccount name: {{ include "hami-vgpu.fullname" . }}-admission - namespace: {{ .Release.Namespace | quote }} + namespace: {{ include "hami-vgpu.namespace" . }} +{{- end }} diff --git a/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml b/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml index 813d2b3..a220308 100644 --- a/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml +++ b/helm/hami/templates/scheduler/job-patch/serviceaccount.yaml @@ -1,10 +1,13 @@ +{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }} apiVersion: v1 kind: ServiceAccount metadata: name: {{ include "hami-vgpu.fullname" . }}-admission + namespace: {{ include "hami-vgpu.namespace" . }} annotations: "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded labels: {{- include "hami-vgpu.labels" . | nindent 4 }} app.kubernetes.io/component: admission-webhook +{{- end }} diff --git a/helm/hami/templates/scheduler/rolebinding.yaml b/helm/hami/templates/scheduler/rolebinding.yaml index 37f3d86..1ba6a51 100644 --- a/helm/hami/templates/scheduler/rolebinding.yaml +++ b/helm/hami/templates/scheduler/rolebinding.yaml @@ -12,4 +12,4 @@ roleRef: subjects: - kind: ServiceAccount name: {{ include "hami-vgpu.scheduler" . }} - namespace: {{ .Release.Namespace | quote }} + namespace: {{ include "hami-vgpu.namespace" . }} diff --git a/helm/hami/templates/scheduler/service.yaml b/helm/hami/templates/scheduler/service.yaml index 344710e..70378c5 100644 --- a/helm/hami/templates/scheduler/service.yaml +++ b/helm/hami/templates/scheduler/service.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Service metadata: name: {{ include "hami-vgpu.scheduler" . }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: hami-scheduler {{- include "hami-vgpu.labels" . | nindent 4 }} @@ -12,19 +13,22 @@ metadata: annotations: {{ toYaml .Values.scheduler.service.annotations | nindent 4 }} {{- end }} spec: - type: NodePort + type: {{ .Values.scheduler.service.type | default "NodePort" }} # Default type is NodePort ports: - name: http - port: {{ .Values.scheduler.service.httpPort }} - targetPort: 443 - nodePort: {{ .Values.scheduler.service.schedulerPort }} + port: {{ .Values.scheduler.service.httpPort | default 443 }} # Default HTTP port is 443 + targetPort: {{ .Values.scheduler.service.httpTargetPort | default 443 }} + {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort + nodePort: {{ .Values.scheduler.service.schedulerPort | default 31998 }} + {{- end }} protocol: TCP - name: monitor - port: {{ .Values.scheduler.service.monitorPort }} - targetPort: {{ (split ":" (printf "%s" .Values.scheduler.metricsBindAddress))._1 }} - nodePort: {{ .Values.scheduler.service.monitorPort }} + port: {{ .Values.scheduler.service.monitorPort | default 31993 }} # Default monitoring port is 31993 + targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 9395 }} + {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort + nodePort: {{ .Values.scheduler.service.monitorPort | default 31993 }} + {{- end }} protocol: TCP selector: app.kubernetes.io/component: hami-scheduler - {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} - + {{- include "hami-vgpu.selectorLabels" . | nindent 4 }} \ No newline at end of file diff --git a/helm/hami/templates/scheduler/serviceaccount.yaml b/helm/hami/templates/scheduler/serviceaccount.yaml index c9d129d..f20a6e6 100644 --- a/helm/hami/templates/scheduler/serviceaccount.yaml +++ b/helm/hami/templates/scheduler/serviceaccount.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: {{ include "hami-vgpu.scheduler" . }} - namespace: {{ .Release.Namespace | quote }} + namespace: {{ include "hami-vgpu.namespace" . }} labels: app.kubernetes.io/component: "hami-scheduler" {{- include "hami-vgpu.labels" . | nindent 4 }} diff --git a/helm/hami/templates/scheduler/webhook.yaml b/helm/hami/templates/scheduler/webhook.yaml index d39f236..aca4306 100644 --- a/helm/hami/templates/scheduler/webhook.yaml +++ b/helm/hami/templates/scheduler/webhook.yaml @@ -1,6 +1,10 @@ apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration metadata: + {{- if .Values.scheduler.certManager.enabled }} + annotations: + cert-manager.io/inject-ca-from: {{ include "hami-vgpu.namespace" . }}/{{ include "hami-vgpu.scheduler" . }}-serving-cert + {{- end }} name: {{ include "hami-vgpu.scheduler.webhook" . }} webhooks: - admissionReviewVersions: @@ -11,7 +15,7 @@ webhooks: {{- else }} service: name: {{ include "hami-vgpu.scheduler" . }} - namespace: {{ .Release.Namespace }} + namespace: {{ include "hami-vgpu.namespace" . }} path: /webhook port: {{ .Values.scheduler.service.httpPort }} {{- end }} diff --git a/helm/hami/values.yaml b/helm/hami/values.yaml index 8b73708..05a4f3e 100644 --- a/helm/hami/values.yaml +++ b/helm/hami/values.yaml @@ -2,8 +2,9 @@ nameOverride: "" fullnameOverride: "" +namespaceOverride: "" imagePullSecrets: [ ] -version: "v2.5.1" +version: "v2.6.0" #Nvidia GPU Parameters resourceName: "nvidia.com/gpu" @@ -27,6 +28,11 @@ iluvatarResourceName: "iluvatar.ai/vgpu" iluvatarResourceMem: "iluvatar.ai/vcuda-memory" iluvatarResourceCore: "iluvatar.ai/vcuda-core" +#Metax SGPU Parameters +metaxResourceName: "metax-tech.com/sgpu" +metaxResourceCore: "metax-tech.com/vcore" +metaxResourceMem: "metax-tech.com/vmemory" + schedulerName: "hami-scheduler" podSecurityPolicy: @@ -55,6 +61,8 @@ scheduler: metricsBindAddress: ":9395" livenessProbe: false leaderElect: true + # when leaderElect is true, replicas is available, otherwise replicas is 1. + replicas: 1 kubeScheduler: # @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default. enabled: true @@ -109,7 +117,14 @@ scheduler: # - istio-system reinvocationPolicy: Never failurePolicy: Ignore + ## TLS Certificate Option 1: Use cert-manager to generate self-signed certificate. + ## If enabled, always takes precedence over options 2. + certManager: + enabled: false + ## TLS Certificate Option 2: Use kube-webhook-certgen to generate self-signed certificate. + ## If true and certManager.enabled is false, Helm will automatically create a self-signed cert and secret for you. patch: + enabled: true image: docker.io/jettech/kube-webhook-certgen:v1.5.2 imageNew: liangjw/kube-webhook-certgen:v1.1.1 imagePullPolicy: IfNotPresent @@ -119,9 +134,11 @@ scheduler: tolerations: [] runAsUser: 2000 service: - httpPort: 443 - schedulerPort: 31998 - monitorPort: 31993 + type: NodePort # Default type is NodePort, can be changed to ClusterIP + httpPort: 443 # HTTP port + schedulerPort: 31998 # NodePort for HTTP + monitorPort: 31993 # Monitoring port + monitorTargetPort: 9395 labels: {} annotations: {} @@ -130,10 +147,13 @@ devicePlugin: monitorimage: "projecthami/hami" monitorctrPath: /usr/local/vgpu/containers imagePullPolicy: IfNotPresent - deviceSplitCount: 40 + deviceSplitCount: 10 deviceMemoryScaling: 1 deviceCoreScaling: 1 + # The runtime class name to be used by the device plugin, and added to the pod.spec.runtimeClassName of applications utilizing NVIDIA GPUs runtimeClassName: "" + # Whether to create runtime class, name comes from runtimeClassName when it is set + createRuntimeClass: false migStrategy: "none" disablecorelimit: "false" passDeviceSpecsEnabled: false @@ -141,7 +161,10 @@ devicePlugin: - -v=4 service: + type: NodePort # Default type is NodePort, can be changed to ClusterIP httpPort: 31992 + labels: {} + annotations: {} pluginPath: /var/lib/kubelet/device-plugins libPath: /usr/local/vgpu @@ -181,10 +204,18 @@ devicePlugin: # memory: 100Mi devices: + enflame: + enabled: false + customresources: + - enflame.com/vgcu + - enflame.com/vgcu-percentage mthreads: enabled: false customresources: - mthreads.com/vgpu + nvidia: + gpuCorePolicy: default + libCudaLogLevel: 1 ascend: enabled: false image: "" @@ -196,6 +227,8 @@ devices: customresources: - huawei.com/Ascend910A - huawei.com/Ascend910A-memory + - huawei.com/Ascend910B2 + - huawei.com/Ascend910B2-memory - huawei.com/Ascend910B - huawei.com/Ascend910B-memory - huawei.com/Ascend910B4