From c1e38c06e17318729d29c36e54ed5464a6c484ae Mon Sep 17 00:00:00 2001 From: Nimbus318 <136771156+Nimbus318@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:32:02 +0800 Subject: [PATCH] feat: support vendorNodeSelectors in Proto, Helm Signed-off-by: Nimbus318 <136771156+Nimbus318@users.noreply.github.com> --- charts/hami-webui/templates/configmap.yaml | 6 ++- charts/hami-webui/values.yaml | 10 ++++- server/cmd/server/main.go | 5 +++ server/cmd/server/wire.go | 1 + server/config/config.yaml | 5 +++ server/internal/conf/conf.proto | 1 + server/internal/data/node.go | 10 ++--- server/internal/provider/ascend/provider.go | 27 ++++++++------ server/internal/provider/hygon/provider.go | 41 +++++++++++---------- server/internal/provider/mlu/provider.go | 11 ++++-- server/internal/provider/nvidia/provider.go | 11 ++++-- 11 files changed, 81 insertions(+), 47 deletions(-) diff --git a/charts/hami-webui/templates/configmap.yaml b/charts/hami-webui/templates/configmap.yaml index 2e395b8..31dceb9 100644 --- a/charts/hami-webui/templates/configmap.yaml +++ b/charts/hami-webui/templates/configmap.yaml @@ -14,4 +14,8 @@ data: timeout: 1s prometheus: address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) 
(include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }} - timeout: 1m \ No newline at end of file + timeout: 1m + node_selectors: + {{- range $key, $value := .Values.vendorNodeSelectors }} + {{ $key }}: {{ $value }} + {{- end }} \ No newline at end of file diff --git a/charts/hami-webui/values.yaml b/charts/hami-webui/values.yaml index f741121..4b33abd 100644 --- a/charts/hami-webui/values.yaml +++ b/charts/hami-webui/values.yaml @@ -4,16 +4,22 @@ replicaCount: 1 +vendorNodeSelectors: + NVIDIA: gpu=on + Ascend: ascend=on + DCU: dcu=on + MLU: mlu=on + image: frontend: repository: projecthami/hami-webui-fe-oss pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "v1.0.4" + tag: "main" backend: repository: projecthami/hami-webui-be-oss pullPolicy: IfNotPresent - tag: "v1.0.4" + tag: "main" imagePullSecrets: [] nameOverride: "" diff --git a/server/cmd/server/main.go b/server/cmd/server/main.go index fc1ca8d..59feda4 100644 --- a/server/cmd/server/main.go +++ b/server/cmd/server/main.go @@ -8,6 +8,7 @@ import ( "github.com/go-kratos/kratos/v2/transport/grpc" "github.com/go-kratos/kratos/v2/transport/http" "os" + "vgpu/internal/conf" _ "go.uber.org/automaxprocs" ) @@ -57,3 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se ), ) } + +func getNodeSelectors(c *conf.Bootstrap) map[string]string { + return c.NodeSelectors +} diff --git a/server/cmd/server/wire.go b/server/cmd/server/wire.go index 32bb380..1a7b50a 100644 --- a/server/cmd/server/wire.go +++ b/server/cmd/server/wire.go @@ -26,5 +26,6 @@ func initApp(configPath string, ctx context.Context) (*kratos.App, func(), error service.ProviderSet, exporter.ProviderSet, newApp, + getNodeSelectors, )) } diff --git a/server/config/config.yaml b/server/config/config.yaml index 69fadbb..f3905c8 100644 --- a/server/config/config.yaml +++ b/server/config/config.yaml @@ -8,3 +8,8 @@ server: prometheus: address: 
http://localhost:9090 timeout: 1m +node_selectors: + NVIDIA: gpu=on + Ascend: ascend=on + DCU: dcu=on + MLU: mlu=on \ No newline at end of file diff --git a/server/internal/conf/conf.proto b/server/internal/conf/conf.proto index 15c13bc..7eb44e6 100644 --- a/server/internal/conf/conf.proto +++ b/server/internal/conf/conf.proto @@ -8,6 +8,7 @@ import "google/protobuf/duration.proto"; message Bootstrap { Server server = 1; Prometheus prometheus = 2; + map<string, string> node_selectors = 3; } message Server { diff --git a/server/internal/data/node.go b/server/internal/data/node.go index ef302fe..2757899 100644 --- a/server/internal/data/node.go +++ b/server/internal/data/node.go @@ -32,17 +32,17 @@ type nodeRepo struct { } // NewNodeRepo . -func NewNodeRepo(data *Data, logger log.Logger) biz.NodeRepo { +func NewNodeRepo(data *Data, nodeSelectors map[string]string, logger log.Logger) biz.NodeRepo { nodeRepo := &nodeRepo{ data: data, nodeNotify: make(chan struct{}, 1), nodes: map[k8stypes.UID]*biz.Node{}, log: log.NewHelper(logger), providers: []provider.Provider{ - nvidia.NewNvidia(data.promCl, log.NewHelper(logger)), - mlu.NewCambricon(data.promCl, log.NewHelper(logger)), - ascend.NewAscend(data.promCl, log.NewHelper(logger)), - hygon.NewHygon(data.promCl, log.NewHelper(logger)), + nvidia.NewNvidia(data.promCl, log.NewHelper(logger), nodeSelectors[biz.NvidiaGPUDevice]), + mlu.NewCambricon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.CambriconGPUDevice]), + ascend.NewAscend(data.promCl, log.NewHelper(logger), nodeSelectors[biz.AscendGPUDevice]), + hygon.NewHygon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.HygonGPUDevice]), }, } nodeRepo.init() diff --git a/server/internal/provider/ascend/provider.go b/server/internal/provider/ascend/provider.go index d905748..2658d5e 100644 --- a/server/internal/provider/ascend/provider.go +++ b/server/internal/provider/ascend/provider.go @@ -16,20 +16,23 @@ import ( type Ascend struct { prom *prom.Client log *log.Helper + + 
nodeSelectors string } -func NewAscend(prom *prom.Client, log *log.Helper) *Ascend { +func NewAscend(prom *prom.Client, log *log.Helper, nodeSelectors string) *Ascend { return &Ascend{ - prom: prom, - log: log, + prom: prom, + log: log, + nodeSelectors: nodeSelectors, } } -func (c *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) { - return labels.Parse("servertype=Ascend910B-20") +func (a *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) { + return labels.Parse(a.nodeSelectors) } -func (c *Ascend) GetProvider() string { +func (a *Ascend) GetProvider() string { return AscendDevice } @@ -39,16 +42,16 @@ type DeviceMeta struct { Driver string } -func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { +func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { device := make(map[string]*util.DeviceInfo) queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name) - vs, err := c.prom.Query(context.Background(), queryString) + vs, err := a.prom.Query(context.Background(), queryString) if err != nil { - c.log.Warnf("query %s failed", queryString) + a.log.Warnf("query %s failed", queryString) } else { ds, ok := vs.(model.Vector) if !ok { - c.log.Warnf("vectorValue: %v, failed", vs) + a.log.Warnf("vectorValue: %v, failed", vs) } else { for _, d := range ds { id := d.Metric["id"] @@ -68,12 +71,12 @@ func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De return device } -func (c *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { +func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { nodedevices := []*util.DeviceInfo{} i := 0 cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64() - tmpDevice := c.GetDevicesFromPrometheus(node) + tmpDevice := a.GetDevicesFromPrometheus(node) for int64(i)*10 < cards { index := fmt.Sprintf("%d", i) if _, ok 
:= tmpDevice[index]; !ok { diff --git a/server/internal/provider/hygon/provider.go b/server/internal/provider/hygon/provider.go index 179754b..ca2ed79 100644 --- a/server/internal/provider/hygon/provider.go +++ b/server/internal/provider/hygon/provider.go @@ -17,20 +17,23 @@ import ( type Hygon struct { prom *prom.Client log *log.Helper + + nodeSelectors string } -func NewHygon(prom *prom.Client, log *log.Helper) *Hygon { +func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon { return &Hygon{ - prom: prom, - log: log, + prom: prom, + log: log, + nodeSelectors: nodeSelectors, } } -func (c *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) { - return labels.Parse("dcu=on") +func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) { + return labels.Parse(h.nodeSelectors) } -func (c *Hygon) GetProvider() string { +func (h *Hygon) GetProvider() string { return HygonDCUDevice } @@ -40,19 +43,19 @@ type DeviceMeta struct { Driver string } -func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { +func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { deviceMap := make(map[string]*util.DeviceInfo) queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name) - vs, err := c.prom.Query(context.Background(), queryString) + vs, err := h.prom.Query(context.Background(), queryString) if err != nil { - c.log.Warnf("Failed to query %s: %v", queryString, err) + h.log.Warnf("Failed to query %s: %v", queryString, err) return deviceMap } vector, ok := vs.(model.Vector) if !ok { - c.log.Warnf("Unexpected result type: %v", vs) + h.log.Warnf("Unexpected result type: %v", vs) return deviceMap } @@ -68,34 +71,34 @@ func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.Dev return deviceMap } -func (c *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { +func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { 
devEncoded, ok := node.Annotations[RegisterAnnos] if !ok { return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos) } - nodedevices, err := util.DecodeNodeDevices(devEncoded, c.log) + nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log) if err != nil { - c.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded) + h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded) return []*util.DeviceInfo{}, err } if len(nodedevices) == 0 { - c.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded) + h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded) return []*util.DeviceInfo{}, errors.New("no gpu found on node") } - devDecoded := util.EncodeNodeDevices(nodedevices, c.log) - c.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded) - devDetail := c.GetDevicesFromPrometheus(node) + devDecoded := util.EncodeNodeDevices(nodedevices, h.log) + h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded) + devDetail := h.GetDevicesFromPrometheus(node) for _, nodedevice := range nodedevices { idParts := strings.Split(nodedevice.ID, "-") if len(idParts) < 2 { - c.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID) + h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID) continue } devDetailID := idParts[1] devInfo, exists := devDetail[devDetailID] if !exists { - c.log.Warnf("Device ID %s not found in devDetail", devDetailID) + h.log.Warnf("Device ID %s not found in devDetail", devDetailID) continue } diff --git a/server/internal/provider/mlu/provider.go b/server/internal/provider/mlu/provider.go index 00ed9b3..c81e4a8 100644 --- a/server/internal/provider/mlu/provider.go +++ b/server/internal/provider/mlu/provider.go @@ -15,17 +15,20 @@ import ( type Cambricon struct { prom *prom.Client log 
*log.Helper + + labelsSelector string } -func NewCambricon(prom *prom.Client, log *log.Helper) *Cambricon { +func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon { return &Cambricon{ - prom: prom, - log: log, + prom: prom, + log: log, + labelsSelector: labelSelector, } } func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) { - return labels.Parse("mlu=on") + return labels.Parse(c.labelsSelector) } func (c *Cambricon) GetProvider() string { diff --git a/server/internal/provider/nvidia/provider.go b/server/internal/provider/nvidia/provider.go index a8cb4d6..6cecad1 100644 --- a/server/internal/provider/nvidia/provider.go +++ b/server/internal/provider/nvidia/provider.go @@ -12,17 +12,20 @@ import ( type Nvidia struct { prom *prom.Client log *log.Helper + + labelSelector string } -func NewNvidia(prom *prom.Client, log *log.Helper) *Nvidia { +func NewNvidia(prom *prom.Client, log *log.Helper, labelSelector string) *Nvidia { return &Nvidia{ - prom: prom, - log: log, + prom: prom, + log: log, + labelSelector: labelSelector, } } func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) { - return labels.Parse("gpu=on") + return labels.Parse(n.labelSelector) } func (n *Nvidia) GetProvider() string {