Merge pull request #14 from Nimbus318/feat/add-node-selector-support

Add support for configuring NodeSelectors for different node types
main
霓漠Nimbus 7 months ago committed by GitHub
commit f080a9daf3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -15,3 +15,7 @@ data:
prometheus: prometheus:
address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }} address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
timeout: 1m timeout: 1m
node_selectors:
{{- range $key, $value := .Values.vendorNodeSelectors }}
{{ $key }}: {{ $value }}
{{- end }}

@ -4,16 +4,22 @@
replicaCount: 1 replicaCount: 1
vendorNodeSelectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on
image: image:
frontend: frontend:
repository: projecthami/hami-webui-fe-oss repository: projecthami/hami-webui-fe-oss
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion. # Overrides the image tag whose default is the chart appVersion.
tag: "v1.0.4" tag: "main"
backend: backend:
repository: projecthami/hami-webui-be-oss repository: projecthami/hami-webui-be-oss
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
tag: "v1.0.4" tag: "main"
imagePullSecrets: [] imagePullSecrets: []
nameOverride: "" nameOverride: ""

@ -8,6 +8,7 @@ import (
"github.com/go-kratos/kratos/v2/transport/grpc" "github.com/go-kratos/kratos/v2/transport/grpc"
"github.com/go-kratos/kratos/v2/transport/http" "github.com/go-kratos/kratos/v2/transport/http"
"os" "os"
"vgpu/internal/conf"
_ "go.uber.org/automaxprocs" _ "go.uber.org/automaxprocs"
) )
@ -57,3 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
), ),
) )
} }
// getNodeSelectors returns the NodeSelectors map stored on the bootstrap
// configuration, unchanged. The map is returned as-is (not copied), so the
// caller shares it with the configuration object.
//
// c must be non-nil; the field is read directly from the pointer.
func getNodeSelectors(c *conf.Bootstrap) map[string]string {
	selectors := c.NodeSelectors
	return selectors
}

@ -26,5 +26,6 @@ func initApp(configPath string, ctx context.Context) (*kratos.App, func(), error
service.ProviderSet, service.ProviderSet,
exporter.ProviderSet, exporter.ProviderSet,
newApp, newApp,
getNodeSelectors,
)) ))
} }

@ -8,3 +8,8 @@ server:
prometheus: prometheus:
address: http://localhost:9090 address: http://localhost:9090
timeout: 1m timeout: 1m
node_selectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on

@ -8,6 +8,7 @@ import "google/protobuf/duration.proto";
message Bootstrap { message Bootstrap {
Server server = 1; Server server = 1;
Prometheus prometheus = 2; Prometheus prometheus = 2;
map<string, string> node_selectors = 3;
} }
message Server { message Server {

@ -32,17 +32,17 @@ type nodeRepo struct {
} }
// NewNodeRepo . // NewNodeRepo .
func NewNodeRepo(data *Data, logger log.Logger) biz.NodeRepo { func NewNodeRepo(data *Data, nodeSelectors map[string]string, logger log.Logger) biz.NodeRepo {
nodeRepo := &nodeRepo{ nodeRepo := &nodeRepo{
data: data, data: data,
nodeNotify: make(chan struct{}, 1), nodeNotify: make(chan struct{}, 1),
nodes: map[k8stypes.UID]*biz.Node{}, nodes: map[k8stypes.UID]*biz.Node{},
log: log.NewHelper(logger), log: log.NewHelper(logger),
providers: []provider.Provider{ providers: []provider.Provider{
nvidia.NewNvidia(data.promCl, log.NewHelper(logger)), nvidia.NewNvidia(data.promCl, log.NewHelper(logger), nodeSelectors[biz.NvidiaGPUDevice]),
mlu.NewCambricon(data.promCl, log.NewHelper(logger)), mlu.NewCambricon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.CambriconGPUDevice]),
ascend.NewAscend(data.promCl, log.NewHelper(logger)), ascend.NewAscend(data.promCl, log.NewHelper(logger), nodeSelectors[biz.AscendGPUDevice]),
hygon.NewHygon(data.promCl, log.NewHelper(logger)), hygon.NewHygon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.HygonGPUDevice]),
}, },
} }
nodeRepo.init() nodeRepo.init()

@ -16,20 +16,23 @@ import (
type Ascend struct { type Ascend struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
nodeSelectors string
} }
func NewAscend(prom *prom.Client, log *log.Helper) *Ascend { func NewAscend(prom *prom.Client, log *log.Helper, nodeSelectors string) *Ascend {
return &Ascend{ return &Ascend{
prom: prom, prom: prom,
log: log, log: log,
nodeSelectors: nodeSelectors,
} }
} }
func (c *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) { func (a *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("servertype=Ascend910B-20") return labels.Parse(a.nodeSelectors)
} }
func (c *Ascend) GetProvider() string { func (a *Ascend) GetProvider() string {
return AscendDevice return AscendDevice
} }
@ -39,16 +42,16 @@ type DeviceMeta struct {
Driver string Driver string
} }
func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
device := make(map[string]*util.DeviceInfo) device := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name) queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString) vs, err := a.prom.Query(context.Background(), queryString)
if err != nil { if err != nil {
c.log.Warnf("query %s failed", queryString) a.log.Warnf("query %s failed", queryString)
} else { } else {
ds, ok := vs.(model.Vector) ds, ok := vs.(model.Vector)
if !ok { if !ok {
c.log.Warnf("vectorValue: %v, failed", vs) a.log.Warnf("vectorValue: %v, failed", vs)
} else { } else {
for _, d := range ds { for _, d := range ds {
id := d.Metric["id"] id := d.Metric["id"]
@ -68,12 +71,12 @@ func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
return device return device
} }
func (c *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
nodedevices := []*util.DeviceInfo{} nodedevices := []*util.DeviceInfo{}
i := 0 i := 0
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64() cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
tmpDevice := c.GetDevicesFromPrometheus(node) tmpDevice := a.GetDevicesFromPrometheus(node)
for int64(i)*10 < cards { for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i) index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok { if _, ok := tmpDevice[index]; !ok {

@ -17,20 +17,23 @@ import (
type Hygon struct { type Hygon struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
nodeSelectors string
} }
func NewHygon(prom *prom.Client, log *log.Helper) *Hygon { func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
return &Hygon{ return &Hygon{
prom: prom, prom: prom,
log: log, log: log,
nodeSelectors: nodeSelectors,
} }
} }
func (c *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) { func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("dcu=on") return labels.Parse(h.nodeSelectors)
} }
func (c *Hygon) GetProvider() string { func (h *Hygon) GetProvider() string {
return HygonDCUDevice return HygonDCUDevice
} }
@ -40,19 +43,19 @@ type DeviceMeta struct {
Driver string Driver string
} }
func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
deviceMap := make(map[string]*util.DeviceInfo) deviceMap := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name) queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString) vs, err := h.prom.Query(context.Background(), queryString)
if err != nil { if err != nil {
c.log.Warnf("Failed to query %s: %v", queryString, err) h.log.Warnf("Failed to query %s: %v", queryString, err)
return deviceMap return deviceMap
} }
vector, ok := vs.(model.Vector) vector, ok := vs.(model.Vector)
if !ok { if !ok {
c.log.Warnf("Unexpected result type: %v", vs) h.log.Warnf("Unexpected result type: %v", vs)
return deviceMap return deviceMap
} }
@ -68,34 +71,34 @@ func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.Dev
return deviceMap return deviceMap
} }
func (c *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
devEncoded, ok := node.Annotations[RegisterAnnos] devEncoded, ok := node.Annotations[RegisterAnnos]
if !ok { if !ok {
return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos) return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
} }
nodedevices, err := util.DecodeNodeDevices(devEncoded, c.log) nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
if err != nil { if err != nil {
c.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded) h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, err return []*util.DeviceInfo{}, err
} }
if len(nodedevices) == 0 { if len(nodedevices) == 0 {
c.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded) h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, errors.New("no gpu found on node") return []*util.DeviceInfo{}, errors.New("no gpu found on node")
} }
devDecoded := util.EncodeNodeDevices(nodedevices, c.log) devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
c.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded) h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := c.GetDevicesFromPrometheus(node) devDetail := h.GetDevicesFromPrometheus(node)
for _, nodedevice := range nodedevices { for _, nodedevice := range nodedevices {
idParts := strings.Split(nodedevice.ID, "-") idParts := strings.Split(nodedevice.ID, "-")
if len(idParts) < 2 { if len(idParts) < 2 {
c.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID) h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
continue continue
} }
devDetailID := idParts[1] devDetailID := idParts[1]
devInfo, exists := devDetail[devDetailID] devInfo, exists := devDetail[devDetailID]
if !exists { if !exists {
c.log.Warnf("Device ID %s not found in devDetail", devDetailID) h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
continue continue
} }

@ -15,17 +15,20 @@ import (
type Cambricon struct { type Cambricon struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
labelsSelector string
} }
func NewCambricon(prom *prom.Client, log *log.Helper) *Cambricon { func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon {
return &Cambricon{ return &Cambricon{
prom: prom, prom: prom,
log: log, log: log,
labelsSelector: labelSelector,
} }
} }
func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) { func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("mlu=on") return labels.Parse(c.labelsSelector)
} }
func (c *Cambricon) GetProvider() string { func (c *Cambricon) GetProvider() string {

@ -12,17 +12,20 @@ import (
type Nvidia struct { type Nvidia struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
labelSelector string
} }
func NewNvidia(prom *prom.Client, log *log.Helper) *Nvidia { func NewNvidia(prom *prom.Client, log *log.Helper, labelSelector string) *Nvidia {
return &Nvidia{ return &Nvidia{
prom: prom, prom: prom,
log: log, log: log,
labelSelector: labelSelector,
} }
} }
func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) { func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("gpu=on") return labels.Parse(n.labelSelector)
} }
func (n *Nvidia) GetProvider() string { func (n *Nvidia) GetProvider() string {

Loading…
Cancel
Save