Merge pull request #14 from Nimbus318/feat/add-node-selector-support

Add support for configuring NodeSelectors for different node types
main
霓漠Nimbus 10 months ago committed by GitHub
commit f080a9daf3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -14,4 +14,8 @@ data:
timeout: 1s timeout: 1s
prometheus: prometheus:
address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }} address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
timeout: 1m timeout: 1m
node_selectors:
{{- range $key, $value := .Values.vendorNodeSelectors }}
{{ $key }}: {{ $value }}
{{- end }}

@ -4,16 +4,22 @@
replicaCount: 1 replicaCount: 1
vendorNodeSelectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on
image: image:
frontend: frontend:
repository: projecthami/hami-webui-fe-oss repository: projecthami/hami-webui-fe-oss
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion. # Overrides the image tag whose default is the chart appVersion.
tag: "v1.0.4" tag: "main"
backend: backend:
repository: projecthami/hami-webui-be-oss repository: projecthami/hami-webui-be-oss
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
tag: "v1.0.4" tag: "main"
imagePullSecrets: [] imagePullSecrets: []
nameOverride: "" nameOverride: ""

@ -8,6 +8,7 @@ import (
"github.com/go-kratos/kratos/v2/transport/grpc" "github.com/go-kratos/kratos/v2/transport/grpc"
"github.com/go-kratos/kratos/v2/transport/http" "github.com/go-kratos/kratos/v2/transport/http"
"os" "os"
"vgpu/internal/conf"
_ "go.uber.org/automaxprocs" _ "go.uber.org/automaxprocs"
) )
@ -57,3 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
), ),
) )
} }
// getNodeSelectors returns the NodeSelectors map held by the bootstrap
// configuration, handing it out as a plain map[string]string value.
//
// NOTE(review): presumably used as a dependency-injection provider so that
// constructors can take the selector map directly — confirm against the
// injector setup.
func getNodeSelectors(c *conf.Bootstrap) map[string]string {
	selectors := c.NodeSelectors
	return selectors
}

@ -26,5 +26,6 @@ func initApp(configPath string, ctx context.Context) (*kratos.App, func(), error
service.ProviderSet, service.ProviderSet,
exporter.ProviderSet, exporter.ProviderSet,
newApp, newApp,
getNodeSelectors,
)) ))
} }

@ -8,3 +8,8 @@ server:
prometheus: prometheus:
address: http://localhost:9090 address: http://localhost:9090
timeout: 1m timeout: 1m
node_selectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on

@ -8,6 +8,7 @@ import "google/protobuf/duration.proto";
message Bootstrap { message Bootstrap {
Server server = 1; Server server = 1;
Prometheus prometheus = 2; Prometheus prometheus = 2;
map<string, string> node_selectors = 3;
} }
message Server { message Server {

@ -32,17 +32,17 @@ type nodeRepo struct {
} }
// NewNodeRepo . // NewNodeRepo .
func NewNodeRepo(data *Data, logger log.Logger) biz.NodeRepo { func NewNodeRepo(data *Data, nodeSelectors map[string]string, logger log.Logger) biz.NodeRepo {
nodeRepo := &nodeRepo{ nodeRepo := &nodeRepo{
data: data, data: data,
nodeNotify: make(chan struct{}, 1), nodeNotify: make(chan struct{}, 1),
nodes: map[k8stypes.UID]*biz.Node{}, nodes: map[k8stypes.UID]*biz.Node{},
log: log.NewHelper(logger), log: log.NewHelper(logger),
providers: []provider.Provider{ providers: []provider.Provider{
nvidia.NewNvidia(data.promCl, log.NewHelper(logger)), nvidia.NewNvidia(data.promCl, log.NewHelper(logger), nodeSelectors[biz.NvidiaGPUDevice]),
mlu.NewCambricon(data.promCl, log.NewHelper(logger)), mlu.NewCambricon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.CambriconGPUDevice]),
ascend.NewAscend(data.promCl, log.NewHelper(logger)), ascend.NewAscend(data.promCl, log.NewHelper(logger), nodeSelectors[biz.AscendGPUDevice]),
hygon.NewHygon(data.promCl, log.NewHelper(logger)), hygon.NewHygon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.HygonGPUDevice]),
}, },
} }
nodeRepo.init() nodeRepo.init()

@ -16,20 +16,23 @@ import (
type Ascend struct { type Ascend struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
nodeSelectors string
} }
func NewAscend(prom *prom.Client, log *log.Helper) *Ascend { func NewAscend(prom *prom.Client, log *log.Helper, nodeSelectors string) *Ascend {
return &Ascend{ return &Ascend{
prom: prom, prom: prom,
log: log, log: log,
nodeSelectors: nodeSelectors,
} }
} }
func (c *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) { func (a *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("servertype=Ascend910B-20") return labels.Parse(a.nodeSelectors)
} }
func (c *Ascend) GetProvider() string { func (a *Ascend) GetProvider() string {
return AscendDevice return AscendDevice
} }
@ -39,16 +42,16 @@ type DeviceMeta struct {
Driver string Driver string
} }
func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
device := make(map[string]*util.DeviceInfo) device := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name) queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString) vs, err := a.prom.Query(context.Background(), queryString)
if err != nil { if err != nil {
c.log.Warnf("query %s failed", queryString) a.log.Warnf("query %s failed", queryString)
} else { } else {
ds, ok := vs.(model.Vector) ds, ok := vs.(model.Vector)
if !ok { if !ok {
c.log.Warnf("vectorValue: %v, failed", vs) a.log.Warnf("vectorValue: %v, failed", vs)
} else { } else {
for _, d := range ds { for _, d := range ds {
id := d.Metric["id"] id := d.Metric["id"]
@ -68,12 +71,12 @@ func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
return device return device
} }
func (c *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
nodedevices := []*util.DeviceInfo{} nodedevices := []*util.DeviceInfo{}
i := 0 i := 0
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64() cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
tmpDevice := c.GetDevicesFromPrometheus(node) tmpDevice := a.GetDevicesFromPrometheus(node)
for int64(i)*10 < cards { for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i) index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok { if _, ok := tmpDevice[index]; !ok {

@ -17,20 +17,23 @@ import (
type Hygon struct { type Hygon struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
nodeSelectors string
} }
func NewHygon(prom *prom.Client, log *log.Helper) *Hygon { func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
return &Hygon{ return &Hygon{
prom: prom, prom: prom,
log: log, log: log,
nodeSelectors: nodeSelectors,
} }
} }
func (c *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) { func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("dcu=on") return labels.Parse(h.nodeSelectors)
} }
func (c *Hygon) GetProvider() string { func (h *Hygon) GetProvider() string {
return HygonDCUDevice return HygonDCUDevice
} }
@ -40,19 +43,19 @@ type DeviceMeta struct {
Driver string Driver string
} }
func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo { func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
deviceMap := make(map[string]*util.DeviceInfo) deviceMap := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name) queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString) vs, err := h.prom.Query(context.Background(), queryString)
if err != nil { if err != nil {
c.log.Warnf("Failed to query %s: %v", queryString, err) h.log.Warnf("Failed to query %s: %v", queryString, err)
return deviceMap return deviceMap
} }
vector, ok := vs.(model.Vector) vector, ok := vs.(model.Vector)
if !ok { if !ok {
c.log.Warnf("Unexpected result type: %v", vs) h.log.Warnf("Unexpected result type: %v", vs)
return deviceMap return deviceMap
} }
@ -68,34 +71,34 @@ func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.Dev
return deviceMap return deviceMap
} }
func (c *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
devEncoded, ok := node.Annotations[RegisterAnnos] devEncoded, ok := node.Annotations[RegisterAnnos]
if !ok { if !ok {
return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos) return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
} }
nodedevices, err := util.DecodeNodeDevices(devEncoded, c.log) nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
if err != nil { if err != nil {
c.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded) h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, err return []*util.DeviceInfo{}, err
} }
if len(nodedevices) == 0 { if len(nodedevices) == 0 {
c.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded) h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, errors.New("no gpu found on node") return []*util.DeviceInfo{}, errors.New("no gpu found on node")
} }
devDecoded := util.EncodeNodeDevices(nodedevices, c.log) devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
c.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded) h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := c.GetDevicesFromPrometheus(node) devDetail := h.GetDevicesFromPrometheus(node)
for _, nodedevice := range nodedevices { for _, nodedevice := range nodedevices {
idParts := strings.Split(nodedevice.ID, "-") idParts := strings.Split(nodedevice.ID, "-")
if len(idParts) < 2 { if len(idParts) < 2 {
c.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID) h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
continue continue
} }
devDetailID := idParts[1] devDetailID := idParts[1]
devInfo, exists := devDetail[devDetailID] devInfo, exists := devDetail[devDetailID]
if !exists { if !exists {
c.log.Warnf("Device ID %s not found in devDetail", devDetailID) h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
continue continue
} }

@ -15,17 +15,20 @@ import (
type Cambricon struct { type Cambricon struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
labelsSelector string
} }
func NewCambricon(prom *prom.Client, log *log.Helper) *Cambricon { func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon {
return &Cambricon{ return &Cambricon{
prom: prom, prom: prom,
log: log, log: log,
labelsSelector: labelSelector,
} }
} }
func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) { func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("mlu=on") return labels.Parse(c.labelsSelector)
} }
func (c *Cambricon) GetProvider() string { func (c *Cambricon) GetProvider() string {

@ -12,17 +12,20 @@ import (
type Nvidia struct { type Nvidia struct {
prom *prom.Client prom *prom.Client
log *log.Helper log *log.Helper
labelSelector string
} }
func NewNvidia(prom *prom.Client, log *log.Helper) *Nvidia { func NewNvidia(prom *prom.Client, log *log.Helper, labelSelector string) *Nvidia {
return &Nvidia{ return &Nvidia{
prom: prom, prom: prom,
log: log, log: log,
labelSelector: labelSelector,
} }
} }
func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) { func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("gpu=on") return labels.Parse(n.labelSelector)
} }
func (n *Nvidia) GetProvider() string { func (n *Nvidia) GetProvider() string {

Loading…
Cancel
Save