Merge pull request #14 from Nimbus318/feat/add-node-selector-support

Add support for configuring NodeSelectors for different node types
main
霓漠Nimbus 10 months ago committed by GitHub
commit f080a9daf3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -14,4 +14,8 @@ data:
timeout: 1s
prometheus:
address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
timeout: 1m
timeout: 1m
node_selectors:
{{- range $key, $value := .Values.vendorNodeSelectors }}
{{ $key }}: {{ $value }}
{{- end }}

@ -4,16 +4,22 @@
replicaCount: 1
vendorNodeSelectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on
image:
frontend:
repository: projecthami/hami-webui-fe-oss
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: "v1.0.4"
tag: "main"
backend:
repository: projecthami/hami-webui-be-oss
pullPolicy: IfNotPresent
tag: "v1.0.4"
tag: "main"
imagePullSecrets: []
nameOverride: ""

@ -8,6 +8,7 @@ import (
"github.com/go-kratos/kratos/v2/transport/grpc"
"github.com/go-kratos/kratos/v2/transport/http"
"os"
"vgpu/internal/conf"
_ "go.uber.org/automaxprocs"
)
@ -57,3 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
),
)
}
func getNodeSelectors(c *conf.Bootstrap) map[string]string {
return c.NodeSelectors
}

@ -26,5 +26,6 @@ func initApp(configPath string, ctx context.Context) (*kratos.App, func(), error
service.ProviderSet,
exporter.ProviderSet,
newApp,
getNodeSelectors,
))
}

@ -8,3 +8,8 @@ server:
prometheus:
address: http://localhost:9090
timeout: 1m
node_selectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on

@ -8,6 +8,7 @@ import "google/protobuf/duration.proto";
message Bootstrap {
Server server = 1;
Prometheus prometheus = 2;
map<string, string> node_selectors = 3;
}
message Server {

@ -32,17 +32,17 @@ type nodeRepo struct {
}
// NewNodeRepo .
func NewNodeRepo(data *Data, logger log.Logger) biz.NodeRepo {
func NewNodeRepo(data *Data, nodeSelectors map[string]string, logger log.Logger) biz.NodeRepo {
nodeRepo := &nodeRepo{
data: data,
nodeNotify: make(chan struct{}, 1),
nodes: map[k8stypes.UID]*biz.Node{},
log: log.NewHelper(logger),
providers: []provider.Provider{
nvidia.NewNvidia(data.promCl, log.NewHelper(logger)),
mlu.NewCambricon(data.promCl, log.NewHelper(logger)),
ascend.NewAscend(data.promCl, log.NewHelper(logger)),
hygon.NewHygon(data.promCl, log.NewHelper(logger)),
nvidia.NewNvidia(data.promCl, log.NewHelper(logger), nodeSelectors[biz.NvidiaGPUDevice]),
mlu.NewCambricon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.CambriconGPUDevice]),
ascend.NewAscend(data.promCl, log.NewHelper(logger), nodeSelectors[biz.AscendGPUDevice]),
hygon.NewHygon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.HygonGPUDevice]),
},
}
nodeRepo.init()

@ -16,20 +16,23 @@ import (
type Ascend struct {
prom *prom.Client
log *log.Helper
nodeSelectors string
}
func NewAscend(prom *prom.Client, log *log.Helper) *Ascend {
func NewAscend(prom *prom.Client, log *log.Helper, nodeSelectors string) *Ascend {
return &Ascend{
prom: prom,
log: log,
prom: prom,
log: log,
nodeSelectors: nodeSelectors,
}
}
func (c *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("servertype=Ascend910B-20")
func (a *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse(a.nodeSelectors)
}
func (c *Ascend) GetProvider() string {
func (a *Ascend) GetProvider() string {
return AscendDevice
}
@ -39,16 +42,16 @@ type DeviceMeta struct {
Driver string
}
func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
device := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString)
vs, err := a.prom.Query(context.Background(), queryString)
if err != nil {
c.log.Warnf("query %s failed", queryString)
a.log.Warnf("query %s failed", queryString)
} else {
ds, ok := vs.(model.Vector)
if !ok {
c.log.Warnf("vectorValue: %v, failed", vs)
a.log.Warnf("vectorValue: %v, failed", vs)
} else {
for _, d := range ds {
id := d.Metric["id"]
@ -68,12 +71,12 @@ func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
return device
}
func (c *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
nodedevices := []*util.DeviceInfo{}
i := 0
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
tmpDevice := c.GetDevicesFromPrometheus(node)
tmpDevice := a.GetDevicesFromPrometheus(node)
for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok {

@ -17,20 +17,23 @@ import (
type Hygon struct {
prom *prom.Client
log *log.Helper
nodeSelectors string
}
func NewHygon(prom *prom.Client, log *log.Helper) *Hygon {
func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
return &Hygon{
prom: prom,
log: log,
prom: prom,
log: log,
nodeSelectors: nodeSelectors,
}
}
func (c *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("dcu=on")
func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse(h.nodeSelectors)
}
func (c *Hygon) GetProvider() string {
func (h *Hygon) GetProvider() string {
return HygonDCUDevice
}
@ -40,19 +43,19 @@ type DeviceMeta struct {
Driver string
}
func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
deviceMap := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString)
vs, err := h.prom.Query(context.Background(), queryString)
if err != nil {
c.log.Warnf("Failed to query %s: %v", queryString, err)
h.log.Warnf("Failed to query %s: %v", queryString, err)
return deviceMap
}
vector, ok := vs.(model.Vector)
if !ok {
c.log.Warnf("Unexpected result type: %v", vs)
h.log.Warnf("Unexpected result type: %v", vs)
return deviceMap
}
@ -68,34 +71,34 @@ func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.Dev
return deviceMap
}
func (c *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
devEncoded, ok := node.Annotations[RegisterAnnos]
if !ok {
return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
}
nodedevices, err := util.DecodeNodeDevices(devEncoded, c.log)
nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
if err != nil {
c.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, err
}
if len(nodedevices) == 0 {
c.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, errors.New("no gpu found on node")
}
devDecoded := util.EncodeNodeDevices(nodedevices, c.log)
c.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := c.GetDevicesFromPrometheus(node)
devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := h.GetDevicesFromPrometheus(node)
for _, nodedevice := range nodedevices {
idParts := strings.Split(nodedevice.ID, "-")
if len(idParts) < 2 {
c.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
continue
}
devDetailID := idParts[1]
devInfo, exists := devDetail[devDetailID]
if !exists {
c.log.Warnf("Device ID %s not found in devDetail", devDetailID)
h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
continue
}

@ -15,17 +15,20 @@ import (
type Cambricon struct {
prom *prom.Client
log *log.Helper
labelsSelector string
}
func NewCambricon(prom *prom.Client, log *log.Helper) *Cambricon {
func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon {
return &Cambricon{
prom: prom,
log: log,
prom: prom,
log: log,
labelsSelector: labelSelector,
}
}
func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("mlu=on")
return labels.Parse(c.labelsSelector)
}
func (c *Cambricon) GetProvider() string {

@ -12,17 +12,20 @@ import (
type Nvidia struct {
prom *prom.Client
log *log.Helper
labelSelector string
}
func NewNvidia(prom *prom.Client, log *log.Helper) *Nvidia {
func NewNvidia(prom *prom.Client, log *log.Helper, labelSelector string) *Nvidia {
return &Nvidia{
prom: prom,
log: log,
prom: prom,
log: log,
labelSelector: labelSelector,
}
}
func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("gpu=on")
return labels.Parse(n.labelSelector)
}
func (n *Nvidia) GetProvider() string {

Loading…
Cancel
Save