feat: support vendorNodeSelectors in Proto, Helm

Signed-off-by: Nimbus318 <136771156+Nimbus318@users.noreply.github.com>
main
Nimbus318 7 months ago
parent f7a7837f1c
commit c1e38c06e1

@ -14,4 +14,8 @@ data:
timeout: 1s
prometheus:
address: {{ ternary .Values.externalPrometheus.address (printf "http://%s-kube-prometh-prometheus.%s.svc.cluster.local:9090" (include "hami-webui.fullname" .) (include "hami-webui.namespace" .)) .Values.externalPrometheus.enabled }}
timeout: 1m
timeout: 1m
node_selectors:
{{- /* Quote every selector so values like "!gpu" or an empty string remain YAML strings. */ -}}
{{- range $key, $value := .Values.vendorNodeSelectors }}
{{ $key }}: {{ $value | quote }}
{{- end }}

@ -4,16 +4,22 @@
replicaCount: 1
# Node label selector per device vendor: the key is the vendor name and the
# value is the selector string. The chart template renders every entry under
# `node_selectors`.
vendorNodeSelectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on
image:
frontend:
repository: projecthami/hami-webui-fe-oss
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: "v1.0.4"
tag: "main"
backend:
repository: projecthami/hami-webui-be-oss
pullPolicy: IfNotPresent
tag: "v1.0.4"
tag: "main"
imagePullSecrets: []
nameOverride: ""

@ -8,6 +8,7 @@ import (
"github.com/go-kratos/kratos/v2/transport/grpc"
"github.com/go-kratos/kratos/v2/transport/http"
"os"
"vgpu/internal/conf"
_ "go.uber.org/automaxprocs"
)
@ -57,3 +58,7 @@ func newApp(ctx context.Context, logger log.Logger, gs *grpc.Server, hs *http.Se
),
)
}
// getNodeSelectors returns the vendor-to-selector map held in the bootstrap
// configuration. It is listed next to newApp and the provider sets in
// initApp (presumably as an injection provider — confirm against wire.go).
//
// A nil config yields a nil map instead of a panic; looking up a key in a nil
// map is safe and returns the empty string.
func getNodeSelectors(c *conf.Bootstrap) map[string]string {
	if c == nil {
		return nil
	}
	return c.NodeSelectors
}

@ -26,5 +26,6 @@ func initApp(configPath string, ctx context.Context) (*kratos.App, func(), error
service.ProviderSet,
exporter.ProviderSet,
newApp,
getNodeSelectors,
))
}

@ -8,3 +8,8 @@ server:
prometheus:
address: http://localhost:9090
timeout: 1m
# Node label selector per device vendor (vendor name -> selector string).
# Loaded into the Bootstrap message's node_selectors map.
node_selectors:
NVIDIA: gpu=on
Ascend: ascend=on
DCU: dcu=on
MLU: mlu=on

@ -8,6 +8,7 @@ import "google/protobuf/duration.proto";
// Bootstrap is the top-level configuration message.
message Bootstrap {
// Server settings.
Server server = 1;
// Prometheus connection settings.
Prometheus prometheus = 2;
// Maps a device vendor name to a node label selector string,
// e.g. "NVIDIA" -> "gpu=on".
map<string, string> node_selectors = 3;
}
message Server {

@ -32,17 +32,17 @@ type nodeRepo struct {
}
// NewNodeRepo .
func NewNodeRepo(data *Data, logger log.Logger) biz.NodeRepo {
func NewNodeRepo(data *Data, nodeSelectors map[string]string, logger log.Logger) biz.NodeRepo {
nodeRepo := &nodeRepo{
data: data,
nodeNotify: make(chan struct{}, 1),
nodes: map[k8stypes.UID]*biz.Node{},
log: log.NewHelper(logger),
providers: []provider.Provider{
nvidia.NewNvidia(data.promCl, log.NewHelper(logger)),
mlu.NewCambricon(data.promCl, log.NewHelper(logger)),
ascend.NewAscend(data.promCl, log.NewHelper(logger)),
hygon.NewHygon(data.promCl, log.NewHelper(logger)),
nvidia.NewNvidia(data.promCl, log.NewHelper(logger), nodeSelectors[biz.NvidiaGPUDevice]),
mlu.NewCambricon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.CambriconGPUDevice]),
ascend.NewAscend(data.promCl, log.NewHelper(logger), nodeSelectors[biz.AscendGPUDevice]),
hygon.NewHygon(data.promCl, log.NewHelper(logger), nodeSelectors[biz.HygonGPUDevice]),
},
}
nodeRepo.init()

@ -16,20 +16,23 @@ import (
// Ascend bundles the Prometheus client, logger and node label selector used
// by the Ascend provider methods.
type Ascend struct {
// prom runs the device metric queries.
prom *prom.Client
// log receives warnings about failed queries.
log *log.Helper
// nodeSelectors is a single label selector string (despite the plural name)
// that GetNodeDevicePluginLabels hands to labels.Parse.
// NOTE(review): the Cambricon and Nvidia providers call the equivalent field
// labelsSelector / labelSelector — consider unifying the name.
nodeSelectors string
}
func NewAscend(prom *prom.Client, log *log.Helper) *Ascend {
func NewAscend(prom *prom.Client, log *log.Helper, nodeSelectors string) *Ascend {
return &Ascend{
prom: prom,
log: log,
prom: prom,
log: log,
nodeSelectors: nodeSelectors,
}
}
func (c *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("servertype=Ascend910B-20")
func (a *Ascend) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse(a.nodeSelectors)
}
func (c *Ascend) GetProvider() string {
func (a *Ascend) GetProvider() string {
return AscendDevice
}
@ -39,16 +42,16 @@ type DeviceMeta struct {
Driver string
}
func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
device := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("npu_chip_info_health_status{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString)
vs, err := a.prom.Query(context.Background(), queryString)
if err != nil {
c.log.Warnf("query %s failed", queryString)
a.log.Warnf("query %s failed", queryString)
} else {
ds, ok := vs.(model.Vector)
if !ok {
c.log.Warnf("vectorValue: %v, failed", vs)
a.log.Warnf("vectorValue: %v, failed", vs)
} else {
for _, d := range ds {
id := d.Metric["id"]
@ -68,12 +71,12 @@ func (c *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
return device
}
func (c *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
nodedevices := []*util.DeviceInfo{}
i := 0
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64()
tmpDevice := c.GetDevicesFromPrometheus(node)
tmpDevice := a.GetDevicesFromPrometheus(node)
for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok {

@ -17,20 +17,23 @@ import (
// Hygon bundles the Prometheus client, logger and node label selector used by
// the Hygon DCU provider methods.
type Hygon struct {
// prom runs the device metric queries.
prom *prom.Client
// log receives provider warnings, errors and informational events.
log *log.Helper
// nodeSelectors is a single label selector string (despite the plural name)
// that GetNodeDevicePluginLabels hands to labels.Parse.
// NOTE(review): the Cambricon and Nvidia providers call the equivalent field
// labelsSelector / labelSelector — consider unifying the name.
nodeSelectors string
}
func NewHygon(prom *prom.Client, log *log.Helper) *Hygon {
func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
return &Hygon{
prom: prom,
log: log,
prom: prom,
log: log,
nodeSelectors: nodeSelectors,
}
}
func (c *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse("dcu=on")
func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
return labels.Parse(h.nodeSelectors)
}
func (c *Hygon) GetProvider() string {
func (h *Hygon) GetProvider() string {
return HygonDCUDevice
}
@ -40,19 +43,19 @@ type DeviceMeta struct {
Driver string
}
func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
deviceMap := make(map[string]*util.DeviceInfo)
queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)
vs, err := c.prom.Query(context.Background(), queryString)
vs, err := h.prom.Query(context.Background(), queryString)
if err != nil {
c.log.Warnf("Failed to query %s: %v", queryString, err)
h.log.Warnf("Failed to query %s: %v", queryString, err)
return deviceMap
}
vector, ok := vs.(model.Vector)
if !ok {
c.log.Warnf("Unexpected result type: %v", vs)
h.log.Warnf("Unexpected result type: %v", vs)
return deviceMap
}
@ -68,34 +71,34 @@ func (c *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.Dev
return deviceMap
}
func (c *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
devEncoded, ok := node.Annotations[RegisterAnnos]
if !ok {
return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
}
nodedevices, err := util.DecodeNodeDevices(devEncoded, c.log)
nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
if err != nil {
c.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, err
}
if len(nodedevices) == 0 {
c.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
return []*util.DeviceInfo{}, errors.New("no gpu found on node")
}
devDecoded := util.EncodeNodeDevices(nodedevices, c.log)
c.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := c.GetDevicesFromPrometheus(node)
devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
devDetail := h.GetDevicesFromPrometheus(node)
for _, nodedevice := range nodedevices {
idParts := strings.Split(nodedevice.ID, "-")
if len(idParts) < 2 {
c.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
continue
}
devDetailID := idParts[1]
devInfo, exists := devDetail[devDetailID]
if !exists {
c.log.Warnf("Device ID %s not found in devDetail", devDetailID)
h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
continue
}

@ -15,17 +15,20 @@ import (
// Cambricon holds the dependencies stored by NewCambricon: a Prometheus
// client, a logger and the node label selector.
type Cambricon struct {
// prom is the Prometheus client passed to NewCambricon.
prom *prom.Client
// log is the logger passed to NewCambricon.
log *log.Helper
// labelsSelector is the label selector string that
// GetNodeDevicePluginLabels hands to labels.Parse.
// NOTE(review): sibling providers name this field nodeSelectors (Ascend,
// Hygon) or labelSelector (Nvidia) — consider unifying the name.
labelsSelector string
}
func NewCambricon(prom *prom.Client, log *log.Helper) *Cambricon {
func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon {
return &Cambricon{
prom: prom,
log: log,
prom: prom,
log: log,
labelsSelector: labelSelector,
}
}
// GetNodeDevicePluginLabels returns the label selector parsed from the
// configured labelsSelector string.
//
// When no selector was configured (empty string) it falls back to the
// previously hard-coded "mlu=on", because labels.Parse("") yields a selector
// that matches every node.
func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) {
	selector := c.labelsSelector
	if selector == "" {
		selector = "mlu=on"
	}
	return labels.Parse(selector)
}
func (c *Cambricon) GetProvider() string {

@ -12,17 +12,20 @@ import (
// Nvidia holds the dependencies stored by NewNvidia: a Prometheus client, a
// logger and the node label selector.
type Nvidia struct {
// prom is the Prometheus client passed to NewNvidia.
prom *prom.Client
// log is the logger passed to NewNvidia.
log *log.Helper
// labelSelector is the label selector string that
// GetNodeDevicePluginLabels hands to labels.Parse.
// NOTE(review): sibling providers name this field nodeSelectors (Ascend,
// Hygon) or labelsSelector (Cambricon) — consider unifying the name.
labelSelector string
}
func NewNvidia(prom *prom.Client, log *log.Helper) *Nvidia {
func NewNvidia(prom *prom.Client, log *log.Helper, labelSelector string) *Nvidia {
return &Nvidia{
prom: prom,
log: log,
prom: prom,
log: log,
labelSelector: labelSelector,
}
}
// GetNodeDevicePluginLabels returns the label selector parsed from the
// configured labelSelector string.
//
// When no selector was configured (empty string) it falls back to the
// previously hard-coded "gpu=on", because labels.Parse("") yields a selector
// that matches every node.
func (n *Nvidia) GetNodeDevicePluginLabels() (labels.Selector, error) {
	selector := n.labelSelector
	if selector == "" {
		selector = "gpu=on"
	}
	return labels.Parse(selector)
}
func (n *Nvidia) GetProvider() string {

Loading…
Cancel
Save