You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
110 lines
2.8 KiB
110 lines
2.8 KiB
package hygon
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"github.com/go-kratos/kratos/v2/log"
|
|
"github.com/prometheus/common/model"
|
|
corev1 "k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/labels"
|
|
"strconv"
|
|
"strings"
|
|
"vgpu/internal/data/prom"
|
|
"vgpu/internal/provider/util"
|
|
)
|
|
|
|
type Hygon struct {
|
|
prom *prom.Client
|
|
log *log.Helper
|
|
|
|
nodeSelectors string
|
|
}
|
|
|
|
func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
|
|
return &Hygon{
|
|
prom: prom,
|
|
log: log,
|
|
nodeSelectors: nodeSelectors,
|
|
}
|
|
}
|
|
|
|
func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
|
|
return labels.Parse(h.nodeSelectors)
|
|
}
|
|
|
|
func (h *Hygon) GetProvider() string {
|
|
return HygonDCUDevice
|
|
}
|
|
|
|
// DeviceMeta groups three string attributes describing a device.
//
// NOTE(review): nothing in the visible part of this file uses the type, so
// the field descriptions below are inferred from the field names only —
// confirm against the callers.
type DeviceMeta struct {
	// UUID is presumably the device's unique identifier.
	UUID string
	// Type is presumably the device type or model name.
	Type string
	// Driver is presumably the driver name or version.
	Driver string
}
|
|
|
|
func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
|
|
deviceMap := make(map[string]*util.DeviceInfo)
|
|
queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)
|
|
|
|
vs, err := h.prom.Query(context.Background(), queryString)
|
|
if err != nil {
|
|
h.log.Warnf("Failed to query %s: %v", queryString, err)
|
|
return deviceMap
|
|
}
|
|
|
|
vector, ok := vs.(model.Vector)
|
|
if !ok {
|
|
h.log.Warnf("Unexpected result type: %v", vs)
|
|
return deviceMap
|
|
}
|
|
|
|
for _, sample := range vector {
|
|
minorNumber := string(sample.Metric["minor_number"])
|
|
index, _ := strconv.Atoi(minorNumber)
|
|
deviceMap[minorNumber] = &util.DeviceInfo{
|
|
ID: string(sample.Metric["device_id"]),
|
|
Index: uint(index),
|
|
}
|
|
}
|
|
|
|
return deviceMap
|
|
}
|
|
|
|
func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
|
|
devEncoded, ok := node.Annotations[RegisterAnnos]
|
|
if !ok {
|
|
return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
|
|
}
|
|
nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
|
|
if err != nil {
|
|
h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
|
|
return []*util.DeviceInfo{}, err
|
|
}
|
|
if len(nodedevices) == 0 {
|
|
h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
|
|
return []*util.DeviceInfo{}, errors.New("no gpu found on node")
|
|
}
|
|
devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
|
|
h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
|
|
devDetail := h.GetDevicesFromPrometheus(node)
|
|
for _, nodedevice := range nodedevices {
|
|
idParts := strings.Split(nodedevice.ID, "-")
|
|
if len(idParts) < 2 {
|
|
h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
|
|
continue
|
|
}
|
|
|
|
devDetailID := idParts[1]
|
|
devInfo, exists := devDetail[devDetailID]
|
|
if !exists {
|
|
h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
|
|
continue
|
|
}
|
|
|
|
nodedevice.ID = devInfo.ID
|
|
nodedevice.AliasId = fmt.Sprintf("%s-dcu-%d", node.Name, devInfo.Index)
|
|
}
|
|
return nodedevices, nil
|
|
}
|