You cannot select more than 25 topics.
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							110 lines
						
					
					
						
							2.8 KiB
						
					
					
				
			
		
		
	
	
							110 lines
						
					
					
						
							2.8 KiB
						
					
					
				| package hygon
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"errors"
 | |
| 	"fmt"
 | |
| 	"github.com/go-kratos/kratos/v2/log"
 | |
| 	"github.com/prometheus/common/model"
 | |
| 	corev1 "k8s.io/api/core/v1"
 | |
| 	"k8s.io/apimachinery/pkg/labels"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"vgpu/internal/data/prom"
 | |
| 	"vgpu/internal/provider/util"
 | |
| )
 | |
| 
 | |
| type Hygon struct {
 | |
| 	prom *prom.Client
 | |
| 	log  *log.Helper
 | |
| 
 | |
| 	nodeSelectors string
 | |
| }
 | |
| 
 | |
| func NewHygon(prom *prom.Client, log *log.Helper, nodeSelectors string) *Hygon {
 | |
| 	return &Hygon{
 | |
| 		prom:          prom,
 | |
| 		log:           log,
 | |
| 		nodeSelectors: nodeSelectors,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (h *Hygon) GetNodeDevicePluginLabels() (labels.Selector, error) {
 | |
| 	return labels.Parse(h.nodeSelectors)
 | |
| }
 | |
| 
 | |
| func (h *Hygon) GetProvider() string {
 | |
| 	return HygonDCUDevice
 | |
| }
 | |
| 
 | |
// DeviceMeta groups string metadata describing a device: its UUID, its type
// and its driver.
// NOTE(review): nothing in this part of the file uses the type; confirm its
// callers before changing it.
type DeviceMeta struct {
	UUID, Type, Driver string
}
 | |
| 
 | |
| func (h *Hygon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
 | |
| 	deviceMap := make(map[string]*util.DeviceInfo)
 | |
| 	queryString := fmt.Sprintf("dcu_temp{node=\"%s\"}", node.Name)
 | |
| 
 | |
| 	vs, err := h.prom.Query(context.Background(), queryString)
 | |
| 	if err != nil {
 | |
| 		h.log.Warnf("Failed to query %s: %v", queryString, err)
 | |
| 		return deviceMap
 | |
| 	}
 | |
| 
 | |
| 	vector, ok := vs.(model.Vector)
 | |
| 	if !ok {
 | |
| 		h.log.Warnf("Unexpected result type: %v", vs)
 | |
| 		return deviceMap
 | |
| 	}
 | |
| 
 | |
| 	for _, sample := range vector {
 | |
| 		minorNumber := string(sample.Metric["minor_number"])
 | |
| 		index, _ := strconv.Atoi(minorNumber)
 | |
| 		deviceMap[minorNumber] = &util.DeviceInfo{
 | |
| 			ID:    string(sample.Metric["device_id"]),
 | |
| 			Index: uint(index),
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return deviceMap
 | |
| }
 | |
| 
 | |
| func (h *Hygon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
 | |
| 	devEncoded, ok := node.Annotations[RegisterAnnos]
 | |
| 	if !ok {
 | |
| 		return []*util.DeviceInfo{}, errors.New("annos not found " + RegisterAnnos)
 | |
| 	}
 | |
| 	nodedevices, err := util.DecodeNodeDevices(devEncoded, h.log)
 | |
| 	if err != nil {
 | |
| 		h.log.Errorw("failed to decode node devices", err, "node", node.Name, "device annotation", devEncoded)
 | |
| 		return []*util.DeviceInfo{}, err
 | |
| 	}
 | |
| 	if len(nodedevices) == 0 {
 | |
| 		h.log.Infow("event", "no gpu device found", "node", node.Name, "device annotation", devEncoded)
 | |
| 		return []*util.DeviceInfo{}, errors.New("no gpu found on node")
 | |
| 	}
 | |
| 	devDecoded := util.EncodeNodeDevices(nodedevices, h.log)
 | |
| 	h.log.Infow("event", "nodes device information", "node", node.Name, "nodedevices", devDecoded)
 | |
| 	devDetail := h.GetDevicesFromPrometheus(node)
 | |
| 	for _, nodedevice := range nodedevices {
 | |
| 		idParts := strings.Split(nodedevice.ID, "-")
 | |
| 		if len(idParts) < 2 {
 | |
| 			h.log.Warnf("Invalid nodedevice.ID format: %s", nodedevice.ID)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		devDetailID := idParts[1]
 | |
| 		devInfo, exists := devDetail[devDetailID]
 | |
| 		if !exists {
 | |
| 			h.log.Warnf("Device ID %s not found in devDetail", devDetailID)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		nodedevice.ID = devInfo.ID
 | |
| 		nodedevice.AliasId = fmt.Sprintf("%s-dcu-%d", node.Name, devInfo.Index)
 | |
| 	}
 | |
| 	return nodedevices, nil
 | |
| }
 |